2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Generate a spatial analysis against an arbitrary library.
8 To use, build the 'binary_size_tool' target. Then run this tool, passing
9 in the location of the library to be analyzed along with any other options
16 import multiprocessing
26 import binary_size_utils
# This path change is not beautiful. Temporary (I hope) measure until
# the chromium project has figured out a proper way to organize the
# library of python tools. http://crbug.com/375725
31 elf_symbolizer_path
= os
.path
.abspath(os
.path
.join(
32 os
.path
.dirname(__file__
),
38 sys
.path
.append(elf_symbolizer_path
)
39 import symbols
.elf_symbolizer
as elf_symbolizer
# pylint: disable=F0401
42 # TODO(andrewhayden): Only used for legacy reports. Delete.
def FormatBytes(byte_count):
  """Pretty-print a number of bytes.

  Returns '<x>.<y>m' for megabyte-range counts, '<x>.<y>k' for
  kilobyte-range counts, and the plain decimal string otherwise.
  """
  # NOTE(review): the threshold guards were missing from this copy of the
  # file, which made the 'k' branch unreachable; the obvious > 1e6 / > 1e3
  # checks implied by the three return statements are restored here.
  if byte_count > 1e6:
    byte_count = byte_count / 1.0e6
    return '%.1fm' % byte_count
  if byte_count > 1e3:
    byte_count = byte_count / 1.0e3
    return '%.1fk' % byte_count
  return str(byte_count)
54 # TODO(andrewhayden): Only used for legacy reports. Delete.
def SymbolTypeToHuman(symbol_type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Raises KeyError for symbol types not present in the table.
  """
  # NOTE(review): only the 'r' and 'v' entries were visible in this copy
  # of the file; the remaining entries are restored per nm(1) symbol-type
  # conventions (b=bss, d=data, t=text/code, w/v=weak).
  return {'b': 'bss',
          'd': 'data',
          'r': 'read-only data',
          't': 'code',
          'w': 'weak symbol',
          'v': 'weak symbol'}[symbol_type]
def _MkChild(node, name):
  """Return the child of |node| named |name|, creating it if needed.

  Children live in node['children'], keyed by name; a newly created child
  is {'n': name, 'children': {}}.
  """
  child = node['children'].get(name)
  # NOTE(review): the guard and the return were missing from this copy;
  # both are implied by the get()-then-create pattern above.
  if child is None:
    child = {'n': name, 'children': {}}
    node['children'][name] = child
  return child
def MakeChildrenDictsIntoLists(node):
  """Recursively convert each node's 'children' dict into a list.

  Returns the length of the largest children collection found anywhere in
  the subtree rooted at |node| (0 when |node| has no children).
  """
  largest_list_len = 0
  if 'children' in node:
    largest_list_len = len(node['children'])
    child_list = []
    # .values() instead of the Python-2-only .itervalues(): identical
    # behavior, and it also runs under Python 3.
    for child in node['children'].values():
      child_largest_list_len = MakeChildrenDictsIntoLists(child)
      if child_largest_list_len > largest_list_len:
        largest_list_len = child_largest_list_len
      child_list.append(child)
    node['children'] = child_list
  return largest_list_len
def MakeCompactTree(symbols):
  """Build the nested dict ('compact tree') consumed by the modern D3
  report from an iterable of (name, type, size, path) symbol tuples.

  Nodes carry short keys: 'n' = name, 'k' = kind ('p' path, 'b' bucket,
  's' symbol), 't' = symbol type, 'children'; the root also tracks
  'maxDepth'.

  NOTE(review): this copy of the file is missing several lines (the
  embedded original numbering jumps repeatedly). Gaps are flagged inline;
  the code will not run until they are restored from upstream.
  """
  result = {'n': '/', 'children': {}, 'k': 'p', 'maxDepth': 0}
  seen_symbol_with_path = False
  for symbol_name, symbol_type, symbol_size, file_path in symbols:
    if 'vtable for ' in symbol_name:
      symbol_type = '@'  # hack to categorize these separately
    # Take path like '/foo/bar/baz', convert to ['foo', 'bar', 'baz']
    # NOTE(review): gap here — presumably an 'if file_path:' guarding the
    # normpath, with an else branch assigning '(No Path)'; confirm against
    # upstream.
    file_path = os.path.normpath(file_path)
    seen_symbol_with_path = True
    file_path = '(No Path)'
    if file_path.startswith('/'):
      file_path = file_path[1:]
    path_parts = file_path.split('/')

    # Find pre-existing node in tree, or update if it already exists
    # NOTE(review): gap here — 'node' (and the 'depth' counter used below)
    # are referenced but never initialized in the visible text.
    while len(path_parts) > 0:
      path_part = path_parts.pop(0)
      if len(path_part) == 0:
        # NOTE(review): gap — the body of this branch (presumably
        # 'continue') is missing.
      node = _MkChild(node, path_part)
      assert not 'k' in node or node['k'] == 'p'
      node['k'] = 'p'  # p for path

    # 'node' is now the file node. Find the symbol-type bucket.
    node['lastPathElement'] = True
    node = _MkChild(node, symbol_type)
    assert not 'k' in node or node['k'] == 'b'
    node['t'] = symbol_type
    node['k'] = 'b'  # b for bucket

    # 'node' is now the symbol-type bucket. Make the child entry.
    node = _MkChild(node, symbol_name)
    if 'children' in node:
      logging.warning('A container node used as symbol for %s.' % symbol_name)
      # This is going to be used as a leaf so no use for child list.
    node['value'] = symbol_size
    node['t'] = symbol_type
    node['k'] = 's'  # s for symbol
    result['maxDepth'] = max(result['maxDepth'], depth)

  if not seen_symbol_with_path:
    logging.warning('Symbols lack paths. Data will not be structured.')

  largest_list_len = MakeChildrenDictsIntoLists(result)
  if largest_list_len > 1000:
    logging.warning('There are sections with %d nodes. '
                    'Results might be unusable.' % largest_list_len)
  # NOTE(review): gap — 'return result' is missing (DumpCompactTree uses
  # the return value).
150 # TODO(andrewhayden): Only used for legacy reports. Delete.
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
       size: the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size: the total size, in bytes, of all the leaf nodes that are
             contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.

  NOTE(review): this copy is missing many lines (the embedded original
  numbering jumps); the closing quotes of this docstring were also missing
  and are restored here. Gaps are flagged inline.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, symbol_type, size, path in symbols:
    # NOTE(review): gap — presumably guards for an empty path and
    # accumulation of size into dirs['size'] before normalization.
    path = os.path.normpath(path)
    if path.startswith('/'):
      # NOTE(review): gap — the body of this branch (presumably
      # 'path = path[1:]') is missing, along with the split between
      # pathful and pathless symbols.
    parts = path.split('/')
    file_key = parts.pop()

    # Traverse the tree to the parent of the file node, creating as needed
    # NOTE(review): gap — 'tree' and 'part' are used below but never bound
    # in the visible text; a 'for part in parts:' loop is implied.
    if part not in tree['children']:
      tree['children'][part] = {'children': {}, 'size': 0}
    tree = tree['children'][part]

    # Get (creating if necessary) the node for the file
    # This node doesn't have a 'children' attribute
    if file_key not in tree['children']:
      tree['children'][file_key] = {'sizes': collections.defaultdict(int),
      # NOTE(review): gap — this dict literal is unterminated in the
      # visible text (a closing entry and brace are missing).
    tree = tree['children'][file_key]

    # Accumulate size into a bucket within the file
    symbol_type = symbol_type.lower()
    if 'vtable for ' in sym:
      tree['sizes']['[vtable]'] += size
    elif 'r' == symbol_type:
      tree['sizes']['[rodata]'] += size
    elif 'd' == symbol_type:
      tree['sizes']['[data]'] += size
    elif 'b' == symbol_type:
      tree['sizes']['[bss]'] += size
    elif 't' == symbol_type:
      # 'text' in binary parlance means 'code'.
      tree['sizes']['[code]'] += size
    elif 'w' == symbol_type:
      tree['sizes']['[weak]'] += size
      # NOTE(review): gap — an 'else:' introducing the catch-all bucket
      # below is missing.
      tree['sizes']['[other]'] += size
    # Python-2-only print syntax: writes a diagnostic line to stderr.
    print >> sys.stderr, sym, parts, file_key
    key = 'symbols without paths'
    if key not in dirs['children']:
      dirs['children'][key] = {'sizes': collections.defaultdict(int),
      # NOTE(review): gap — unterminated dict literal, as above.
    tree = dirs['children'][key]
    if (sym.endswith('::__FUNCTION__') or
        sym.endswith('::__PRETTY_FUNCTION__')):
      subkey = '__FUNCTION__'
    elif sym.startswith('CSWTCH.'):
      # NOTE(review): gap — this branch's own assignment and possibly
      # intervening branches are missing; the next line appears to belong
      # to a later branch keying on the 'Outer::' prefix of the symbol.
      subkey = sym[0:sym.find('::') + 2]
    tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
  # NOTE(review): gap — 'return dirs' is missing (DumpTreemap uses the
  # return value).
246 # TODO(andrewhayden): Only used for legacy reports. Delete.
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names."""
  # NOTE(review): gap — the 'children = []' accumulator and the opening of
  # the css_class_map dict literal (plus several of its entries) are
  # missing from this copy; the orphaned entries below belong to that map.
  '[vtable]': 'vtable',
  '[rodata]': 'read-only_data',
  '[weak]': 'weak_symbol'
  if 'children' in tree:
    # Non-leaf node. Recurse.
    for child_name, child in tree['children'].iteritems():
      children.append(JsonifyTree(child, child_name))
  # Leaf node; dump per-file stats as entries in the treemap
  # NOTE(review): gap — an 'else:' introducing this leaf branch is
  # implied by the comment above.
  for kind, size in tree['sizes'].iteritems():
    child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                  'data': { '$area': size }}
    css_class = css_class_map.get(kind)
    if css_class is not None:
      child_json['data']['$symbol'] = css_class
    children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': { '$area': tree['size'] },
          'children': children}
def DumpCompactTree(symbols, outfile):
  """Write the compact symbol tree for |symbols| to |outfile|.

  The output is a JavaScript file assigning the JSON-encoded tree to the
  global 'tree_data' variable.
  """
  root = MakeCompactTree(symbols)
  with open(outfile, 'w') as out_file:
    out_file.write('var tree_data = ')
    json.dump(root, out_file)
  print('Writing %d bytes json' % os.path.getsize(outfile))
292 # TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpTreemap(symbols, outfile):
  """Write a legacy webtreemap JS file ('var kTree = ...') for |symbols|.

  Fix: the visible code opened |outfile| without ever closing it; a
  with-block releases the handle even if serialization raises.
  """
  dirs = TreeifySymbols(symbols)
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
303 # TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSymbols(symbols, outfile, n):
  """Write the |n| largest non-bss, non-weak symbols to |outfile| as a
  JavaScript array ('var largestSymbols = [...]').

  Fixes: close the output file via a with-block (the visible code leaked
  the handle) and honor the |n| limit, which was otherwise unused.
  NOTE(review): the 'symbol'/'location' entry keys and the array
  open/close writes were reconstructed; confirm against upstream.
  """
  # a list of (sym, symbol_type, size, path); sort by size.
  symbols = sorted(symbols, key=lambda x: -x[2])
  dumped_symbol_count = 0
  with open(outfile, 'w') as out:
    out.write('var largestSymbols = [\n')
    for sym, symbol_type, size, path in symbols:
      if symbol_type in ('b', 'w'):
        continue  # skip bss and weak symbols
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(symbol_type),
               'location': path}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped_symbol_count += 1
      if dumped_symbol_count >= n:
        break
    out.write('];\n')
def MakeSourceMap(symbols):
  """Aggregate |symbols| by source path.

  Returns a dict keyed by normalized path; each value is a dict with
  'path' (as reported), 'symbol_count' and cumulative 'size' entries.
  """
  sources = {}
  for _sym, _symbol_type, size, path in symbols:
    # Bucket pathless symbols under a sentinel key so they still appear.
    # NOTE(review): the sentinel name is reconstructed; the original line
    # was lost in this copy of the file.
    key = '[no path]'
    if path:
      key = os.path.normpath(path)
    if key not in sources:
      sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}
    record = sources[key]
    record['size'] += size
    record['symbol_count'] += 1
  return sources
347 # TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestSources(symbols, outfile, n):
  """Write the |n| largest sources (by accumulated symbol size) to
  |outfile| as a JavaScript array ('var largestSources = [...]').

  Fixes: close the output file via a with-block (the visible code leaked
  the handle) and honor the |n| limit, which was otherwise unused.
  NOTE(review): the array open/close writes and the limit counter were
  reconstructed; confirm against upstream.
  """
  source_map = MakeSourceMap(symbols)
  sources = sorted(source_map.values(), key=lambda x: -x['size'])
  dumped_source_count = 0
  with open(outfile, 'w') as out:
    out.write('var largestSources = [\n')
    for record in sources:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped_source_count += 1
      if dumped_source_count >= n:
        break
    out.write('];\n')
370 # TODO(andrewhayden): Only used for legacy reports. Delete.
def DumpLargestVTables(symbols, outfile, n):
  """Write the |n| largest vtables to |outfile| as a JavaScript array
  ('var largestVTables = [...]').

  Fixes: close the output file via a with-block (the visible code leaked
  the handle) and honor the |n| limit, which was otherwise unused.
  NOTE(review): the array open/close writes and the limit counter were
  reconstructed; confirm against upstream.
  """
  vtables = []
  for symbol, _type, size, path in symbols:
    if 'vtable for ' in symbol:
      vtables.append({'symbol': symbol, 'path': path, 'size': size})
  vtables = sorted(vtables, key=lambda x: -x['size'])
  dumped_vtable_count = 0
  with open(outfile, 'w') as out:
    out.write('var largestVTables = [\n')
    for record in vtables:
      entry = {'size': FormatBytes(record['size']),
               'symbol': record['symbol'],
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped_vtable_count += 1
      if dumped_vtable_count >= n:
        break
    out.write('];\n')
396 # Regex for parsing "nm" output. A sample line looks like this:
397 # 0167b39c 00000018 t ACCESS_DESCRIPTION_free /path/file.c:95
399 # The fields are: address, size, type, name, source location
400 # Regular expression explained ( see also: https://xkcd.com/208 ):
401 # ([0-9a-f]{8,}+) The address
402 # [\s]+ Whitespace separator
403 # ([0-9a-f]{8,}+) The size. From here on out it's all optional.
404 # [\s]+ Whitespace separator
405 # (\S?) The symbol type, which is any non-whitespace char
406 # [\s*] Whitespace separator
407 # ([^\t]*) Symbol name, any non-tab character (spaces ok!)
408 # [\t]? Tab separator
409 # (.*) The location (filename[:linennum|?][ (discriminator n)]
# Compiled once at module load; RunElfSymbolizer matches it against every
# nm output line (groups: address, size, type, name, location).
sNmPattern = re.compile(
    r'([0-9a-f]{8,})[\s]+([0-9a-f]{8,})[\s]*(\S?)[\s*]([^\t]*)[\t]?(.*)')
418 self
.time_last_output
= time
.time()
419 self
.count_last_output
= 0
def RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs):
  """Run nm over |library|, symbolize the results via elf_symbolizer /
  addr2line with up to |jobs| concurrent lookups, and write the annotated
  nm output to |outfile|.

  NOTE(review): this copy is missing many lines (the embedded original
  numbering jumps); 'address_symbol' and 'progress_chunk' are referenced
  but never bound in the visible text, and several guards, else branches
  and a try: are missing. Gaps are flagged inline; the code will not run
  until they are restored from upstream.
  """
  nm_output = RunNm(library, nm_binary)
  nm_output_lines = nm_output.splitlines()
  nm_output_lines_len = len(nm_output_lines)
  progress = Progress()

  def map_address_symbol(symbol, addr):
    # Record |symbol| for |addr|, counting duplicate addresses as
    # collisions instead of logging each one.
    if addr in address_symbol:
      # 'Collision between %s and %s.' % (str(symbol.name),
      # str(address_symbol[addr].name))
      progress.collisions += 1
    address_symbol[addr] = symbol
    # Periodic progress output, throttled below.
    if progress.count % progress_chunk == 0:
      time_now = time.time()
      time_spent = time_now - progress.time_last_output
      # Only output at most once per second.
      # NOTE(review): gap — the guard comparing time_spent to the
      # once-per-second threshold is missing.
      progress.time_last_output = time_now
      chunk_size = progress.count - progress.count_last_output
      progress.count_last_output = progress.count
      speed = chunk_size / time_spent
      # NOTE(review): the expression below is unterminated in the visible
      # text (the denominator/closing parenthesis line is missing).
      progress_percent = (100.0 * (progress.count + progress.skip_count) /
      print('%.1f%%: Looked up %d symbols (%d collisions) - %.1f lookups/s.' %
            (progress_percent, progress.count, progress.collisions, speed))

  # NOTE(review): gap — the callback argument between |addr2line_binary|
  # and max_concurrent_jobs is missing from this call.
  symbolizer = elf_symbolizer.ELFSymbolizer(library, addr2line_binary,
                                            max_concurrent_jobs=jobs)
  user_interrupted = False
  # NOTE(review): gap — a 'try:' is implied here, matched by the first
  # 'except KeyboardInterrupt:' below.
  for line in nm_output_lines:
    match = sNmPattern.match(line)
    location = match.group(5)
    addr = int(match.group(1), 16)
    size = int(match.group(2), 16)
    if addr in address_symbol:  # Already looked up, shortcut
      map_address_symbol(address_symbol[addr], addr)
    # Save time by not looking up empty symbols (do they even exist?)
    print('Empty symbol: ' + line)
    symbolizer.SymbolizeAsync(addr, addr)
    progress.skip_count += 1
  except KeyboardInterrupt:
    user_interrupted = True
    print('Interrupting - killing subprocesses. Please wait.')

  # NOTE(review): gap — the symbolizer drain/Join step (inside its own
  # try:) is missing before this handler.
  except KeyboardInterrupt:
    # Don't want to abort here since we will be finished in a few seconds.
    user_interrupted = True
    print('Patience you must have my young padawan.')

  # NOTE(review): gap — presumably guarded by 'if user_interrupted:'.
  print('Skipping the rest of the file mapping. '
        'Output will not be fully classified.')

  # Re-emit the original nm lines, annotated with 'path:line' where the
  # symbolizer resolved a source location.
  with open(outfile, 'w') as out:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      location = match.group(5)
      addr = int(match.group(1), 16)
      symbol = address_symbol.get(addr)
      if symbol is not None:
        if symbol.source_path is not None:
          path = symbol.source_path
        if symbol.source_line is not None:
          line_number = symbol.source_line
        out.write('%s\t%s:%d\n' % (line, path, line_number))
      out.write('%s\n' % line)

  print('%d symbols in the results.' % len(address_symbol))
def RunNm(binary, nm_binary):
  """Run |nm_binary| over |binary| and return its stdout.

  Raises Exception carrying nm's stderr (or its stdout when stderr is
  empty) if nm exits non-zero.
  """
  cmd = [nm_binary, '-C', '--print-size', '--size-sort', '--reverse-sort',
         binary]
  nm_process = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
  (process_output, err_output) = nm_process.communicate()

  if nm_process.returncode != 0:
    # Old-style 'raise Exception, x' replaced: that form is deprecated and
    # a syntax error under Python 3; behavior is unchanged.
    if err_output:
      raise Exception(err_output)
    raise Exception(process_output)

  return process_output
def GetNmSymbols(nm_infile, outfile, library, jobs, verbose,
                 addr2line_binary, nm_binary):
  """Return the parsed symbol list, either from a pre-existing nm dump
  |nm_infile| or by running nm/addr2line over |library|.

  When |nm_infile| is None the symbols are generated into |outfile| (a
  kept temp file is created when |outfile| is also None) and then parsed.
  NOTE(review): the guards around the temp-file creation and verbose
  prints were reconstructed; confirm against upstream.
  """
  if nm_infile is None:
    if outfile is None:
      outfile = tempfile.NamedTemporaryFile(delete=False).name

    if verbose:
      print('Running parallel addr2line, dumping symbols to ' + outfile)
    RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs)

    nm_infile = outfile

  elif verbose:
    print('Using nm input from ' + nm_infile)

  # open() replaces the Python-2-only file() builtin; same behavior, and
  # it also runs under Python 3.
  with open(nm_infile, 'r') as infile:
    return list(binary_size_utils.ParseNm(infile))
def _find_in_system_path(binary):
  """Locate the full path to binary in the system path or return None
  if not found."""
  system_path = os.environ["PATH"].split(os.pathsep)
  for path in system_path:
    binary_path = os.path.join(path, binary)
    if os.path.isfile(binary_path):
      # NOTE(review): the two return statements were missing from this
      # copy; they are implied by the function's documented contract.
      return binary_path
  return None
566 usage
= """%prog [options]
568 Runs a spatial analysis on a given library, looking up the source locations
569 of its symbols and calculating how much space each directory, source file,
570 and so on is taking. The result is a report that can be used to pinpoint
571 sources of large portions of the binary, etceteras.
573 Under normal circumstances, you only need to pass two arguments, thusly:
575 %prog --library /path/to/library --destdir /path/to/output
577 In this mode, the program will dump the symbols from the specified library
578 and map those symbols back to source locations, producing a web-based
579 report in the specified output directory.
581 Other options are available via '--help'.
583 parser
= optparse
.OptionParser(usage
=usage
)
584 parser
.add_option('--nm-in', metavar
='PATH',
585 help='if specified, use nm input from <path> instead of '
586 'generating it. Note that source locations should be '
587 'present in the file; i.e., no addr2line symbol lookups '
588 'will be performed when this option is specified. '
589 'Mutually exclusive with --library.')
590 parser
.add_option('--destdir', metavar
='PATH',
591 help='write output to the specified directory. An HTML '
592 'report is generated here along with supporting files; '
593 'any existing report will be overwritten.')
594 parser
.add_option('--library', metavar
='PATH',
595 help='if specified, process symbols in the library at '
596 'the specified path. Mutually exclusive with --nm-in.')
597 parser
.add_option('--nm-binary',
598 help='use the specified nm binary to analyze library. '
599 'This is to be used when the nm in the path is not for '
600 'the right architecture or of the right version.')
601 parser
.add_option('--addr2line-binary',
602 help='use the specified addr2line binary to analyze '
603 'library. This is to be used when the addr2line in '
604 'the path is not for the right architecture or '
605 'of the right version.')
606 parser
.add_option('--jobs',
607 help='number of jobs to use for the parallel '
608 'addr2line processing pool; defaults to 1. More '
609 'jobs greatly improve throughput but eat RAM like '
610 'popcorn, and take several gigabytes each. Start low '
611 'and ramp this number up until your machine begins to '
612 'struggle with RAM. '
613 'This argument is only valid when using --library.')
614 parser
.add_option('-v', dest
='verbose', action
='store_true',
615 help='be verbose, printing lots of status information.')
616 parser
.add_option('--nm-out', metavar
='PATH',
617 help='keep the nm output file, and store it at the '
618 'specified path. This is useful if you want to see the '
619 'fully processed nm output after the symbols have been '
620 'mapped to source locations. By default, a tempfile is '
621 'used and is deleted when the program terminates.'
622 'This argument is only valid when using --library.')
623 parser
.add_option('--legacy', action
='store_true',
624 help='emit legacy binary size report instead of modern')
625 opts
, _args
= parser
.parse_args()
627 if ((not opts
.library
) and (not opts
.nm_in
)) or (opts
.library
and opts
.nm_in
):
628 parser
.error('exactly one of --library or --nm-in is required')
631 print >> sys
.stderr
, ('WARNING: --jobs has no effect '
632 'when used with --nm-in')
634 parser
.error('--destdir is required argument')
636 # Use the number of processors but cap between 2 and 4 since raw
637 # CPU power isn't the limiting factor. It's I/O limited, memory
638 # bus limited and available-memory-limited. Too many processes and
639 # the computer will run out of memory and it will be slow.
640 opts
.jobs
= max(2, min(4, str(multiprocessing
.cpu_count())))
642 if opts
.addr2line_binary
:
643 assert os
.path
.isfile(opts
.addr2line_binary
)
644 addr2line_binary
= opts
.addr2line_binary
646 addr2line_binary
= _find_in_system_path('addr2line')
647 assert addr2line_binary
, 'Unable to find addr2line in the path. '\
648 'Use --addr2line-binary to specify location.'
651 assert os
.path
.isfile(opts
.nm_binary
)
652 nm_binary
= opts
.nm_binary
654 nm_binary
= _find_in_system_path('nm')
655 assert nm_binary
, 'Unable to find nm in the path. Use --nm-binary '\
656 'to specify location.'
658 print('nm: %s' % nm_binary
)
659 print('addr2line: %s' % addr2line_binary
)
661 symbols
= GetNmSymbols(opts
.nm_in
, opts
.nm_out
, opts
.library
,
662 opts
.jobs
, opts
.verbose
is True,
663 addr2line_binary
, nm_binary
)
664 if not os
.path
.exists(opts
.destdir
):
665 os
.makedirs(opts
.destdir
, 0755)
668 if opts
.legacy
: # legacy report
669 DumpTreemap(symbols
, os
.path
.join(opts
.destdir
, 'treemap-dump.js'))
670 DumpLargestSymbols(symbols
,
671 os
.path
.join(opts
.destdir
, 'largest-symbols.js'), 100)
672 DumpLargestSources(symbols
,
673 os
.path
.join(opts
.destdir
, 'largest-sources.js'), 100)
674 DumpLargestVTables(symbols
,
675 os
.path
.join(opts
.destdir
, 'largest-vtables.js'), 100)
676 treemap_out
= os
.path
.join(opts
.destdir
, 'webtreemap')
677 if not os
.path
.exists(treemap_out
):
678 os
.makedirs(treemap_out
, 0755)
679 treemap_src
= os
.path
.join('third_party', 'webtreemap', 'src')
680 shutil
.copy(os
.path
.join(treemap_src
, 'COPYING'), treemap_out
)
681 shutil
.copy(os
.path
.join(treemap_src
, 'webtreemap.js'), treemap_out
)
682 shutil
.copy(os
.path
.join(treemap_src
, 'webtreemap.css'), treemap_out
)
683 shutil
.copy(os
.path
.join('tools', 'binary_size', 'legacy_template',
684 'index.html'), opts
.destdir
)
685 else: # modern report
686 DumpCompactTree(symbols
, os
.path
.join(opts
.destdir
, 'data.js'))
687 d3_out
= os
.path
.join(opts
.destdir
, 'd3')
688 if not os
.path
.exists(d3_out
):
689 os
.makedirs(d3_out
, 0755)
690 d3_src
= os
.path
.join(os
.path
.dirname(__file__
),
693 'third_party', 'd3', 'src')
694 template_src
= os
.path
.join(os
.path
.dirname(__file__
),
696 shutil
.copy(os
.path
.join(d3_src
, 'LICENSE'), d3_out
)
697 shutil
.copy(os
.path
.join(d3_src
, 'd3.js'), d3_out
)
698 shutil
.copy(os
.path
.join(template_src
, 'index.html'), opts
.destdir
)
699 shutil
.copy(os
.path
.join(template_src
, 'D3SymbolTreeMap.js'), opts
.destdir
)
701 print 'Report saved to ' + opts
.destdir
+ '/index.html'
704 if __name__
== '__main__':