tools/deep_memory_profiler/dmprof.py
1 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """The deep heap profiler script for Chrome."""
7 import copy
8 import datetime
9 import json
10 import logging
11 import optparse
12 import os
13 import re
14 import subprocess
15 import sys
16 import tempfile
17 import time
18 import zipfile
20 from range_dict import ExclusiveRangeDict
22 BASE_PATH = os.path.dirname(os.path.abspath(__file__))
23 FIND_RUNTIME_SYMBOLS_PATH = os.path.join(
24 BASE_PATH, os.pardir, 'find_runtime_symbols')
25 sys.path.append(FIND_RUNTIME_SYMBOLS_PATH)
27 import find_runtime_symbols
28 import prepare_symbol_info
29 import proc_maps
31 from find_runtime_symbols import FUNCTION_SYMBOLS
32 from find_runtime_symbols import SOURCEFILE_SYMBOLS
33 from find_runtime_symbols import TYPEINFO_SYMBOLS
35 BUCKET_ID = 5
36 VIRTUAL = 0
37 COMMITTED = 1
38 ALLOC_COUNT = 2
39 FREE_COUNT = 3
40 NULL_REGEX = re.compile('')
42 LOGGER = logging.getLogger('dmprof')
43 POLICIES_JSON_PATH = os.path.join(BASE_PATH, 'policies.json')
# Heap Profile Dump versions

# DUMP_DEEP_[1-4] are obsolete.
# DUMP_DEEP_2 and later distinguish mmap regions and malloc chunks.
# DUMP_DEEP_3 and later don't include allocation functions in their stack dumps.
# DUMP_DEEP_4 and later support comments with '#' and global stats "nonprofiled-*".
# DUMP_DEEP_[1-2] should be processed by POLICY_DEEP_1.
# DUMP_DEEP_[3-4] should be processed by POLICY_DEEP_2 or POLICY_DEEP_3.
54 DUMP_DEEP_1 = 'DUMP_DEEP_1'
55 DUMP_DEEP_2 = 'DUMP_DEEP_2'
56 DUMP_DEEP_3 = 'DUMP_DEEP_3'
57 DUMP_DEEP_4 = 'DUMP_DEEP_4'
59 DUMP_DEEP_OBSOLETE = (DUMP_DEEP_1, DUMP_DEEP_2, DUMP_DEEP_3, DUMP_DEEP_4)
61 # DUMP_DEEP_5 doesn't separate sections for malloc and mmap.
62 # malloc and mmap are identified in bucket files.
63 # DUMP_DEEP_5 should be processed by POLICY_DEEP_4.
64 DUMP_DEEP_5 = 'DUMP_DEEP_5'
66 # DUMP_DEEP_6 adds a mmap list to DUMP_DEEP_5.
67 DUMP_DEEP_6 = 'DUMP_DEEP_6'
69 # Heap Profile Policy versions
# POLICY_DEEP_1 DOES NOT include allocation_type columns.
# mmap regions are distinguished with mmap frames in the pattern column.
73 POLICY_DEEP_1 = 'POLICY_DEEP_1'
# POLICY_DEEP_2 DOES include allocation_type columns.
# mmap regions are distinguished with the allocation_type column.
77 POLICY_DEEP_2 = 'POLICY_DEEP_2'
79 # POLICY_DEEP_3 is in JSON format.
80 POLICY_DEEP_3 = 'POLICY_DEEP_3'
# POLICY_DEEP_4 contains typeinfo.
83 POLICY_DEEP_4 = 'POLICY_DEEP_4'
86 class EmptyDumpException(Exception):
87 def __init__(self, value=''):
88 super(EmptyDumpException, self).__init__()
89 self.value = value
90 def __str__(self):
91 return repr(self.value)
94 class ParsingException(Exception):
95 def __init__(self, value=''):
96 super(ParsingException, self).__init__()
97 self.value = value
98 def __str__(self):
99 return repr(self.value)
102 class InvalidDumpException(ParsingException):
103 def __init__(self, value):
104 super(InvalidDumpException, self).__init__()
105 self.value = value
106 def __str__(self):
107 return "invalid heap profile dump: %s" % repr(self.value)
110 class ObsoleteDumpVersionException(ParsingException):
111 def __init__(self, value):
112 super(ObsoleteDumpVersionException, self).__init__()
113 self.value = value
114 def __str__(self):
115 return "obsolete heap profile dump version: %s" % repr(self.value)
118 class ListAttribute(ExclusiveRangeDict.RangeAttribute):
119 """Represents a list for an attribute in range_dict.ExclusiveRangeDict."""
120 def __init__(self):
121 super(ListAttribute, self).__init__()
122 self._list = []
124 def __str__(self):
125 return str(self._list)
127 def __repr__(self):
128 return 'ListAttribute' + str(self._list)
130 def __len__(self):
131 return len(self._list)
133 def __iter__(self):
134 for x in self._list:
135 yield x
137 def __getitem__(self, index):
138 return self._list[index]
140 def __setitem__(self, index, value):
141 if index >= len(self._list):
142 self._list.extend([None] * (index + 1 - len(self._list)))
143 self._list[index] = value
145 def copy(self):
146 new_list = ListAttribute()
147 for index, item in enumerate(self._list):
148 new_list[index] = copy.deepcopy(item)
149 return new_list
152 class ProcMapsEntryAttribute(ExclusiveRangeDict.RangeAttribute):
153 """Represents an entry of /proc/maps in range_dict.ExclusiveRangeDict."""
154 _DUMMY_ENTRY = proc_maps.ProcMapsEntry(
155 0, # begin
156 0, # end
157 '-', # readable
158 '-', # writable
159 '-', # executable
160 '-', # private
161 0, # offset
162 '00', # major
163 '00', # minor
164 0, # inode
'' # name
)
168 def __init__(self):
169 super(ProcMapsEntryAttribute, self).__init__()
170 self._entry = self._DUMMY_ENTRY.as_dict()
172 def __str__(self):
173 return str(self._entry)
175 def __repr__(self):
176 return 'ProcMapsEntryAttribute' + str(self._entry)
178 def __getitem__(self, key):
179 return self._entry[key]
181 def __setitem__(self, key, value):
182 if key not in self._entry:
183 raise KeyError(key)
184 self._entry[key] = value
186 def copy(self):
187 new_entry = ProcMapsEntryAttribute()
188 for key, value in self._entry.iteritems():
189 new_entry[key] = copy.deepcopy(value)
190 return new_entry
193 def skip_while(index, max_index, skipping_condition):
194 """Increments |index| until |skipping_condition|(|index|) is False.
196 Returns:
197 A pair of an integer indicating a line number after skipped, and a
198 boolean value which is True if found a line which skipping_condition
199 is False for.
201 while skipping_condition(index):
202 index += 1
203 if index >= max_index:
204 return index, False
205 return index, True
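# Illustrative sketch (not part of the original tool) of how skip_while() is
# meant to be used; the sample 'lines' list below is hypothetical.
#   lines = ['# comment', '# comment', 'GLOBAL_STATS:\n']
#   index, found = skip_while(
#       0, len(lines), lambda n: lines[n] != 'GLOBAL_STATS:\n')
#   # => index == 2, found == True.  If no such line existed, found would be
#   #    False and index would equal len(lines).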
208 class SymbolDataSources(object):
209 """Manages symbol data sources in a process.
211 The symbol data sources consist of maps (/proc/<pid>/maps), nm, readelf and
212 so on. They are collected into a directory '|prefix|.symmap' from the binary
213 files by 'prepare()' with tools/find_runtime_symbols/prepare_symbol_info.py.
Binaries themselves are not required for profiling; the prepared data
sources work in place of a binary even if the binary has been overwritten
with another binary.

Note that loading the symbol data sources takes a long time since they are
often very big. Therefore, the 'dmprof' profiler uses 'SymbolMappingCache',
which caches the symbols that are actually used.
"""
223 def __init__(self, prefix, fake_directories=None):
224 self._prefix = prefix
225 self._prepared_symbol_data_sources_path = None
226 self._loaded_symbol_data_sources = None
227 self._fake_directories = fake_directories or {}
229 def prepare(self):
230 """Prepares symbol data sources by extracting mapping from a binary.
232 The prepared symbol data sources are stored in a directory. The directory
233 name is stored in |self._prepared_symbol_data_sources_path|.
235 Returns:
236 True if succeeded.
238 LOGGER.info('Preparing symbol mapping...')
239 self._prepared_symbol_data_sources_path, used_tempdir = (
240 prepare_symbol_info.prepare_symbol_info(
241 self._prefix + '.maps',
242 output_dir_path=self._prefix + '.symmap',
243 fake_directories=self._fake_directories,
244 use_tempdir=True,
245 use_source_file_name=True))
246 if self._prepared_symbol_data_sources_path:
247 LOGGER.info(' Prepared symbol mapping.')
248 if used_tempdir:
249 LOGGER.warn(' Using a temporary directory for symbol mapping.')
250 LOGGER.warn(' Delete it by yourself.')
251 LOGGER.warn(' Or, move the directory by yourself to use it later.')
252 return True
253 else:
254 LOGGER.warn(' Failed to prepare symbol mapping.')
255 return False
257 def get(self):
258 """Returns the prepared symbol data sources.
260 Returns:
261 The prepared symbol data sources. None if failed.
263 if not self._prepared_symbol_data_sources_path and not self.prepare():
264 return None
265 if not self._loaded_symbol_data_sources:
266 LOGGER.info('Loading symbol mapping...')
267 self._loaded_symbol_data_sources = (
268 find_runtime_symbols.RuntimeSymbolsInProcess.load(
269 self._prepared_symbol_data_sources_path))
270 return self._loaded_symbol_data_sources
272 def path(self):
273 """Returns the path of the prepared symbol data sources if possible."""
274 if not self._prepared_symbol_data_sources_path and not self.prepare():
275 return None
276 return self._prepared_symbol_data_sources_path
279 class SymbolFinder(object):
280 """Finds corresponding symbols from addresses.
This class only 'find()'s symbols for a specified |address_list|.
It is introduced to make the finder mockable.
"""
285 def __init__(self, symbol_type, symbol_data_sources):
286 self._symbol_type = symbol_type
287 self._symbol_data_sources = symbol_data_sources
289 def find(self, address_list):
290 return find_runtime_symbols.find_runtime_symbols(
291 self._symbol_type, self._symbol_data_sources.get(), address_list)
294 class SymbolMappingCache(object):
295 """Caches mapping from actually used addresses to symbols.
297 'update()' updates the cache from the original symbol data sources via
'SymbolFinder'. Symbols can be looked up by the method 'lookup()'.
"""
300 def __init__(self):
301 self._symbol_mapping_caches = {
302 FUNCTION_SYMBOLS: {},
303 SOURCEFILE_SYMBOLS: {},
TYPEINFO_SYMBOLS: {},
}
307 def update(self, symbol_type, bucket_set, symbol_finder, cache_f):
308 """Updates symbol mapping cache on memory and in a symbol cache file.
310 It reads cached symbol mapping from a symbol cache file |cache_f| if it
311 exists. Unresolved addresses are then resolved and added to the cache
312 both on memory and in the symbol cache file with using 'SymbolFinder'.
314 A cache file is formatted as follows:
315 <Address> <Symbol>
316 <Address> <Symbol>
317 <Address> <Symbol>
320 Args:
321 symbol_type: A type of symbols to update. It should be one of
322 FUNCTION_SYMBOLS, SOURCEFILE_SYMBOLS and TYPEINFO_SYMBOLS.
323 bucket_set: A BucketSet object.
324 symbol_finder: A SymbolFinder object to find symbols.
cache_f: A readable and writable IO object of the symbol cache file.
"""
327 cache_f.seek(0, os.SEEK_SET)
328 self._load(cache_f, symbol_type)
330 unresolved_addresses = sorted(
331 address for address in bucket_set.iter_addresses(symbol_type)
332 if address not in self._symbol_mapping_caches[symbol_type])
334 if not unresolved_addresses:
335 LOGGER.info('No need to resolve any more addresses.')
336 return
338 cache_f.seek(0, os.SEEK_END)
339 LOGGER.info('Loading %d unresolved addresses.' %
340 len(unresolved_addresses))
341 symbol_dict = symbol_finder.find(unresolved_addresses)
343 for address, symbol in symbol_dict.iteritems():
344 stripped_symbol = symbol.strip() or '?'
345 self._symbol_mapping_caches[symbol_type][address] = stripped_symbol
346 cache_f.write('%x %s\n' % (address, stripped_symbol))
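# A hypothetical sketch of what a resulting cache file (e.g.
# '<prefix>.cache.function') contains after update(): one "<hex address>
# <symbol>" pair per line, as written by cache_f.write() above.  The symbols
# shown here are made-up examples.
#   7f3b2c001234 v8::internal::Heap::AllocateRaw
#   7f3b2c005678 WTF::fastMalloc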
348 def lookup(self, symbol_type, address):
349 """Looks up a symbol for a given |address|.
351 Args:
symbol_type: A type of symbols to look up. It should be one of
FUNCTION_SYMBOLS, SOURCEFILE_SYMBOLS and TYPEINFO_SYMBOLS.
354 address: An integer that represents an address.
356 Returns:
A string that represents a symbol, or None if not found.
"""
359 return self._symbol_mapping_caches[symbol_type].get(address)
361 def _load(self, cache_f, symbol_type):
362 try:
363 for line in cache_f:
364 items = line.rstrip().split(None, 1)
365 if len(items) == 1:
366 items.append('??')
367 self._symbol_mapping_caches[symbol_type][int(items[0], 16)] = items[1]
368 LOGGER.info('Loaded %d entries from symbol cache.' %
369 len(self._symbol_mapping_caches[symbol_type]))
370 except IOError as e:
371 LOGGER.info('The symbol cache file is invalid: %s' % e)
374 class Rule(object):
375 """Represents one matching rule in a policy file."""
377 def __init__(self,
378 name,
379 mmap,
380 stackfunction_pattern=None,
381 stacksourcefile_pattern=None,
382 typeinfo_pattern=None):
383 self._name = name
384 self._mmap = mmap
386 self._stackfunction_pattern = None
387 if stackfunction_pattern:
388 self._stackfunction_pattern = re.compile(
389 stackfunction_pattern + r'\Z')
391 self._stacksourcefile_pattern = None
392 if stacksourcefile_pattern:
393 self._stacksourcefile_pattern = re.compile(
394 stacksourcefile_pattern + r'\Z')
396 self._typeinfo_pattern = None
397 if typeinfo_pattern:
398 self._typeinfo_pattern = re.compile(typeinfo_pattern + r'\Z')
400 @property
401 def name(self):
402 return self._name
404 @property
405 def mmap(self):
406 return self._mmap
408 @property
409 def stackfunction_pattern(self):
410 return self._stackfunction_pattern
412 @property
413 def stacksourcefile_pattern(self):
414 return self._stacksourcefile_pattern
416 @property
417 def typeinfo_pattern(self):
418 return self._typeinfo_pattern
421 class Policy(object):
422 """Represents a policy, a content of a policy file."""
424 def __init__(self, rules, version, components):
425 self._rules = rules
426 self._version = version
427 self._components = components
429 @property
430 def rules(self):
431 return self._rules
433 @property
434 def version(self):
435 return self._version
437 @property
438 def components(self):
439 return self._components
441 def find(self, bucket):
442 """Finds a matching component name which a given |bucket| belongs to.
444 Args:
445 bucket: A Bucket object to be searched for.
447 Returns:
448 A string representing a component name.
450 if not bucket:
451 return 'no-bucket'
452 if bucket.component_cache:
453 return bucket.component_cache
455 stackfunction = bucket.symbolized_joined_stackfunction
456 stacksourcefile = bucket.symbolized_joined_stacksourcefile
457 typeinfo = bucket.symbolized_typeinfo
458 if typeinfo.startswith('0x'):
459 typeinfo = bucket.typeinfo_name
461 for rule in self._rules:
462 if (bucket.mmap == rule.mmap and
463 (not rule.stackfunction_pattern or
464 rule.stackfunction_pattern.match(stackfunction)) and
465 (not rule.stacksourcefile_pattern or
466 rule.stacksourcefile_pattern.match(stacksourcefile)) and
467 (not rule.typeinfo_pattern or rule.typeinfo_pattern.match(typeinfo))):
468 bucket.component_cache = rule.name
469 return rule.name
471 assert False
473 @staticmethod
474 def load(filename, filetype):
475 """Loads a policy file of |filename| in a |format|.
477 Args:
478 filename: A filename to be loaded.
479 filetype: A string to specify a type of the file. Only 'json' is
480 supported for now.
482 Returns:
483 A loaded Policy object.
485 with open(os.path.join(BASE_PATH, filename)) as policy_f:
486 return Policy.parse(policy_f, filetype)
488 @staticmethod
489 def parse(policy_f, filetype):
490 """Parses a policy file content in a |format|.
492 Args:
493 policy_f: An IO object to be loaded.
494 filetype: A string to specify a type of the file. Only 'json' is
495 supported for now.
497 Returns:
498 A loaded Policy object.
500 if filetype == 'json':
501 return Policy._parse_json(policy_f)
502 else:
503 return None
505 @staticmethod
506 def _parse_json(policy_f):
507 """Parses policy file in json format.
509 A policy file contains component's names and their stacktrace pattern
510 written in regular expression. Those patterns are matched against each
511 symbols of each stacktraces in the order written in the policy file
513 Args:
514 policy_f: A File/IO object to read.
516 Returns:
517 A loaded policy object.
519 policy = json.load(policy_f)
521 rules = []
522 for rule in policy['rules']:
523 stackfunction = rule.get('stackfunction') or rule.get('stacktrace')
524 stacksourcefile = rule.get('stacksourcefile')
525 rules.append(Rule(
526 rule['name'],
527 rule['allocator'] == 'mmap',
528 stackfunction,
529 stacksourcefile,
530 rule['typeinfo'] if 'typeinfo' in rule else None))
532 return Policy(rules, policy['version'], policy['components'])
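# A minimal policy JSON sketch based on the fields read by _parse_json()
# above.  The component names, rule patterns and version value are made-up
# examples, not a real Chromium policy file.
#   {
#     "version": "POLICY_DEEP_3",
#     "components": ["mmap-v8", "tc-std-string", "unknown"],
#     "rules": [
#       {"name": "mmap-v8", "allocator": "mmap",
#        "stackfunction": ".*v8::.*"},
#       {"name": "tc-std-string", "allocator": "malloc",
#        "stackfunction": ".*std::basic_string.*"},
#       {"name": "unknown", "allocator": "malloc", "stackfunction": ".*"}
#     ]
#   }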
535 class PolicySet(object):
536 """Represents a set of policies."""
538 def __init__(self, policy_directory):
539 self._policy_directory = policy_directory
541 @staticmethod
542 def load(labels=None):
543 """Loads a set of policies via the "default policy directory".
545 The "default policy directory" contains pairs of policies and their labels.
546 For example, a policy "policy.l0.json" is labeled "l0" in the default
547 policy directory "policies.json".
549 All policies in the directory are loaded by default. Policies can be
550 limited by |labels|.
552 Args:
553 labels: An array that contains policy labels to be loaded.
555 Returns:
A PolicySet object.
"""
558 default_policy_directory = PolicySet._load_default_policy_directory()
559 if labels:
560 specified_policy_directory = {}
561 for label in labels:
562 if label in default_policy_directory:
563 specified_policy_directory[label] = default_policy_directory[label]
564 # TODO(dmikurube): Load an un-labeled policy file.
565 return PolicySet._load_policies(specified_policy_directory)
566 else:
567 return PolicySet._load_policies(default_policy_directory)
569 def __len__(self):
570 return len(self._policy_directory)
572 def __iter__(self):
573 for label in self._policy_directory:
574 yield label
576 def __getitem__(self, label):
577 return self._policy_directory[label]
579 @staticmethod
580 def _load_default_policy_directory():
581 with open(POLICIES_JSON_PATH, mode='r') as policies_f:
582 default_policy_directory = json.load(policies_f)
583 return default_policy_directory
585 @staticmethod
586 def _load_policies(directory):
587 LOGGER.info('Loading policy files.')
588 policies = {}
589 for label in directory:
590 LOGGER.info(' %s: %s' % (label, directory[label]['file']))
591 loaded = Policy.load(directory[label]['file'], directory[label]['format'])
592 if loaded:
593 policies[label] = loaded
594 return PolicySet(policies)
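# A hypothetical sketch of policies.json, based on the 'file' and 'format'
# keys read in _load_policies() and the 'l0'/'policy.l0.json' example from
# PolicySet.load()'s docstring:
#   {
#     "l0": {"file": "policy.l0.json", "format": "json"}
#   }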
597 class Bucket(object):
598 """Represents a bucket, which is a unit of memory block classification."""
600 def __init__(self, stacktrace, mmap, typeinfo, typeinfo_name):
601 self._stacktrace = stacktrace
602 self._mmap = mmap
603 self._typeinfo = typeinfo
604 self._typeinfo_name = typeinfo_name
606 self._symbolized_stackfunction = stacktrace
607 self._symbolized_joined_stackfunction = ''
608 self._symbolized_stacksourcefile = stacktrace
609 self._symbolized_joined_stacksourcefile = ''
610 self._symbolized_typeinfo = typeinfo_name
612 self.component_cache = ''
614 def __str__(self):
615 result = []
616 result.append('mmap' if self._mmap else 'malloc')
617 if self._symbolized_typeinfo == 'no typeinfo':
618 result.append('tno_typeinfo')
619 else:
620 result.append('t' + self._symbolized_typeinfo)
621 result.append('n' + self._typeinfo_name)
622 result.extend(['%s(@%s)' % (function, sourcefile)
623 for function, sourcefile
624 in zip(self._symbolized_stackfunction,
625 self._symbolized_stacksourcefile)])
626 return ' '.join(result)
628 def symbolize(self, symbol_mapping_cache):
629 """Makes a symbolized stacktrace and typeinfo with |symbol_mapping_cache|.
631 Args:
symbol_mapping_cache: A SymbolMappingCache object.
"""
634 # TODO(dmikurube): Fill explicitly with numbers if symbol not found.
635 self._symbolized_stackfunction = [
636 symbol_mapping_cache.lookup(FUNCTION_SYMBOLS, address)
637 for address in self._stacktrace]
638 self._symbolized_joined_stackfunction = ' '.join(
639 self._symbolized_stackfunction)
640 self._symbolized_stacksourcefile = [
641 symbol_mapping_cache.lookup(SOURCEFILE_SYMBOLS, address)
642 for address in self._stacktrace]
643 self._symbolized_joined_stacksourcefile = ' '.join(
644 self._symbolized_stacksourcefile)
645 if not self._typeinfo:
646 self._symbolized_typeinfo = 'no typeinfo'
647 else:
648 self._symbolized_typeinfo = symbol_mapping_cache.lookup(
649 TYPEINFO_SYMBOLS, self._typeinfo)
650 if not self._symbolized_typeinfo:
651 self._symbolized_typeinfo = 'no typeinfo'
653 def clear_component_cache(self):
654 self.component_cache = ''
656 @property
657 def stacktrace(self):
658 return self._stacktrace
660 @property
661 def mmap(self):
662 return self._mmap
664 @property
665 def typeinfo(self):
666 return self._typeinfo
668 @property
669 def typeinfo_name(self):
670 return self._typeinfo_name
672 @property
673 def symbolized_stackfunction(self):
674 return self._symbolized_stackfunction
676 @property
677 def symbolized_joined_stackfunction(self):
678 return self._symbolized_joined_stackfunction
680 @property
681 def symbolized_stacksourcefile(self):
682 return self._symbolized_stacksourcefile
684 @property
685 def symbolized_joined_stacksourcefile(self):
686 return self._symbolized_joined_stacksourcefile
688 @property
689 def symbolized_typeinfo(self):
690 return self._symbolized_typeinfo
693 class BucketSet(object):
694 """Represents a set of bucket."""
695 def __init__(self):
696 self._buckets = {}
697 self._code_addresses = set()
698 self._typeinfo_addresses = set()
700 def load(self, prefix):
701 """Loads all related bucket files.
703 Args:
prefix: A prefix string for bucket file names.
"""
706 LOGGER.info('Loading bucket files.')
708 n = 0
709 while True:
710 path = '%s.%04d.buckets' % (prefix, n)
711 if not os.path.exists(path):
712 if n > 10:
713 break
714 n += 1
715 continue
716 LOGGER.info(' %s' % path)
717 with open(path, 'r') as f:
718 self._load_file(f)
719 n += 1
721 def _load_file(self, bucket_f):
722 for line in bucket_f:
723 words = line.split()
724 typeinfo = None
725 typeinfo_name = ''
726 stacktrace_begin = 2
727 for index, word in enumerate(words):
728 if index < 2:
729 continue
730 if word[0] == 't':
731 typeinfo = int(word[1:], 16)
732 self._typeinfo_addresses.add(typeinfo)
733 elif word[0] == 'n':
734 typeinfo_name = word[1:]
735 else:
736 stacktrace_begin = index
737 break
738 stacktrace = [int(address, 16) for address in words[stacktrace_begin:]]
739 for frame in stacktrace:
740 self._code_addresses.add(frame)
741 self._buckets[int(words[0])] = Bucket(
742 stacktrace, words[1] == 'mmap', typeinfo, typeinfo_name)
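# A sketch of one bucket file line as parsed by _load_file() above (all
# numbers and names are hypothetical):
#   <bucket id> <mmap|malloc> [t<typeinfo address>] [n<typeinfo name>] <stack addresses...>
#   e.g. "123 malloc t7f3b2c009abc nstd::string 7f3b2c001234 7f3b2c005678"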
744 def __iter__(self):
745 for bucket_id, bucket_content in self._buckets.iteritems():
746 yield bucket_id, bucket_content
748 def __getitem__(self, bucket_id):
749 return self._buckets[bucket_id]
751 def get(self, bucket_id):
752 return self._buckets.get(bucket_id)
754 def symbolize(self, symbol_mapping_cache):
755 for bucket_content in self._buckets.itervalues():
756 bucket_content.symbolize(symbol_mapping_cache)
758 def clear_component_cache(self):
759 for bucket_content in self._buckets.itervalues():
760 bucket_content.clear_component_cache()
762 def iter_addresses(self, symbol_type):
763 if symbol_type in [FUNCTION_SYMBOLS, SOURCEFILE_SYMBOLS]:
764 for function in self._code_addresses:
765 yield function
766 else:
767 for function in self._typeinfo_addresses:
768 yield function
771 class Dump(object):
772 """Represents a heap profile dump."""
774 _PATH_PATTERN = re.compile(r'^(.*)\.([0-9]+)\.([0-9]+)\.heap$')
776 _HOOK_PATTERN = re.compile(
777 r'^ ([ \(])([a-f0-9]+)([ \)])-([ \(])([a-f0-9]+)([ \)])\s+'
778 r'(hooked|unhooked)\s+(.+)$', re.IGNORECASE)
780 _TIME_PATTERN = re.compile(
781 r'^Time: ([0-9]+/[0-9]+/[0-9]+ [0-9]+:[0-9]+:[0-9]+)(\.[0-9]+)?')
783 def __init__(self, path, modified_time):
784 self._path = path
785 matched = self._PATH_PATTERN.match(path)
786 self._pid = int(matched.group(2))
787 self._count = int(matched.group(3))
788 self._time = modified_time
789 self._map = {}
790 self._procmaps = ExclusiveRangeDict(ProcMapsEntryAttribute)
791 self._stacktrace_lines = []
792 self._global_stats = {} # used only in apply_policy
794 self._version = ''
795 self._lines = []
797 @property
798 def path(self):
799 return self._path
801 @property
802 def count(self):
803 return self._count
805 @property
806 def time(self):
807 return self._time
809 @property
810 def iter_map(self):
811 for region in sorted(self._map.iteritems()):
812 yield region[0], region[1]
814 def iter_procmaps(self):
815 for begin, end, attr in self._map.iter_range():
816 yield begin, end, attr
818 @property
819 def iter_stacktrace(self):
820 for line in self._stacktrace_lines:
821 yield line
823 def global_stat(self, name):
824 return self._global_stats[name]
826 @staticmethod
827 def load(path, log_header='Loading a heap profile dump: '):
828 """Loads a heap profile dump.
830 Args:
831 path: A file path string to load.
832 log_header: A preceding string for log messages.
834 Returns:
835 A loaded Dump object.
837 Raises:
ParsingException for invalid heap profile dumps.
"""
840 dump = Dump(path, os.stat(path).st_mtime)
841 with open(path, 'r') as f:
842 dump.load_file(f, log_header)
843 return dump
845 def load_file(self, f, log_header):
846 self._lines = [line for line in f
847 if line and not line.startswith('#')]
849 try:
850 self._version, ln = self._parse_version()
851 self._parse_meta_information()
852 if self._version == DUMP_DEEP_6:
853 self._parse_mmap_list()
854 self._parse_global_stats()
855 self._extract_stacktrace_lines(ln)
856 except EmptyDumpException:
857 LOGGER.info('%s%s ...ignored an empty dump.' % (log_header, self._path))
858 except ParsingException, e:
859 LOGGER.error('%s%s ...error %s' % (log_header, self._path, e))
860 raise
861 else:
862 LOGGER.info('%s%s (version:%s)' % (log_header, self._path, self._version))
864 def _parse_version(self):
865 """Parses a version string in self._lines.
867 Returns:
868 A pair of (a string representing a version of the stacktrace dump,
869 and an integer indicating a line number next to the version string).
871 Raises:
ParsingException for invalid dump versions.
"""
874 version = ''
876 # Skip until an identifiable line.
877 headers = ('STACKTRACES:\n', 'MMAP_STACKTRACES:\n', 'heap profile: ')
878 if not self._lines:
879 raise EmptyDumpException('Empty heap dump file.')
880 (ln, found) = skip_while(
881 0, len(self._lines),
882 lambda n: not self._lines[n].startswith(headers))
883 if not found:
884 raise InvalidDumpException('No version header.')
886 # Identify a version.
887 if self._lines[ln].startswith('heap profile: '):
888 version = self._lines[ln][13:].strip()
889 if version in (DUMP_DEEP_5, DUMP_DEEP_6):
890 (ln, _) = skip_while(
891 ln, len(self._lines),
892 lambda n: self._lines[n] != 'STACKTRACES:\n')
893 elif version in DUMP_DEEP_OBSOLETE:
894 raise ObsoleteDumpVersionException(version)
895 else:
896 raise InvalidDumpException('Invalid version: %s' % version)
897 elif self._lines[ln] == 'STACKTRACES:\n':
898 raise ObsoleteDumpVersionException(DUMP_DEEP_1)
899 elif self._lines[ln] == 'MMAP_STACKTRACES:\n':
900 raise ObsoleteDumpVersionException(DUMP_DEEP_2)
902 return (version, ln)
904 def _parse_global_stats(self):
905 """Parses lines in self._lines as global stats."""
906 (ln, _) = skip_while(
907 0, len(self._lines),
908 lambda n: self._lines[n] != 'GLOBAL_STATS:\n')
910 global_stat_names = [
911 'total', 'absent', 'file-exec', 'file-nonexec', 'anonymous', 'stack',
912 'other', 'nonprofiled-absent', 'nonprofiled-anonymous',
913 'nonprofiled-file-exec', 'nonprofiled-file-nonexec',
914 'nonprofiled-stack', 'nonprofiled-other',
915 'profiled-mmap', 'profiled-malloc']
917 for prefix in global_stat_names:
918 (ln, _) = skip_while(
919 ln, len(self._lines),
920 lambda n: self._lines[n].split()[0] != prefix)
921 words = self._lines[ln].split()
922 self._global_stats[prefix + '_virtual'] = int(words[-2])
923 self._global_stats[prefix + '_committed'] = int(words[-1])
925 def _parse_meta_information(self):
926 """Parses lines in self._lines for meta information."""
927 (ln, found) = skip_while(
928 0, len(self._lines),
929 lambda n: self._lines[n] != 'META:\n')
930 if not found:
931 return
932 ln += 1
934 while True:
935 if self._lines[ln].startswith('Time:'):
936 matched = self._TIME_PATTERN.match(self._lines[ln])
937 if matched:
938 self._time = time.mktime(datetime.datetime.strptime(
939 matched.group(1), '%Y/%m/%d %H:%M:%S').timetuple())
940 if matched.group(2):
941 self._time += float(matched.group(2)[1:]) / 1000.0
942 else:
943 break
944 ln += 1
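# A sketch of a META 'Time:' line accepted by _TIME_PATTERN (the timestamp
# itself is a made-up example):
#   Time: 2013/03/15 12:34:56.789
# The fractional part, if present, is added as milliseconds to the epoch time
# computed with strptime('%Y/%m/%d %H:%M:%S').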
946 def _parse_mmap_list(self):
947 """Parses lines in self._lines as a mmap list."""
948 (ln, found) = skip_while(
949 0, len(self._lines),
950 lambda n: self._lines[n] != 'MMAP_LIST:\n')
951 if not found:
952 return {}
954 ln += 1
955 self._map = {}
956 while True:
957 entry = proc_maps.ProcMaps.parse_line(self._lines[ln])
958 if entry:
959 for _, _, attr in self._procmaps.iter_range(entry.begin, entry.end):
960 for key, value in entry.as_dict().iteritems():
961 attr[key] = value
962 ln += 1
963 continue
964 matched = self._HOOK_PATTERN.match(self._lines[ln])
965 if not matched:
966 break
967 # 2: starting address
968 # 5: end address
969 # 7: hooked or unhooked
970 # 8: additional information
971 self._map[(int(matched.group(2), 16),
972 int(matched.group(5), 16))] = (matched.group(7),
973 matched.group(8))
974 ln += 1
976 def _extract_stacktrace_lines(self, line_number):
977 """Extracts the position of stacktrace lines.
979 Valid stacktrace lines are stored into self._stacktrace_lines.
981 Args:
982 line_number: A line number to start parsing in lines.
984 Raises:
985 ParsingException for invalid dump versions.
987 if self._version in (DUMP_DEEP_5, DUMP_DEEP_6):
988 (line_number, _) = skip_while(
989 line_number, len(self._lines),
990 lambda n: not self._lines[n].split()[0].isdigit())
991 stacktrace_start = line_number
992 (line_number, _) = skip_while(
993 line_number, len(self._lines),
994 lambda n: self._check_stacktrace_line(self._lines[n]))
995 self._stacktrace_lines = self._lines[stacktrace_start:line_number]
997 elif self._version in DUMP_DEEP_OBSOLETE:
998 raise ObsoleteDumpVersionException(self._version)
1000 else:
1001 raise InvalidDumpException('Invalid version: %s' % self._version)
1003 @staticmethod
1004 def _check_stacktrace_line(stacktrace_line):
1005 """Checks if a given stacktrace_line is valid as stacktrace.
1007 Args:
1008 stacktrace_line: A string to be checked.
1010 Returns:
1011 True if the given stacktrace_line is valid.
1013 words = stacktrace_line.split()
1014 if len(words) < BUCKET_ID + 1:
1015 return False
1016 if words[BUCKET_ID - 1] != '@':
1017 return False
1018 return True
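# A sketch of a stacktrace line accepted by _check_stacktrace_line(), laid out
# according to the column constants VIRTUAL, COMMITTED, ALLOC_COUNT,
# FREE_COUNT and BUCKET_ID defined at the top of this file (all numbers are
# hypothetical):
#   <virtual> <committed> <alloc count> <free count> @ <bucket id>
#   1048576 524288 10 2 @ 123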
1021 class DumpList(object):
1022 """Represents a sequence of heap profile dumps."""
1024 def __init__(self, dump_list):
1025 self._dump_list = dump_list
1027 @staticmethod
1028 def load(path_list):
1029 LOGGER.info('Loading heap dump profiles.')
1030 dump_list = []
1031 for path in path_list:
1032 dump_list.append(Dump.load(path, ' '))
1033 return DumpList(dump_list)
1035 def __len__(self):
1036 return len(self._dump_list)
1038 def __iter__(self):
1039 for dump in self._dump_list:
1040 yield dump
1042 def __getitem__(self, index):
1043 return self._dump_list[index]
1046 class Command(object):
1047 """Subclasses are a subcommand for this executable.
1049 See COMMANDS in main().
1051 def __init__(self, usage):
1052 self._parser = optparse.OptionParser(usage)
1054 @staticmethod
1055 def load_basic_files(
1056 dump_path, multiple, no_dump=False, fake_directories=None):
1057 prefix = Command._find_prefix(dump_path)
1058 symbol_data_sources = SymbolDataSources(prefix, fake_directories or {})
1059 symbol_data_sources.prepare()
1060 bucket_set = BucketSet()
1061 bucket_set.load(prefix)
1062 if not no_dump:
1063 if multiple:
1064 dump_list = DumpList.load(Command._find_all_dumps(dump_path))
1065 else:
1066 dump = Dump.load(dump_path)
1067 symbol_mapping_cache = SymbolMappingCache()
1068 with open(prefix + '.cache.function', 'a+') as cache_f:
1069 symbol_mapping_cache.update(
1070 FUNCTION_SYMBOLS, bucket_set,
1071 SymbolFinder(FUNCTION_SYMBOLS, symbol_data_sources), cache_f)
1072 with open(prefix + '.cache.typeinfo', 'a+') as cache_f:
1073 symbol_mapping_cache.update(
1074 TYPEINFO_SYMBOLS, bucket_set,
1075 SymbolFinder(TYPEINFO_SYMBOLS, symbol_data_sources), cache_f)
1076 with open(prefix + '.cache.sourcefile', 'a+') as cache_f:
1077 symbol_mapping_cache.update(
1078 SOURCEFILE_SYMBOLS, bucket_set,
1079 SymbolFinder(SOURCEFILE_SYMBOLS, symbol_data_sources), cache_f)
1080 bucket_set.symbolize(symbol_mapping_cache)
1081 if no_dump:
1082 return bucket_set
1083 elif multiple:
1084 return (bucket_set, dump_list)
1085 else:
1086 return (bucket_set, dump)
1088 @staticmethod
1089 def _find_prefix(path):
return re.sub(r'\.[0-9][0-9][0-9][0-9]\.heap', '', path)
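# For example (hypothetical file name), _find_prefix() maps
# 'chrome.12345.0002.heap' to the prefix 'chrome.12345', which is then used to
# find the '.maps', '.NNNN.buckets' and '.cache.*' companion files.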
1092 @staticmethod
1093 def _find_all_dumps(dump_path):
1094 prefix = Command._find_prefix(dump_path)
1095 dump_path_list = [dump_path]
1097 n = int(dump_path[len(dump_path) - 9 : len(dump_path) - 5])
1098 n += 1
1099 while True:
1100 p = '%s.%04d.heap' % (prefix, n)
1101 if os.path.exists(p):
1102 dump_path_list.append(p)
1103 else:
1104 break
1105 n += 1
1107 return dump_path_list
1109 @staticmethod
1110 def _find_all_buckets(dump_path):
1111 prefix = Command._find_prefix(dump_path)
1112 bucket_path_list = []
1114 n = 0
1115 while True:
1116 path = '%s.%04d.buckets' % (prefix, n)
1117 if not os.path.exists(path):
1118 if n > 10:
1119 break
1120 n += 1
1121 continue
1122 bucket_path_list.append(path)
1123 n += 1
1125 return bucket_path_list
1127 def _parse_args(self, sys_argv, required):
1128 options, args = self._parser.parse_args(sys_argv)
1129 if len(args) != required + 1:
1130 self._parser.error('needs %d argument(s).\n' % required)
1131 return None
1132 return (options, args)
1134 @staticmethod
1135 def _parse_policy_list(options_policy):
1136 if options_policy:
1137 return options_policy.split(',')
1138 else:
1139 return None
1142 class BucketsCommand(Command):
1143 def __init__(self):
1144 super(BucketsCommand, self).__init__('Usage: %prog buckets <first-dump>')
1146 def do(self, sys_argv, out=sys.stdout):
1147 _, args = self._parse_args(sys_argv, 1)
1148 dump_path = args[1]
1149 bucket_set = Command.load_basic_files(dump_path, True, True)
1151 BucketsCommand._output(bucket_set, out)
1152 return 0
1154 @staticmethod
1155 def _output(bucket_set, out):
1156 """Prints all buckets with resolving symbols.
1158 Args:
1159 bucket_set: A BucketSet object.
1160 out: An IO object to output.
1162 for bucket_id, bucket in sorted(bucket_set):
1163 out.write('%d: %s\n' % (bucket_id, bucket))
1166 class StacktraceCommand(Command):
1167 def __init__(self):
1168 super(StacktraceCommand, self).__init__(
1169 'Usage: %prog stacktrace <dump>')
1171 def do(self, sys_argv):
1172 _, args = self._parse_args(sys_argv, 1)
1173 dump_path = args[1]
1174 (bucket_set, dump) = Command.load_basic_files(dump_path, False)
1176 StacktraceCommand._output(dump, bucket_set, sys.stdout)
1177 return 0
1179 @staticmethod
1180 def _output(dump, bucket_set, out):
1181 """Outputs a given stacktrace.
1183 Args:
1184 bucket_set: A BucketSet object.
1185 out: A file object to output.
1187 for line in dump.iter_stacktrace:
1188 words = line.split()
1189 bucket = bucket_set.get(int(words[BUCKET_ID]))
1190 if not bucket:
1191 continue
1192 for i in range(0, BUCKET_ID - 1):
1193 out.write(words[i] + ' ')
1194 for frame in bucket.symbolized_stackfunction:
1195 out.write(frame + ' ')
1196 out.write('\n')
1199 class PolicyCommands(Command):
1200 def __init__(self, command):
1201 super(PolicyCommands, self).__init__(
1202 'Usage: %%prog %s [-p POLICY] <first-dump>' % command)
1203 self._parser.add_option('-p', '--policy', type='string', dest='policy',
1204 help='profile with POLICY', metavar='POLICY')
1205 self._parser.add_option('--fake-directories', dest='fake_directories',
1206 metavar='/path/on/target@/path/on/host[:...]',
1207 help='Read files in /path/on/host/ instead of '
1208 'files in /path/on/target/.')
1210 def _set_up(self, sys_argv):
1211 options, args = self._parse_args(sys_argv, 1)
1212 dump_path = args[1]
1213 fake_directories_dict = {}
1214 if options.fake_directories:
1215 for fake_directory_pair in options.fake_directories.split(':'):
1216 target_path, host_path = fake_directory_pair.split('@', 1)
1217 fake_directories_dict[target_path] = host_path
1218 (bucket_set, dumps) = Command.load_basic_files(
1219 dump_path, True, fake_directories=fake_directories_dict)
1221 policy_set = PolicySet.load(Command._parse_policy_list(options.policy))
1222 return policy_set, dumps, bucket_set
1224 @staticmethod
1225 def _apply_policy(dump, policy, bucket_set, first_dump_time):
1226 """Aggregates the total memory size of each component.
1228 Iterate through all stacktraces and attribute them to one of the components
1229 based on the policy. It is important to apply policy in right order.
1231 Args:
1232 dump: A Dump object.
1233 policy: A Policy object.
1234 bucket_set: A BucketSet object.
1235 first_dump_time: An integer representing time when the first dump is
1236 dumped.
1238 Returns:
1239 A dict mapping components and their corresponding sizes.
1241 LOGGER.info(' %s' % dump.path)
1242 sizes = dict((c, 0) for c in policy.components)
1244 PolicyCommands._accumulate(dump, policy, bucket_set, sizes)
1246 sizes['mmap-no-log'] = (
1247 dump.global_stat('profiled-mmap_committed') -
1248 sizes['mmap-total-log'])
1249 sizes['mmap-total-record'] = dump.global_stat('profiled-mmap_committed')
1250 sizes['mmap-total-record-vm'] = dump.global_stat('profiled-mmap_virtual')
1252 sizes['tc-no-log'] = (
1253 dump.global_stat('profiled-malloc_committed') -
1254 sizes['tc-total-log'])
1255 sizes['tc-total-record'] = dump.global_stat('profiled-malloc_committed')
1256 sizes['tc-unused'] = (
1257 sizes['mmap-tcmalloc'] -
1258 dump.global_stat('profiled-malloc_committed'))
1259 sizes['tc-total'] = sizes['mmap-tcmalloc']
1261 for key, value in {
1262 'total': 'total_committed',
1263 'filemapped': 'file_committed',
1264 'absent': 'absent_committed',
1265 'file-exec': 'file-exec_committed',
1266 'file-nonexec': 'file-nonexec_committed',
1267 'anonymous': 'anonymous_committed',
1268 'stack': 'stack_committed',
1269 'other': 'other_committed',
1270 'unhooked-absent': 'nonprofiled-absent_committed',
1271 'unhooked-anonymous': 'nonprofiled-anonymous_committed',
1272 'unhooked-file-exec': 'nonprofiled-file-exec_committed',
1273 'unhooked-file-nonexec': 'nonprofiled-file-nonexec_committed',
1274 'unhooked-stack': 'nonprofiled-stack_committed',
1275 'unhooked-other': 'nonprofiled-other_committed',
1276 'total-vm': 'total_virtual',
1277 'filemapped-vm': 'file_virtual',
1278 'anonymous-vm': 'anonymous_virtual',
1279 'other-vm': 'other_virtual' }.iteritems():
1280 if key in sizes:
1281 sizes[key] = dump.global_stat(value)
1283 if 'mustbezero' in sizes:
1284 removed_list = (
1285 'profiled-mmap_committed',
1286 'nonprofiled-absent_committed',
1287 'nonprofiled-anonymous_committed',
1288 'nonprofiled-file-exec_committed',
1289 'nonprofiled-file-nonexec_committed',
1290 'nonprofiled-stack_committed',
1291 'nonprofiled-other_committed')
1292 sizes['mustbezero'] = (
1293 dump.global_stat('total_committed') -
1294 sum(dump.global_stat(removed) for removed in removed_list))
1295 if 'total-exclude-profiler' in sizes:
1296 sizes['total-exclude-profiler'] = (
1297 dump.global_stat('total_committed') -
1298 (sizes['mmap-profiler'] + sizes['mmap-type-profiler']))
1299 if 'hour' in sizes:
1300 sizes['hour'] = (dump.time - first_dump_time) / 60.0 / 60.0
1301 if 'minute' in sizes:
1302 sizes['minute'] = (dump.time - first_dump_time) / 60.0
1303 if 'second' in sizes:
1304 sizes['second'] = dump.time - first_dump_time
1306 return sizes
1308 @staticmethod
1309 def _accumulate(dump, policy, bucket_set, sizes):
1310 for line in dump.iter_stacktrace:
1311 words = line.split()
1312 bucket = bucket_set.get(int(words[BUCKET_ID]))
1313 component_match = policy.find(bucket)
1314 sizes[component_match] += int(words[COMMITTED])
1316 if component_match.startswith('tc-'):
1317 sizes['tc-total-log'] += int(words[COMMITTED])
1318 elif component_match.startswith('mmap-'):
1319 sizes['mmap-total-log'] += int(words[COMMITTED])
1320 else:
1321 sizes['other-total-log'] += int(words[COMMITTED])
1324 class CSVCommand(PolicyCommands):
1325 def __init__(self):
1326 super(CSVCommand, self).__init__('csv')
1328 def do(self, sys_argv):
1329 policy_set, dumps, bucket_set = self._set_up(sys_argv)
1330 return CSVCommand._output(policy_set, dumps, bucket_set, sys.stdout)
1332 @staticmethod
1333 def _output(policy_set, dumps, bucket_set, out):
1334 max_components = 0
1335 for label in policy_set:
1336 max_components = max(max_components, len(policy_set[label].components))
1338 for label in sorted(policy_set):
1339 components = policy_set[label].components
1340 if len(policy_set) > 1:
1341 out.write('%s%s\n' % (label, ',' * (max_components - 1)))
1342 out.write('%s%s\n' % (
1343 ','.join(components), ',' * (max_components - len(components))))
1345 LOGGER.info('Applying a policy %s to...' % label)
1346 for dump in dumps:
1347 component_sizes = PolicyCommands._apply_policy(
1348 dump, policy_set[label], bucket_set, dumps[0].time)
1349 s = []
1350 for c in components:
1351 if c in ('hour', 'minute', 'second'):
1352 s.append('%05.5f' % (component_sizes[c]))
1353 else:
1354 s.append('%05.5f' % (component_sizes[c] / 1024.0 / 1024.0))
1355 out.write('%s%s\n' % (
1356 ','.join(s), ',' * (max_components - len(components))))
1358 bucket_set.clear_component_cache()
1360 return 0
1363 class JSONCommand(PolicyCommands):
1364 def __init__(self):
1365 super(JSONCommand, self).__init__('json')
1367 def do(self, sys_argv):
1368 policy_set, dumps, bucket_set = self._set_up(sys_argv)
1369 return JSONCommand._output(policy_set, dumps, bucket_set, sys.stdout)
1371 @staticmethod
1372 def _output(policy_set, dumps, bucket_set, out):
1373 json_base = {
1374 'version': 'JSON_DEEP_2',
'policies': {},
}
1378 for label in sorted(policy_set):
1379 json_base['policies'][label] = {
1380 'legends': policy_set[label].components,
'snapshots': [],
}
1384 LOGGER.info('Applying a policy %s to...' % label)
1385 for dump in dumps:
1386 component_sizes = PolicyCommands._apply_policy(
1387 dump, policy_set[label], bucket_set, dumps[0].time)
1388 component_sizes['dump_path'] = dump.path
1389 component_sizes['dump_time'] = datetime.datetime.fromtimestamp(
1390 dump.time).strftime('%Y-%m-%d %H:%M:%S')
1391 json_base['policies'][label]['snapshots'].append(component_sizes)
1393 bucket_set.clear_component_cache()
1395 json.dump(json_base, out, indent=2, sort_keys=True)
1397 return 0
1400 class ListCommand(PolicyCommands):
1401 def __init__(self):
1402 super(ListCommand, self).__init__('list')
1404 def do(self, sys_argv):
1405 policy_set, dumps, bucket_set = self._set_up(sys_argv)
1406 return ListCommand._output(policy_set, dumps, bucket_set, sys.stdout)
1408 @staticmethod
1409 def _output(policy_set, dumps, bucket_set, out):
1410 for label in sorted(policy_set):
1411 LOGGER.info('Applying a policy %s to...' % label)
1412 for dump in dumps:
1413 component_sizes = PolicyCommands._apply_policy(
1414 dump, policy_set[label], bucket_set, dump.time)
1415 out.write('%s for %s:\n' % (label, dump.path))
1416 for c in policy_set[label].components:
1417 if c in ['hour', 'minute', 'second']:
1418 out.write('%40s %12.3f\n' % (c, component_sizes[c]))
1419 else:
1420 out.write('%40s %12d\n' % (c, component_sizes[c]))
1422 bucket_set.clear_component_cache()
1424 return 0
1427 class MapCommand(Command):
1428 def __init__(self):
1429 super(MapCommand, self).__init__('Usage: %prog map <first-dump> <policy>')
1431 def do(self, sys_argv, out=sys.stdout):
1432 _, args = self._parse_args(sys_argv, 2)
1433 dump_path = args[1]
1434 target_policy = args[2]
1435 (bucket_set, dumps) = Command.load_basic_files(dump_path, True)
1436 policy_set = PolicySet.load(Command._parse_policy_list(target_policy))
1438 MapCommand._output(dumps, bucket_set, policy_set[target_policy], out)
1439 return 0
1441 @staticmethod
1442 def _output(dumps, bucket_set, policy, out):
1443 """Prints all stacktraces in a given component of given depth.
1445 Args:
1446 dumps: A list of Dump objects.
1447 bucket_set: A BucketSet object.
1448 policy: A Policy object.
1449 out: An IO object to output.
1451 max_dump_count = 0
1452 range_dict = ExclusiveRangeDict(ListAttribute)
1453 for dump in dumps:
1454 max_dump_count = max(max_dump_count, dump.count)
1455 for key, value in dump.iter_map:
1456 for begin, end, attr in range_dict.iter_range(key[0], key[1]):
1457 attr[dump.count] = value
1459 max_dump_count_digit = len(str(max_dump_count))
1460 for begin, end, attr in range_dict.iter_range():
1461 out.write('%x-%x\n' % (begin, end))
1462 if len(attr) < max_dump_count:
1463 attr[max_dump_count] = None
1464 for index, x in enumerate(attr[1:]):
1465 out.write(' #%0*d: ' % (max_dump_count_digit, index + 1))
1466 if not x:
1467 out.write('None\n')
1468 elif x[0] == 'hooked':
1469 attrs = x[1].split()
1470 assert len(attrs) == 3
1471 bucket_id = int(attrs[2])
1472 bucket = bucket_set.get(bucket_id)
1473 component = policy.find(bucket)
1474 out.write('hooked %s: %s @ %d\n' % (attrs[0], component, bucket_id))
1475 else:
1476 attrs = x[1].split()
1477 size = int(attrs[1])
1478 out.write('unhooked %s: %d bytes committed\n' % (attrs[0], size))
1481 class ExpandCommand(Command):
1482 def __init__(self):
1483 super(ExpandCommand, self).__init__(
1484 'Usage: %prog expand <dump> <policy> <component> <depth>')
1486 def do(self, sys_argv):
1487 _, args = self._parse_args(sys_argv, 4)
1488 dump_path = args[1]
1489 target_policy = args[2]
1490 component_name = args[3]
1491 depth = args[4]
1492 (bucket_set, dump) = Command.load_basic_files(dump_path, False)
1493 policy_set = PolicySet.load(Command._parse_policy_list(target_policy))
1495 ExpandCommand._output(dump, policy_set[target_policy], bucket_set,
1496 component_name, int(depth), sys.stdout)
1497 return 0
1499 @staticmethod
1500 def _output(dump, policy, bucket_set, component_name, depth, out):
1501 """Prints all stacktraces in a given component of given depth.
1503 Args:
1504 dump: A Dump object.
1505 policy: A Policy object.
1506 bucket_set: A BucketSet object.
1507 component_name: A name of component for filtering.
1508 depth: An integer representing depth to be printed.
1509 out: An IO object to output.
1511 sizes = {}
1513 ExpandCommand._accumulate(
1514 dump, policy, bucket_set, component_name, depth, sizes)
1516 sorted_sizes_list = sorted(
1517 sizes.iteritems(), key=(lambda x: x[1]), reverse=True)
1518 total = 0
1519 # TODO(dmikurube): Better formatting.
1520 for size_pair in sorted_sizes_list:
1521 out.write('%10d %s\n' % (size_pair[1], size_pair[0]))
1522 total += size_pair[1]
1523 LOGGER.info('total: %d\n' % total)
1525 @staticmethod
1526 def _accumulate(dump, policy, bucket_set, component_name, depth, sizes):
1527 for line in dump.iter_stacktrace:
1528 words = line.split()
1529 bucket = bucket_set.get(int(words[BUCKET_ID]))
1530 component_match = policy.find(bucket)
1531 if component_match == component_name:
1532 stacktrace_sequence = ''
1533 if bucket.typeinfo:
1534 stacktrace_sequence += '(type=%s)' % bucket.symbolized_typeinfo
1535 stacktrace_sequence += ' (type.name=%s) ' % bucket.typeinfo_name
1536 for function, sourcefile in zip(
1537 bucket.symbolized_stackfunction[
1538 0 : min(len(bucket.symbolized_stackfunction), 1 + depth)],
1539 bucket.symbolized_stacksourcefile[
1540 0 : min(len(bucket.symbolized_stacksourcefile), 1 + depth)]):
1541 stacktrace_sequence += '%s(@%s) ' % (function, sourcefile)
1542 if not stacktrace_sequence in sizes:
1543 sizes[stacktrace_sequence] = 0
1544 sizes[stacktrace_sequence] += int(words[COMMITTED])
1547 class PProfCommand(Command):
1548 def __init__(self):
1549 super(PProfCommand, self).__init__(
1550 'Usage: %prog pprof [-c COMPONENT] <dump> <policy>')
1551 self._parser.add_option('-c', '--component', type='string',
1552 dest='component',
1553 help='restrict to COMPONENT', metavar='COMPONENT')
1555 def do(self, sys_argv):
1556 options, args = self._parse_args(sys_argv, 2)
1558 dump_path = args[1]
1559 target_policy = args[2]
1560 component = options.component
1562 (bucket_set, dump) = Command.load_basic_files(dump_path, False)
1563 policy_set = PolicySet.load(Command._parse_policy_list(target_policy))
1565 with open(Command._find_prefix(dump_path) + '.maps', 'r') as maps_f:
1566 maps_lines = maps_f.readlines()
1567 PProfCommand._output(
1568 dump, policy_set[target_policy], bucket_set, maps_lines, component,
1569 sys.stdout)
1571 return 0
1573 @staticmethod
1574 def _output(dump, policy, bucket_set, maps_lines, component_name, out):
1575 """Converts the heap profile dump so it can be processed by pprof.
1577 Args:
1578 dump: A Dump object.
1579 policy: A Policy object.
1580 bucket_set: A BucketSet object.
1581 maps_lines: A list of strings containing /proc/.../maps.
1582 component_name: A name of component for filtering.
out: An IO object to output.
"""
1585 out.write('heap profile: ')
1586 com_committed, com_allocs = PProfCommand._accumulate(
1587 dump, policy, bucket_set, component_name)
1589 out.write('%6d: %8s [%6d: %8s] @ heapprofile\n' % (
1590 com_allocs, com_committed, com_allocs, com_committed))
1592 PProfCommand._output_stacktrace_lines(
1593 dump, policy, bucket_set, component_name, out)
1595 out.write('MAPPED_LIBRARIES:\n')
1596 for line in maps_lines:
1597 out.write(line)
1599 @staticmethod
1600 def _accumulate(dump, policy, bucket_set, component_name):
1601 """Accumulates size of committed chunks and the number of allocated chunks.
1603 Args:
1604 dump: A Dump object.
1605 policy: A Policy object.
1606 bucket_set: A BucketSet object.
1607 component_name: A name of component for filtering.
1609 Returns:
1610 Two integers which are the accumulated size of committed regions and the
number of allocated chunks, respectively.
"""
1613 com_committed = 0
1614 com_allocs = 0
1615 for line in dump.iter_stacktrace:
1616 words = line.split()
1617 bucket = bucket_set.get(int(words[BUCKET_ID]))
1618 if (not bucket or
1619 (component_name and component_name != policy.find(bucket))):
1620 continue
1622 com_committed += int(words[COMMITTED])
1623 com_allocs += int(words[ALLOC_COUNT]) - int(words[FREE_COUNT])
1625 return com_committed, com_allocs
1627 @staticmethod
1628 def _output_stacktrace_lines(dump, policy, bucket_set, component_name, out):
1629 """Prints information of stacktrace lines for pprof.
1631 Args:
1632 dump: A Dump object.
1633 policy: A Policy object.
1634 bucket_set: A BucketSet object.
1635 component_name: A name of component for filtering.
out: An IO object to output.
"""
1638 for line in dump.iter_stacktrace:
1639 words = line.split()
1640 bucket = bucket_set.get(int(words[BUCKET_ID]))
1641 if (not bucket or
1642 (component_name and component_name != policy.find(bucket))):
1643 continue
1645 out.write('%6d: %8s [%6d: %8s] @' % (
1646 int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]),
1647 words[COMMITTED],
1648 int(words[ALLOC_COUNT]) - int(words[FREE_COUNT]),
1649 words[COMMITTED]))
1650 for address in bucket.stacktrace:
1651 out.write(' 0x%016x' % address)
1652 out.write('\n')
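# A hypothetical sketch of the pprof-style record emitted above for a single
# stacktrace line (counts, sizes and addresses are made up):
#      8:     4096 [     8:     4096] @ 0x00007f3b2c001234 0x00007f3b2c005678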
1655 class UploadCommand(Command):
1656 def __init__(self):
1657 super(UploadCommand, self).__init__(
1658 'Usage: %prog upload [--gsutil path/to/gsutil] '
1659 '<first-dump> <destination-gs-path>')
1660 self._parser.add_option('--gsutil', default='gsutil',
1661 help='path to GSUTIL', metavar='GSUTIL')
1663 def do(self, sys_argv):
1664 options, args = self._parse_args(sys_argv, 2)
1665 dump_path = args[1]
1666 gs_path = args[2]
1668 dump_files = Command._find_all_dumps(dump_path)
1669 bucket_files = Command._find_all_buckets(dump_path)
1670 prefix = Command._find_prefix(dump_path)
1671 symbol_data_sources = SymbolDataSources(prefix)
1672 symbol_data_sources.prepare()
1673 symbol_path = symbol_data_sources.path()
1675 handle_zip, filename_zip = tempfile.mkstemp('.zip', 'dmprof')
1676 os.close(handle_zip)
1678 try:
1679 file_zip = zipfile.ZipFile(filename_zip, 'w', zipfile.ZIP_DEFLATED)
1680 for filename in dump_files:
1681 file_zip.write(filename, os.path.basename(os.path.abspath(filename)))
1682 for filename in bucket_files:
1683 file_zip.write(filename, os.path.basename(os.path.abspath(filename)))
1685 symbol_basename = os.path.basename(os.path.abspath(symbol_path))
1686 for filename in os.listdir(symbol_path):
1687 if not filename.startswith('.'):
1688 file_zip.write(os.path.join(symbol_path, filename),
1689 os.path.join(symbol_basename, os.path.basename(
1690 os.path.abspath(filename))))
1691 file_zip.close()
1693 returncode = UploadCommand._run_gsutil(
1694 options.gsutil, 'cp', '-a', 'public-read', filename_zip, gs_path)
1695 finally:
1696 os.remove(filename_zip)
1698 return returncode
1700 @staticmethod
1701 def _run_gsutil(gsutil, *args):
1702 """Run gsutil as a subprocess.
1704 Args:
1705 *args: Arguments to pass to gsutil. The first argument should be an
1706 operation such as ls, cp or cat.
1707 Returns:
The return code from the process.
"""
1710 command = [gsutil] + list(args)
1711 LOGGER.info("Running: %s", command)
1713 try:
1714 return subprocess.call(command)
1715 except OSError, e:
LOGGER.error('Failed to run gsutil: %s', e)
1719 def main():
1720 COMMANDS = {
1721 'buckets': BucketsCommand,
1722 'csv': CSVCommand,
1723 'expand': ExpandCommand,
1724 'json': JSONCommand,
1725 'list': ListCommand,
1726 'map': MapCommand,
1727 'pprof': PProfCommand,
1728 'stacktrace': StacktraceCommand,
'upload': UploadCommand,
}
1732 if len(sys.argv) < 2 or (not sys.argv[1] in COMMANDS):
1733 sys.stderr.write("""Usage: dmprof <command> [options] [<args>]
1735 Commands:
1736 buckets Dump a bucket list with resolving symbols
1737 csv Classify memory usage in CSV
1738 expand Show all stacktraces contained in the specified component
1739 json Classify memory usage in JSON
1740 list Classify memory usage in simple listing format
1741 map Show history of mapped regions
1742 pprof Format the profile dump so that it can be processed by pprof
1743 stacktrace Convert runtime addresses to symbol names
1744 upload Upload dumped files
1746 Quick Reference:
1747 dmprof buckets <first-dump>
1748 dmprof csv [-p POLICY] <first-dump>
1749 dmprof expand <dump> <policy> <component> <depth>
1750 dmprof json [-p POLICY] <first-dump>
1751 dmprof list [-p POLICY] <first-dump>
1752 dmprof map <first-dump> <policy>
1753 dmprof pprof [-c COMPONENT] <dump> <policy>
1754 dmprof stacktrace <dump>
1755 dmprof upload [--gsutil path/to/gsutil] <first-dump> <destination-gs-path>
1756 """)
1757 sys.exit(1)
1758 action = sys.argv.pop(1)
1760 LOGGER.setLevel(logging.DEBUG)
1761 handler = logging.StreamHandler()
1762 handler.setLevel(logging.INFO)
1763 formatter = logging.Formatter('%(message)s')
1764 handler.setFormatter(formatter)
1765 LOGGER.addHandler(handler)
1767 try:
1768 errorcode = COMMANDS[action]().do(sys.argv)
1769 except ParsingException, e:
1770 errorcode = 1
sys.stderr.write('Exiting due to a parsing error: %s\n' % e)
1773 return errorcode
1776 if __name__ == '__main__':
1777 sys.exit(main())