compiler-rt/lib/hwasan/scripts/hwasan_symbolize

   1 #!/usr/bin/env python3
   2 #===- lib/hwasan/scripts/hwasan_symbolize ----------------------------------===#
   3 #
   4 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
   5 # See https://llvm.org/LICENSE.txt for license information.
   6 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
   7 #
   8 #===------------------------------------------------------------------------===#
   9 #
  10 # HWAddressSanitizer offline symbolization script.
  11 #
  12 #===------------------------------------------------------------------------===#
  13
  14 from __future__ import print_function
  15 from __future__ import unicode_literals
  16
  17 import argparse
  18 import glob
  19 import html
  20 import json
  21 import mmap
  22 import os
  23 import re
  24 import struct
  25 import subprocess
  26 import sys
  27
# Python 2 compatibility shim: only runs when the interpreter's major version
# is below 3.
if sys.version_info.major < 3:
  # Simulate Python 3.x behaviour of defaulting to UTF-8 for print. This is
  # important in case any symbols are non-ASCII.
  import codecs
  sys.stdout = codecs.getwriter("utf-8")(sys.stdout)

# Below, a parser for a subset of ELF. It only supports 64 bit, little-endian,
# and only parses what is necessary to find the build ids. It uses a memoryview
# into an mmap to avoid copying.

# ELF file header: total size, and the byte offsets at which handle_elf reads
# the section-header count (as '<H') and the section-header table file offset
# (as '<Q').
Ehdr_size = 64
e_shnum_offset = 60
e_shoff_offset = 40

# Section header: size of one entry, and the byte offsets at which handle_Shdr
# reads the section type (as '<I'), the section's file offset and its size
# (both as '<Q'). Only sections whose type equals SHT_NOTE are inspected.
Shdr_size = 64
sh_type_offset = 4
sh_offset_offset = 24
sh_size_offset = 32
SHT_NOTE = 7

# Note header: three 32-bit little-endian fields (name size, descriptor size,
# type), hence 12 bytes. NT_GNU_BUILD_ID is the note type that handle_Nhdr
# looks for, together with the owner name "GNU\0".
Nhdr_size = 12
NT_GNU_BUILD_ID = 3
  49
  50 def align_up(size, alignment):
  51   return (size + alignment - 1) & ~(alignment - 1)
  52
  53 def handle_Nhdr(mv, sh_size):
  54   offset = 0
  55   while offset < sh_size:
  56     n_namesz, n_descsz, n_type = struct.unpack_from('<III', buffer=mv,
  57                                                     offset=offset)
  58     if (n_type == NT_GNU_BUILD_ID and n_namesz == 4 and
  59         mv[offset + Nhdr_size: offset + Nhdr_size + 4] == b"GNU\x00"):
  60       value = mv[offset + Nhdr_size + 4: offset + Nhdr_size + 4 + n_descsz]
  61       return value.hex()
  62     offset += Nhdr_size + align_up(n_namesz, 4) + align_up(n_descsz, 4)
  63   return None
  64
  65 def handle_Shdr(mv):
  66   sh_type, = struct.unpack_from('<I', buffer=mv, offset=sh_type_offset)
  67   if sh_type != SHT_NOTE:
  68     return None, None
  69   sh_offset, = struct.unpack_from('<Q', buffer=mv, offset=sh_offset_offset)
  70   sh_size, = struct.unpack_from('<Q', buffer=mv, offset=sh_size_offset)
  71   return sh_offset, sh_size
  72
  73 def handle_elf(mv):
  74   # \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
  75   # 64-bit little endian platforms (x86_64 and ARM64). If this changes, we will
  76   # have to extend the parsing code.
  77   if mv[:6] != b'\x7fELF\x02\x01':
  78     return None
  79   e_shnum, = struct.unpack_from('<H', buffer=mv, offset=e_shnum_offset)
  80   e_shoff, = struct.unpack_from('<Q', buffer=mv, offset=e_shoff_offset)
  81   for i in range(0, e_shnum):
  82     start = e_shoff + i * Shdr_size
  83     sh_offset, sh_size = handle_Shdr(mv[start: start + Shdr_size])
  84     if sh_offset is None:
  85       continue
  86     note_hdr = mv[sh_offset: sh_offset + sh_size]
  87     result = handle_Nhdr(note_hdr, sh_size)
  88     if result is not None:
  89       return result
  90
  91 def get_buildid(filename):
  92   with open(filename, "r") as fd:
  93     if os.fstat(fd.fileno()).st_size < Ehdr_size:
  94       return None
  95     with mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_READ) as m:
  96       with memoryview(m) as mv:
  97         return handle_elf(mv)
  98
  99 class Symbolizer:
 100   def __init__(self, path, binary_prefixes, paths_to_cut):
 101     self.__pipe = None
 102     self.__path = path
 103     self.__binary_prefixes = binary_prefixes
 104     self.__paths_to_cut = paths_to_cut
 105     self.__log = False
 106     self.__warnings = set()
 107     self.__index = {}
 108     self.__link_prefixes = []
 109     self.__html = False
 110     self.__last_access_address = None
 111     self.__last_access_tag = None
 112     self.__tag_dump = []
 113     self.__tag_dump_match_idx = None
 114     self.__matched_stack_uas = False
 115     self.__offsets = []
 116
 117   def enable_html(self, enable):
 118     self.__html = enable
 119
 120   def enable_logging(self, enable):
 121     self.__log = enable
 122
 123   def maybe_escape(self, text):
 124     if self.__html:
 125       # We need to manually use &nbsp; for leading spaces, html.escape does
 126       # not do that, and HTML ignores them.
 127       spaces = 0
 128       for i, c in enumerate(text):
 129         spaces = i
 130         if c != ' ':
 131           break
 132       text = text[spaces:]
 133       return spaces * '&nbsp;' + html.escape(text)
 134     return text
 135
 136   def print(self, line, escape=True):
 137     if escape:
 138       line = self.maybe_escape(line)
 139     if self.__html:
 140       line += '<br/>'
 141     print(line)
 142
 143   def read_linkify(self, filename):
 144     with open(filename, 'r') as fd:
 145       data = json.load(fd)
 146     self.__link_prefixes = [(e["prefix"], e["link"]) for e in data]
 147
 148   def __open_pipe(self):
 149     if not self.__pipe:
 150       opt = {}
 151       if sys.version_info.major > 2:
 152         opt['encoding'] = 'utf-8'
 153       self.__pipe = subprocess.Popen([self.__path, "--inlining", "--functions"],
 154                                      stdin=subprocess.PIPE, stdout=subprocess.PIPE,
 155                                      **opt)
 156
 157   class __EOF(Exception):
 158     pass
 159
 160   def __write(self, s):
 161     print(s, file=self.__pipe.stdin)
 162     self.__pipe.stdin.flush()
 163     if self.__log:
 164       print("#>>  |%s|" % (s,), file=sys.stderr)
 165
 166   def __read(self):
 167     s = self.__pipe.stdout.readline().rstrip()
 168     if self.__log:
 169       print("# << |%s|" % (s,), file=sys.stderr)
 170     if s == '':
 171       raise Symbolizer.__EOF
 172     return s
 173
 174   def __process_source_path(self, file_name):
 175     for path_to_cut in self.__paths_to_cut:
 176       file_name = re.sub(".*" + path_to_cut, "", file_name)
 177     file_name = re.sub(".*hwasan_[a-z_]*.(cc|h):[0-9]*", "[hwasan_rtl]", file_name)
 178     file_name = re.sub(".*asan_[a-z_]*.(cc|h):[0-9]*", "[asan_rtl]", file_name)
 179     file_name = re.sub(".*crtstuff.c:0", "???:0", file_name)
 180     return file_name
 181
 182   def __process_binary_name(self, name, buildid):
 183     if name.startswith('/'):
 184       name = name[1:]
 185     if buildid is not None and buildid in self.__index:
 186       return self.__index[buildid]
 187
 188     for p in self.__binary_prefixes:
 189       full_path = os.path.join(p, name)
 190       if os.path.exists(full_path):
 191         return full_path
 192       apex_prefix = "apex/com.android."
 193       if name.startswith(apex_prefix):
 194         full_path = os.path.join(p, "apex/com.google.android." + name[len(apex_prefix):])
 195         if os.path.exists(full_path):
 196           return full_path
 197     # Try stripping extra path components as the last resort.
 198     for p in self.__binary_prefixes:
 199       full_path = os.path.join(p, os.path.basename(name))
 200       if os.path.exists(full_path):
 201         return full_path
 202     if name not in self.__warnings:
 203       print("Could not find symbols for", name, file=sys.stderr)
 204       self.__warnings.add(name)
 205     return None
 206
 207   def iter_locals(self, binary, addr, buildid):
 208     self.__open_pipe()
 209     p = self.__pipe
 210     binary = self.__process_binary_name(binary, buildid)
 211     if not binary:
 212       return
 213     self.__write("FRAME %s %s" % (binary, addr))
 214     try:
 215       while True:
 216         function_name = self.__read()
 217         local_name = self.__read()
 218         file_line = self.__read()
 219         extra = self.__read().split()
 220
 221         file_line = self.__process_source_path(file_line)
 222         offset = None if extra[0] == '??' else int(extra[0])
 223         size = None if extra[1] == '??' else int(extra[1])
 224         tag_offset = None if extra[2] == '??' else int(extra[2])
 225         yield (function_name, file_line, local_name, offset, size, tag_offset)
 226     except Symbolizer.__EOF:
 227       pass
 228
 229   def iter_call_stack(self, binary, buildid, addr):
 230     self.__open_pipe()
 231     p = self.__pipe
 232     binary = self.__process_binary_name(binary, buildid)
 233     if not binary:
 234       return
 235     self.__write("CODE %s %s" % (binary, addr))
 236     try:
 237       while True:
 238         function_name = self.__read()
 239         file_line = self.__read()
 240         file_line = self.__process_source_path(file_line)
 241         yield (function_name, file_line)
 242     except Symbolizer.__EOF:
 243       pass
 244
 245   def maybe_linkify(self, file_line):
 246     if not self.__html or not self.__link_prefixes:
 247       return file_line
 248     filename, line_col = file_line.split(':', 1)
 249     if not line_col:
 250       line = '0' # simplify the link generation
 251     else:
 252       line = line_col.split(':')[0]
 253     longest_prefix = max((
 254       (prefix, link) for prefix, link in self.__link_prefixes
 255       if filename.startswith(prefix)),
 256       key=lambda x: len(x[0]), default=None)
 257     if longest_prefix is None:
 258       return file_line
 259     else:
 260       prefix, link = longest_prefix
 261       return '<a href="{}">{}</a>'.format(
 262         html.escape(link.format(file=filename[len(prefix):], line=line,
 263                                 file_line=file_line, prefix=prefix)), file_line)
 264
 265   def build_index(self):
 266     for p in self.__binary_prefixes:
 267       for dname, _, fnames in os.walk(p):
 268         for fn in fnames:
 269           filename = os.path.join(dname, fn)
 270           try:
 271             bid = get_buildid(filename)
 272           except FileNotFoundError:
 273             continue
 274           except Exception as e:
 275             print("Failed to parse {}: {}".format(filename, e), file=sys.stderr)
 276             continue
 277           if bid is not None:
 278             self.__index[bid] = filename
 279
 280   def symbolize_line(self, line):
 281     #0 0x7f6e35cf2e45  (/blah/foo.so+0x11fe45) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
 282     match = re.match(r'^(.*?)#([0-9]+)( *)(0x[0-9a-f]*) *\((.*)\+(0x[0-9a-f]+)\)'
 283                     r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
 284     if match:
 285       frameno = match.group(2)
 286       binary = match.group(5)
 287       addr = int(match.group(6), 16)
 288       buildid = match.group(7)
 289
 290       frames = list(self.iter_call_stack(binary, buildid, addr))
 291
 292       if len(frames) > 0:
 293         self.print(
 294           self.maybe_escape(
 295             "%s#%s%s%s in " % (match.group(1), match.group(2), match.group(3),
 296                               frames[0][0])
 297           ) + self.maybe_linkify(frames[0][1]),
 298           escape=False)
 299         for i in range(1, len(frames)):
 300           space1 = ' ' * match.end(1)
 301           space2 = ' ' * (match.start(4) - match.end(1) - 2)
 302           self.print(
 303             self.maybe_escape("%s->%s%s in " % (space1, space2, frames[i][0]))
 304               + self.maybe_linkify(frames[i][1]), escape=False)
 305       else:
 306         self.print(line.rstrip())
 307     else:
 308       self.print(line.rstrip())
 309
 310   def save_access_address(self, line):
 311     match = re.match(r'^(.*?)HWAddressSanitizer: tag-mismatch on address (0x[0-9a-f]+) ', line, re.UNICODE)
 312     if match:
 313       self.__last_access_address = int(match.group(2), 16)
 314     match = re.match(r'^(.*?) of size [0-9]+ at 0x[0-9a-f]* tags: ([0-9a-f]+)/[0-9a-f]+(\([0-9a-f]+\))? \(ptr/mem\)', line, re.UNICODE)
 315     if match:
 316       self.__last_access_tag = int(match.group(2), 16)
 317
 318   def process_tag_dump_line(self, line, ignore_tags=False):
 319     m = re.match(r'.*?(0x[0-9a-f]+):' + r'([ ]*[\[ ][0-9a-f][0-9a-f]\]?)' * 16, line)
 320     if m is None:
 321       return False
 322     addr = m.group(1)
 323     tags = m.group(*range(2, 18))
 324     fault = [i for i, x in enumerate(tags) if '[' in x]
 325     if fault:
 326       self.__tag_dump_match_idx = len(self.__tag_dump) + fault[0]
 327     self.__tag_dump.extend(int(x.strip(' [').rstrip('] '), 16) for x in tags)
 328     return True
 329
 330   def finish_tag_dump(self):
 331     if self.__matched_stack_uas or self.__tag_dump_match_idx is None:
 332       return
 333     for offset, size, local in sorted(self.__offsets, key=lambda x: abs(x[0])):
 334       idx = self.__tag_dump_match_idx - offset // 16
 335       if idx < 0 or idx > len(self.__tag_dump):
 336         continue
 337       if self.__tag_dump[idx] == self.__last_access_tag:
 338         self.print('')
 339         self.print('Potentially referenced stack object:')
 340         if offset > 0:
 341           self.print('  %d bytes after a variable "%s" in stack frame of function "%s"' % (offset - size, local[2], local[0]))
 342         if offset < 0:
 343           self.print('  %d bytes before a variable "%s" in stack frame of function "%s"' % (-offset, local[2], local[0]))
 344         self.print('  at %s' % (local[1],))
 345
 346   def process_stack_history(self, line, ignore_tags=False):
 347     if self.__last_access_address is None or self.__last_access_tag is None:
 348       return
 349     if re.match(r'Previously allocated frames:', line, re.UNICODE):
 350       return True
 351     pc_mask = (1 << 48) - 1
 352     fp_mask = (1 << 20) - 1
 353     # record_addr:0x1234ABCD record:0x1234ABCD (/path/to/binary+0x1234ABCD) (BuildId: 4abce4cd41ea5c2f34753297b7e774d9)
 354     match = re.match(r'^(.*?)record_addr:(0x[0-9a-f]+) +record:(0x[0-9a-f]+) +\((.*)\+(0x[0-9a-f]+)\)'
 355                     r'(?:\s*\(BuildId: ([0-9a-f]+)\))?', line, re.UNICODE)
 356     if match:
 357       record_addr = int(match.group(2), 16)
 358       record = int(match.group(3), 16)
 359       binary = match.group(4)
 360       addr = int(match.group(5), 16)
 361       buildid = match.group(6)
 362       base_tag = (record_addr >> 3) & 0xFF
 363       fp = (record >> 48) << 4
 364       pc = record & pc_mask
 365
 366       for local in self.iter_locals(binary, addr, buildid):
 367         frame_offset = local[3]
 368         size = local[4]
 369         if frame_offset is None or size is None:
 370           continue
 371         obj_offset = (self.__last_access_address & fp_mask) - ((fp & fp_mask) + frame_offset)
 372         tag_offset = local[5]
 373         if not ignore_tags and (tag_offset is None or base_tag ^ tag_offset != self.__last_access_tag):
 374           continue
 375         if obj_offset < 0 or obj_offset >= size:
 376           self.__offsets.append((obj_offset, size, local))
 377           continue
 378         self.print('')
 379         self.print('Potentially referenced stack object:')
 380         self.print('  %d bytes inside a variable "%s" in stack frame of function "%s"' % (obj_offset, local[2], local[0]))
 381         self.print('  at %s' % (local[1],))
 382         self.__matched_stack_uas = True
 383       return True
 384     return False
 385
 386 def extract_version(s):
 387   idx = s.rfind('-')
 388   if idx == -1:
 389     return 0
 390   x = float(s[idx + 1:])
 391   return x
 392
 393 def main():
 394   parser = argparse.ArgumentParser()
 395   parser.add_argument('-d', action='store_true')
 396   parser.add_argument('-v', action='store_true')
 397   parser.add_argument('--ignore-tags', action='store_true')
 398   parser.add_argument('--symbols', action='append')
 399   parser.add_argument('--source', action='append')
 400   parser.add_argument('--index', action='store_true')
 401   parser.add_argument('--symbolizer')
 402   parser.add_argument('--linkify', type=str)
 403   parser.add_argument('--html', action='store_true')
 404   parser.add_argument('args', nargs=argparse.REMAINDER)
 405   args = parser.parse_args()
 406
 407   # Unstripped binaries location.
 408   binary_prefixes = args.symbols or []
 409   if not binary_prefixes:
 410     if 'ANDROID_PRODUCT_OUT' in os.environ:
 411       product_out = os.path.join(os.environ['ANDROID_PRODUCT_OUT'], 'symbols')
 412       binary_prefixes.append(product_out)
 413     binary_prefixes.append('/')
 414
 415   for p in binary_prefixes:
 416     if not os.path.isdir(p):
 417       print("Symbols path does not exist or is not a directory:", p, file=sys.stderr)
 418       sys.exit(1)
 419
 420   # Source location.
 421   paths_to_cut = args.source or []
 422   if not paths_to_cut:
 423     paths_to_cut.append(os.getcwd() + '/')
 424     if 'ANDROID_BUILD_TOP' in os.environ:
 425       paths_to_cut.append(os.environ['ANDROID_BUILD_TOP'] + '/')
 426
 427   # llvm-symbolizer binary.
 428   # 1. --symbolizer flag
 429   # 2. environment variable
 430   # 3. unsuffixed binary in the current directory
 431   # 4. if inside Android platform, prebuilt binary at a known path
 432   # 5. first "llvm-symbolizer", then "llvm-symbolizer-$VER" with the
 433   #    highest available version in $PATH
 434   symbolizer_path = args.symbolizer
 435   if not symbolizer_path:
 436     if 'LLVM_SYMBOLIZER_PATH' in os.environ:
 437       symbolizer_path = os.environ['LLVM_SYMBOLIZER_PATH']
 438     elif 'HWASAN_SYMBOLIZER_PATH' in os.environ:
 439       symbolizer_path = os.environ['HWASAN_SYMBOLIZER_PATH']
 440
 441   if not symbolizer_path:
 442     s = os.path.join(os.path.dirname(sys.argv[0]), 'llvm-symbolizer')
 443     if os.path.exists(s):
 444       symbolizer_path = s
 445
 446   if not symbolizer_path:
 447     if 'ANDROID_BUILD_TOP' in os.environ:
 448       s = os.path.join(os.environ['ANDROID_BUILD_TOP'], 'prebuilts/clang/host/linux-x86/llvm-binutils-stable/llvm-symbolizer')
 449       if os.path.exists(s):
 450         symbolizer_path = s
 451
 452   if not symbolizer_path:
 453     for path in os.environ["PATH"].split(os.pathsep):
 454       p = os.path.join(path, 'llvm-symbolizer')
 455       if os.path.exists(p):
 456         symbolizer_path = p
 457         break
 458
 459   if not symbolizer_path:
 460     for path in os.environ["PATH"].split(os.pathsep):
 461       candidates = glob.glob(os.path.join(path, 'llvm-symbolizer-*'))
 462       if len(candidates) > 0:
 463         candidates.sort(key = extract_version, reverse = True)
 464         symbolizer_path = candidates[0]
 465         break
 466
 467   if not os.path.exists(symbolizer_path):
 468     print("Symbolizer path does not exist:", symbolizer_path, file=sys.stderr)
 469     sys.exit(1)
 470
 471   if args.v:
 472     print("Looking for symbols in:")
 473     for s in binary_prefixes:
 474       print("  %s" % (s,))
 475     print("Stripping source path prefixes:")
 476     for s in paths_to_cut:
 477       print("  %s" % (s,))
 478     print("Using llvm-symbolizer binary in:\n  %s" % (symbolizer_path,))
 479     print()
 480
 481   symbolizer = Symbolizer(symbolizer_path, binary_prefixes, paths_to_cut)
 482   symbolizer.enable_html(args.html)
 483   symbolizer.enable_logging(args.d)
 484   if args.index:
 485     symbolizer.build_index()
 486
 487   if args.linkify:
 488     if not args.html:
 489       print('Need --html to --linkify', file=sys.stderr)
 490       sys.exit(1)
 491     symbolizer.read_linkify(args.linkify)
 492
 493   tag_dump = False
 494   for line in sys.stdin:
 495     if sys.version_info.major < 3:
 496       line = line.decode('utf-8')
 497     if tag_dump:
 498       tag_dump = symbolizer.process_tag_dump_line(line)
 499       if tag_dump:
 500         continue
 501       symbolizer.finish_tag_dump()
 502     if 'Memory tags around the buggy address' in line:
 503       tag_dump = True
 504
 505     symbolizer.save_access_address(line)
 506     if symbolizer.process_stack_history(line, ignore_tags=args.ignore_tags):
 507       continue
 508     symbolizer.symbolize_line(line)
 509
 510
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()