tools/findit/crash_utils.py

   1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import atexit
   6 import cgi
   7 import ConfigParser
   8 import json
   9 import os
  10 import Queue
  11 import threading
  12 import time
  13
  14 from common import utils
  15 from result import Result
  16
  17
# Sentinel returned by the distance / stack-frame helpers below when there is
# nothing to compare.
INFINITY = float('inf')

# Number of worker threads started by RunTasks, and therefore the number of
# exit sentinels queued by SignalWorkerThreads.
MAX_THREAD_NUMBER = 10
# Shared task queue; stays None until RunTasks creates it on first use.
TASK_QUEUE = None
  22
  23
  24 def SignalWorkerThreads():
  25   global TASK_QUEUE
  26   if not TASK_QUEUE:
  27     return
  28
  29   for i in range(MAX_THREAD_NUMBER):
  30     TASK_QUEUE.put(None)
  31
  32   # Give worker threads a chance to exit.
  33   # Workaround the harmless bug in python 2.7 below.
  34   time.sleep(1)
  35
  36
# Ask the worker threads to exit when the interpreter shuts down.
atexit.register(SignalWorkerThreads)
  38
  39
  40 def Worker():
  41   global TASK_QUEUE
  42   while True:
  43     try:
  44       task = TASK_QUEUE.get()
  45       if not task:
  46         return
  47     except TypeError:
  48       # According to http://bugs.python.org/issue14623, this is a harmless bug
  49       # in python 2.7 which won't be fixed.
  50       # The exception is raised on daemon threads when python interpreter is
  51       # shutting down.
  52       return
  53
  54     function, args, kwargs, result_semaphore = task
  55     try:
  56       function(*args, **kwargs)
  57     except:
  58       pass
  59     finally:
  60       # Signal one task is done in case of exception.
  61       result_semaphore.release()
  62
  63
  64 def RunTasks(tasks):
  65   """Run given tasks. Not thread-safe: no concurrent calls of this function.
  66
  67   Return after all tasks were completed. A task is a dict as below:
  68     {
  69       'function': the function to call,
  70       'args': the positional argument to pass to the function,
  71       'kwargs': the key-value arguments to pass to the function,
  72     }
  73   """
  74   if not tasks:
  75     return
  76
  77   global TASK_QUEUE
  78   if not TASK_QUEUE:
  79     TASK_QUEUE = Queue.Queue()
  80     for index in range(MAX_THREAD_NUMBER):
  81       thread = threading.Thread(target=Worker, name='worker_%s' % index)
  82       # Set as daemon, so no join is needed.
  83       thread.daemon = True
  84       thread.start()
  85
  86   result_semaphore = threading.Semaphore(0)
  87   # Push task to task queue for execution.
  88   for task in tasks:
  89     TASK_QUEUE.put(
  90         (task['function'], task.get('args', []),
  91          task.get('kwargs', {}), result_semaphore))
  92
  93   # Wait until all tasks to be executed.
  94   for _ in tasks:
  95     result_semaphore.acquire()
  96
  97
  98 def GetRepositoryType(revision_number):
  99   """Returns the repository type of this revision number.
 100
 101   Args:
 102     revision_number: A revision number or git hash.
 103
 104   Returns:
 105     'git' or 'svn', depending on the revision_number.
 106   """
 107   if utils.IsGitHash(revision_number):
 108     return 'git'
 109   else:
 110     return 'svn'
 111
 112
 113 def ParseURLsFromConfig(file_name):
 114   """Parses URLS from the config file.
 115
 116   The file should be in python config format, where svn section is in the
 117   format "svn:component_path".
 118   Each of the section for svn should contain changelog_url, revision_url,
 119   diff_url and blame_url.
 120
 121   Args:
 122     file_name: The name of the file that contains URL information.
 123
 124   Returns:
 125     A dictionary that maps repository type to list of URLs. For svn, it maps
 126     key 'svn' to another dictionary, which maps component path to the URLs
 127     as explained above. For git, it maps to the URLs as explained above.
 128   """
 129   config = ConfigParser.ConfigParser()
 130
 131   # Get the absolute path of the config file, and read the file. If it fails,
 132   # return none.
 133   config_file_path = os.path.join(os.path.abspath(os.path.dirname(__file__)),
 134                                   file_name)
 135   config.read(config_file_path)
 136   if not config:
 137     return None
 138
 139   # Iterate through the config file, check for sections.
 140   config_dict = {}
 141   for section in config.sections():
 142     # These two do not need another layer of dictionary, so add it and go
 143     # to next section.
 144     if ':' not in section:
 145       for option in config.options(section):
 146         if section not in config_dict:
 147           config_dict[section] = {}
 148
 149         url = config.get(section, option)
 150         config_dict[section][option] = url
 151
 152       continue
 153
 154     # Get repository type and component name from the section name.
 155     repository_type_and_component = section.split(':')
 156     repository_type = repository_type_and_component[0]
 157     component_path = repository_type_and_component[1]
 158
 159     # Add 'svn' as the key, if it is not already there.
 160     if repository_type not in config_dict:
 161       config_dict[repository_type] = {}
 162     url_map_for_repository = config_dict[repository_type]
 163
 164     # Add the path to the 'svn', if it is not already there.
 165     if component_path not in url_map_for_repository:
 166       url_map_for_repository[component_path] = {}
 167     type_to_url = url_map_for_repository[component_path]
 168
 169     # Add all URLs to this map.
 170     for option in config.options(section):
 171       url = config.get(section, option)
 172       type_to_url[option] = url
 173
 174   return config_dict
 175
 176
 177 def NormalizePath(path, parsed_deps):
 178   """Normalizes the path.
 179
 180   Args:
 181     path: A string representing a path.
 182     parsed_deps: A map from component path to its component name, repository,
 183                  etc.
 184
 185   Returns:
 186     A tuple containing a component this path is in (e.g blink, skia, etc)
 187     and a path in that component's repository. Returns None if the component
 188     repository is not supported, i.e from googlecode.
 189   """
 190   # First normalize the path by retreiving the normalized path.
 191   normalized_path = os.path.normpath(path).replace('\\', '/')
 192
 193   # Iterate through all component paths in the parsed DEPS, in the decreasing
 194   # order of the length of the file path.
 195   for component_path in sorted(parsed_deps,
 196                                key=(lambda path: -len(path))):
 197     # new_component_path is the component path with 'src/' removed.
 198     new_component_path = component_path
 199     if new_component_path.startswith('src/') and new_component_path != 'src/':
 200       new_component_path = new_component_path[len('src/'):]
 201
 202     # We need to consider when the lowercased component path is in the path,
 203     # because syzyasan build returns lowercased file path.
 204     lower_component_path = new_component_path.lower()
 205
 206     # If this path is the part of file path, this file must be from this
 207     # component.
 208     if new_component_path in normalized_path or \
 209         lower_component_path in normalized_path:
 210
 211       # Case when the retreived path is in lowercase.
 212       if lower_component_path in normalized_path:
 213         current_component_path = lower_component_path
 214       else:
 215         current_component_path = new_component_path
 216
 217       # Normalize the path by stripping everything off the component's relative
 218       # path.
 219       normalized_path = normalized_path.split(current_component_path, 1)[1]
 220       lower_normalized_path = normalized_path.lower()
 221
 222       # Add 'src/' or 'Source/' at the front of the normalized path, depending
 223       # on what prefix the component path uses. For example, blink uses
 224       # 'Source' but chromium uses 'src/', and blink component path is
 225       # 'src/third_party/WebKit/Source', so add 'Source/' in front of the
 226       # normalized path.
 227       if (lower_component_path == 'src/third_party/webkit/source' and
 228           not lower_normalized_path.startswith('source/')):
 229           normalized_path = (current_component_path.split('/')[-2] + '/' +
 230                              normalized_path)
 231
 232       component_name = parsed_deps[component_path]['name']
 233
 234       return (component_path, component_name, normalized_path)
 235
 236   # If the path does not match any component, default to chromium.
 237   return ('src/', 'chromium', normalized_path)
 238
 239
 240 def SplitRange(regression):
 241   """Splits a range as retrieved from clusterfuzz.
 242
 243   Args:
 244     regression: A string in format 'r1234:r5678'.
 245
 246   Returns:
 247     A list containing two numbers represented in string, for example
 248     ['1234','5678'].
 249   """
 250   if not regression:
 251     return None
 252
 253   revisions = regression.split(':')
 254
 255   # If regression information is not available, return none.
 256   if len(revisions) != 2:
 257     return None
 258
 259   range_start = revisions[0]
 260   range_end = revisions[1]
 261
 262   # Strip 'r' off the range start/end. Not using lstrip to avoid the case when
 263   # the range is in git hash and it starts with 'r'.
 264   if range_start.startswith('r'):
 265     range_start = range_start[1:]
 266
 267   if range_end.startswith('r'):
 268     range_end = range_end[1:]
 269
 270   return [range_start, range_end]
 271
 272
 273 def LoadJSON(json_string):
 274   """Loads json object from string, or None.
 275
 276   Args:
 277     json_string: A string to get object from.
 278
 279   Returns:
 280     JSON object if the string represents a JSON object, None otherwise.
 281   """
 282   try:
 283     data = json.loads(json_string)
 284   except ValueError:
 285     data = None
 286
 287   return data
 288
 289
 290 def GetDataFromURL(url):
 291   """Retrieves raw data from URL, tries 10 times.
 292
 293   Args:
 294     url: URL to get data from.
 295     retries: Number of times to retry connection.
 296
 297   Returns:
 298     None if the data retrieval fails, or the raw data.
 299   """
 300   status_code, data = utils.GetHttpClient().Get(url, retries=10)
 301   if status_code == 200:
 302     return data
 303   else:
 304     # Return None if it fails to read data.
 305     return None
 306
 307
 308 def FindMinLineDistance(crashed_line_list, changed_line_numbers,
 309                         line_range=3):
 310   """Calculates how far the changed line is from one of the crashes.
 311
 312   Finds the minimum distance between the lines that the file crashed on
 313   and the lines that the file changed. For example, if the file crashed on
 314   line 200 and the CL changes line 203,204 and 205, the function returns 3.
 315
 316   Args:
 317     crashed_line_list: A list of lines that the file crashed on.
 318     changed_line_numbers: A list of lines that the file changed.
 319     line_range: Number of lines to look back for.
 320
 321   Returns:
 322     The minimum distance. If either of the input lists is empty,
 323     it returns inf.
 324
 325   """
 326   min_distance = INFINITY
 327   crashed_line = -1
 328   changed_line = -1
 329
 330   crashed_line_numbers = set()
 331   for crashed_line_range in crashed_line_list:
 332     for crashed_line in crashed_line_range:
 333       for line in range(crashed_line - line_range, crashed_line + 1):
 334         crashed_line_numbers.add(line)
 335
 336   for line in crashed_line_numbers:
 337     for distance in changed_line_numbers:
 338       # Find the current distance and update the min if current distance is
 339       # less than current min.
 340       current_distance = abs(line - distance)
 341       if current_distance < min_distance:
 342         min_distance = current_distance
 343         crashed_line = line
 344         changed_line = distance
 345
 346   return (min_distance, crashed_line, changed_line)
 347
 348
 349 def GuessIfSameSubPath(path1, path2):
 350   """Guesses if two paths represent same path.
 351
 352   Compares the name of the folders in the path (by split('/')), and checks
 353   if they match either more than 3 or min of path lengths.
 354
 355   Args:
 356     path1: First path.
 357     path2: Second path to compare.
 358
 359   Returns:
 360     True if it they are thought to be a same path, False otherwise.
 361   """
 362   path1 = path1.split('/')
 363   path2 = path2.split('/')
 364
 365   intersection = set(path1).intersection(set(path2))
 366   return len(intersection) >= (min(3, min(len(path1), len(path2))))
 367
 368
 369 def FindMinStackFrameNumber(stack_frame_indices, priorities):
 370   """Finds the minimum stack number, from the list of stack numbers.
 371
 372   Args:
 373     stack_frame_indices: A list of lists containing stack position.
 374     priorities: A list of of priority for each file.
 375
 376   Returns:
 377     Inf if stack_frame_indices is empty, minimum stack number otherwise.
 378   """
 379   # Get the indexes of the highest priority (or low priority number).
 380   highest_priority = min(priorities)
 381   highest_priority_indices = []
 382   for i in range(len(priorities)):
 383     if priorities[i] == highest_priority:
 384       highest_priority_indices.append(i)
 385
 386   # Gather the list of stack frame numbers for the files that change the
 387   # crash lines.
 388   flattened = []
 389   for i in highest_priority_indices:
 390     flattened += stack_frame_indices[i]
 391
 392   # If no stack frame information is available, return inf. Else, return min.
 393   if not flattened:
 394     return INFINITY
 395   else:
 396     return min(flattened)
 397
 398
 399 def AddHyperlink(text, link):
 400   """Returns a string with HTML link tag.
 401
 402   Args:
 403     text: A string to add link.
 404     link: A link to add to the string.
 405
 406   Returns:
 407     A string with hyperlink added.
 408   """
 409   sanitized_link = cgi.escape(link, quote=True)
 410   sanitized_text = cgi.escape(str(text))
 411   return '<a href="%s">%s</a>' % (sanitized_link, sanitized_text)
 412
 413
 414 def PrettifyList(items):
 415   """Returns a string representation of a list.
 416
 417   It adds comma in between the elements and removes the brackets.
 418   Args:
 419     items: A list to prettify.
 420   Returns:
 421     A string representation of the list.
 422   """
 423   return ', '.join(map(str, items))
 424
 425
 426 def PrettifyFrameInfo(frame_indices, functions):
 427   """Return a string to represent the frames with functions."""
 428   frames = []
 429   for frame_index, function in zip(frame_indices, functions):
 430     frames.append('frame #%s, "%s"' % (frame_index, function.split('(')[0]))
 431   return '; '.join(frames)
 432
 433
 434 def PrettifyFiles(file_list):
 435   """Returns a string representation of a list of file names.
 436
 437   Args:
 438     file_list: A list of tuple, (file_name, file_url).
 439   Returns:
 440     A string representation of file names with their urls.
 441   """
 442   ret = ['\n']
 443   for file_name, file_url in file_list:
 444     ret.append('      %s\n' % AddHyperlink(file_name, file_url))
 445   return ''.join(ret)
 446
 447
 448 def Intersection(crashed_line_list, stack_frame_index, changed_line_numbers,
 449                  function, line_range=3):
 450   """Finds the overlap betwee changed lines and crashed lines.
 451
 452   Finds the intersection of the lines that caused the crash and
 453   lines that the file changes. The intersection looks within 3 lines
 454   of the line that caused the crash.
 455
 456   Args:
 457     crashed_line_list: A list of lines that the file crashed on.
 458     stack_frame_index: A list of positions in stack for each of the lines.
 459     changed_line_numbers: A list of lines that the file changed.
 460     function: A list of functions that the file crashed on.
 461     line_range: Number of lines to look backwards from crashed lines.
 462
 463   Returns:
 464     line_number_intersection: Intersection between crashed_line_list and
 465                        changed_line_numbers.
 466     stack_frame_index_intersection: Stack number for each of the intersections.
 467   """
 468   line_number_intersection = []
 469   stack_frame_index_intersection = []
 470   function_intersection = []
 471
 472   # Iterate through the crashed lines, and its occurence in stack.
 473   for (lines, stack_frame_index, function_name) in zip(
 474       crashed_line_list, stack_frame_index, function):
 475     # Also check previous 'line_range' lines. Create a set of all changed lines
 476     # and lines within 3 lines range before the crashed line.
 477     line_minus_n = set()
 478     for line in lines:
 479       for line_in_range in range(line - line_range, line + 1):
 480         line_minus_n.add(line_in_range)
 481
 482     for changed_line in changed_line_numbers:
 483       # If a CL does not change crahsed line, check next line.
 484       if changed_line not in line_minus_n:
 485         continue
 486
 487       intersected_line = set()
 488       # If the changed line is exactly the crashed line, add that line.
 489       for line in lines:
 490         if line in changed_line_numbers:
 491           intersected_line.add(line)
 492
 493         # If the changed line is in 3 lines of the crashed line, add the line.
 494         else:
 495           intersected_line.add(changed_line)
 496
 497       # Avoid adding the same line twice.
 498       if intersected_line not in line_number_intersection:
 499         line_number_intersection.append(list(intersected_line))
 500         stack_frame_index_intersection.append(stack_frame_index)
 501         function_intersection.append(function_name)
 502       break
 503
 504   return (line_number_intersection, stack_frame_index_intersection,
 505           function_intersection)
 506
 507
 508 def MatchListToResultList(matches):
 509   """Convert list of matches to the list of result objects.
 510
 511   Args:
 512     matches: A list of match objects along with its stack priority and revision
 513              number/git hash
 514   Returns:
 515     A list of result object.
 516
 517   """
 518   result_list = []
 519
 520   for _, cl, match in matches:
 521     suspected_cl = cl
 522     revision_url = match.revision_url
 523     component_name = match.component_name
 524     author = match.author
 525     reason = match.reason.strip()
 526     review_url = match.review_url
 527     reviewers = match.reviewers
 528     # For matches, line content do not exist.
 529     line_content = None
 530     message = match.message
 531     time = match.time
 532
 533     result = Result(suspected_cl, revision_url, component_name, author, reason,
 534                     review_url, reviewers, line_content, message, time)
 535     result_list.append(result)
 536
 537   return result_list
 538
 539
 540 def BlameListToResultList(blame_list):
 541   """Convert blame list to the list of result objects.
 542
 543   Args:
 544     blame_list: A list of blame objects.
 545
 546   Returns:
 547     A list of result objects.
 548   """
 549   result_list = []
 550
 551   for blame in blame_list:
 552     suspected_cl = blame.revision
 553     revision_url = blame.url
 554     component_name = blame.component_name
 555     author = blame.author
 556     reason = (
 557         'The CL last changed line %s of file %s, which is stack frame %d.' %
 558         (blame.line_number, blame.file, blame.stack_frame_index))
 559     time = blame.time
 560     # Blame object does not have review url and reviewers.
 561     review_url = None
 562     reviewers = None
 563     line_content = blame.line_content
 564     message = blame.message
 565
 566     result = Result(suspected_cl, revision_url, component_name, author, reason,
 567                     review_url, reviewers, line_content, message, time)
 568     result_list.append(result)
 569
 570   return result_list