# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import logging
import os
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface


class GitParser(ParserInterface):
  """Parser for Git repository in googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
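    """Parses the changelog of a component over the given revision range.

    Args:
      component_path: Path of the component to parse the changelog for.
      range_start: Starting revision of the regression range.
      range_end: Ending revision of the regression range.

    Returns:
      A tuple (revision_map, file_to_revision_map).
    """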
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. The html url
    # is a url from which the changelog can be parsed.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      logging.error('Failed to retrieve changelog from %s', html_url)
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty maps.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      logging.error('Failed to parse changelog from %s', url)
      return (revision_map, file_to_revision_map)

    # The revision information is in the divs, from the third one to the
    # second-to-last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have the same length. If not, fall back to the JSON parser.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author

      # Retrieve and set message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li>.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_action = li.getElementsByTagName('span')[0].getAttribute('class')

        # Normalize the file action so that it is the same as the SVN parser's.
        if file_action == 'add':
          file_action = 'A'
        elif file_action == 'delete':
          file_action = 'D'
        elif file_action == 'modify':
          file_action = 'M'

        # Add the changed file to the map.
        changed_file = os.path.basename(file_path)
        if changed_file not in file_to_revision_map:
          file_to_revision_map[changed_file] = []
        file_to_revision_map[changed_file].append((githash, file_action,
                                                   file_path))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource does
    # not include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses changelog by going over the JSON file.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from a file to a git hash in which it occurs.
    """
    # Compute urls from the given range and retrieve the changelog. Stop if it
    # fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      logging.error('Failed to retrieve changelog from %s.', json_url)
      return

    # Parse the changelog from the returned object. The returned string should
    # start with the anti-XSSI prefix ")]}'\n", so start from the 6th
    # character.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      logging.error('Failed to parse changelog from %s.', json_url)
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource ignores
    # the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
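    """Parses a single revision and adds its data to the maps.

    Args:
      revision_url: The url to retrieve an individual revision from.
      githash: Git hash of the revision to parse.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file to a git hash in which it occurs.
    """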
    # Retrieve data from the url; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      logging.warning('Failed to retrieve revision from %s.', url)
      return

    # Load the JSON object from the string. If it fails, terminate the
    # function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      logging.warning('Failed to parse revision from %s.', url)
      return

    # Create a map representing the revision, and get the githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, message and url of this CL.
    revision['author'] = json_revision['author']['name']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_action = diff['type']

      # Normalize the file action so that it fits with svn_repository_parser.
      if file_action == 'add':
        file_action = 'A'
      elif file_action == 'delete':
        file_action = 'D'
      elif file_action == 'modify':
        file_action = 'M'

      # Add the file to the map.
      changed_file = os.path.basename(file_path)
      if changed_file not in file_to_revision_map:
        file_to_revision_map[changed_file] = []
      file_to_revision_map[changed_file].append(
          (githash, file_action, file_path))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_action, githash):
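    """Parses the line diff of a file in the given revision.

    Args:
      path: Path of the changed file within the repository.
      component: Component that the file belongs to.
      file_action: Normalized file action, e.g. 'A' for an added file.
      githash: Git hash of the revision to get the diff of.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents).
    """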
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added (not modified), treat it as if it is not changed.
    if file_action == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the url; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      logging.error('Failed to get diff from %s.', url)
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned object into line diff info.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is set to -1 so that
    # we know whether the current line is inside a diff chunk.
    current_line = -1
    for line in diff:
      # If the line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])
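        # Illustrative example (not from the original source): a chunk header
        # such as '@@ -10,7 +12,9 @@' splits on '+' into '12,9 @@', then on
        # ',' into '12', so current_line starts at line 12 of the new file.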

      # If we are inside a chunk.
      elif current_line != -1:
        # If the line is either added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[2:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
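    """Parses blame (annotation) information for a line of a file.

    Args:
      component: Component that the file belongs to.
      file_path: Path of the file within the repository.
      line: Line number to get the blame information of.
      revision: Revision of the file to annotate.

    Returns:
      A tuple (content, revision, author, revision_url), or None if no blame
      region covers the line.
    """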
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON file from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      logging.error('Failed to retrieve annotation information from %s.',
                    blame_url)
      return

    # Parse the JSON object from the string. The returned string should start
    # with the anti-XSSI prefix ")]}'\n", so start from the 6th character.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      logging.error('Failed to parse annotation information from %s.',
                    blame_url)
      return

    # Go through the regions, which is a list of consecutive lines with the
    # same author and revision.
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check if the line we want the blame info of is in
      # this region.
      if start <= line and line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts

        # TODO(jeun): Add a way to get content from the JSON object.
        content = None

        return (content, revision, author, revision_url)

    # Return None if the region does not exist.
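

# The sketch below is illustrative only and is not part of the original
# module: a minimal example of constructing a GitParser, assuming hypothetical
# DEPS data and gitiles-style url parts. The real values come from the DEPS
# parser and the config that supplies url_parts_map.
if __name__ == '__main__':
  hypothetical_deps = {
      'src/': {'repository': 'https://chromium.googlesource.com/chromium/src'},
  }
  hypothetical_url_parts = {
      # Filled with (range_start, range_end).
      'changelog_url': '/+log/%s..%s',
      # Filled with a single git hash.
      'revision_url': '/+/%s',
      # Filled with (githash, file path).
      'diff_url': '/+/%s/%s',
      # Filled with (revision, file path).
      'blame_url': '/+blame/%s/%s',
  }
  parser = GitParser(hypothetical_deps, hypothetical_url_parts)
  revision_map, file_to_revision_map = parser.ParseChangelog(
      'src/', 'hash_at_range_start', 'hash_at_range_end')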