Supervised user whitelists: Cleanup
[chromium-blink-merge.git] / tools / findit / git_repository_parser.py
blob765da99b667a25e25bb934101a3d1b1d0b58a1b9
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import base64
6 import xml.dom.minidom as minidom
7 from xml.parsers.expat import ExpatError
9 import crash_utils
10 from repository_parser_interface import ParserInterface
# Maps the name of a file action to its single-letter file change type.
FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R',
}
21 def _ConvertToFileChangeType(file_action):
22 # TODO(stgao): verify impact on code that checks the file change type.
23 return file_action[0].upper()
class GitParser(ParserInterface):
  """Parser for Git repository in googlesource.

  Attributes:
    component_to_url_map: A map from component path to its repository name,
                          regression, etc. (the parsed deps passed to the
                          constructor).
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  # The JSON responses should start with a 5-character prefix (")}]'\n"), so
  # parsing starts from the 6th character.
  _JSON_PREFIX_LENGTH = 5

  def __init__(self, parsed_deps, url_parts_map):
    """Initializes the parser.

    Args:
      parsed_deps: A map from component path to its repository information;
                   the 'repository' entry is used as the base url.
      url_parts_map: A map from url type ('changelog_url', 'revision_url',
                     'diff_url', 'blame_url') to its url part.
    """
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
    """Parses the changelog of a component for the given range.

    The html changelog is tried first. If it cannot be parsed as xml, or its
    layout is not the expected one, the changelog is parsed from JSON instead.

    Args:
      component_path: Path of the component, a key of the parsed deps.
      range_start: Starting revision of the range.
      range_end: Ending revision of the range.

    Returns:
      A tuple (revision_map, file_to_revision_map). revision_map maps a git
      hash to a dict with 'author', 'time', 'message' and 'url' keys, and
      file_to_revision_map maps a file path to a list of
      (git hash, file change type) tuples. Both maps are empty if the
      changelog cannot be retrieved.
    """
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url, return empty maps if it fails. Html url is
    # a url where the changelog can be parsed from html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing
    # from JSON objects.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revisions information are in from the third divs to the second
    # to last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set author and time.
      author_tds = trs[1].getElementsByTagName('td')
      revision['author'] = author_tds[0].firstChild.nodeValue.split('<')[0]
      revision['time'] = author_tds[1].firstChild.nodeValue

      # Retrieve and set message.
      revision['message'] = pre.firstChild.nodeValue

      # Set url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through changed files, they are in li.
      for li in ul.getElementsByTagName('li'):
        # Retrieve path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_action = li.getElementsByTagName('span')[0].getAttribute('class')

        # Normalize file action so that it is same as SVN parser.
        file_change_type = _ConvertToFileChangeType(file_action)

        # Add the changed file to the map.
        file_to_revision_map.setdefault(file_path, []).append(
            (githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start range, because googlesource does not
    # include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses changelog by going over the JSON file.

    Both maps are updated in place; nothing is returned.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve changelog from.
      revision_url: The url to retrieve individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from file to a git hash in which it occurs.
    """
    # Compute URLs from given range, and retrieve changelog. Stop if it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse changelog from the returned object, skipping the leading prefix.
    revisions = crash_utils.LoadJSON(response[self._JSON_PREFIX_LENGTH:])
    if not revisions:
      return

    # Parse individual revision in the log. A response without a 'log' entry
    # is treated as an empty log.
    for revision in revisions.get('log', []):
      self.ParseRevision(revision_url, revision['commit'], revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource ignores
    # that one.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
    """Parses a single revision from its JSON representation.

    Both maps are updated in place; nothing is returned, and the maps are left
    untouched if the revision cannot be retrieved or parsed.

    Args:
      revision_url: The url pattern to retrieve an individual revision from;
                    the git hash is substituted into it.
      githash: Git hash of the revision to parse.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from file to a list of
                            (git hash, file change type) tuples.
    """
    # Retrieve data from the URL, return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load JSON object from the string. If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[self._JSON_PREFIX_LENGTH:])
    if not json_revision:
      return

    # Use the git hash reported by the JSON object from here on.
    githash = json_revision['commit']

    # Set author, time, message and URL of this CL.
    revision = {
        'author': json_revision['author']['name'],
        'time': json_revision['author']['time'],
        'message': json_revision['message'],
        'url': url,
    }

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      if file_path == '/dev/null':
        # NOTE(review): gitiles appears to report '/dev/null' as the new path
        # of a deleted file - confirm. Key the entry by the old path so that
        # the deletion is attributed to the real file.
        file_path = diff.get('old_path', file_path)

      # Normalize file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(diff['type'])

      # Add the file to the map.
      file_to_revision_map.setdefault(file_path, []).append(
          (githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_change_type, githash):
    """Finds the lines added or modified in a file by a revision.

    Args:
      path: Path of the file.
      component: Path of the component, a key of the parsed deps.
      file_change_type: Single-letter change type of the file, e.g. 'M'.
      githash: Git hash of the revision.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents). url is the
      url of the diff, or the url of the revision if the diff is not
      retrieved. The two lists hold, for every added or modified line, its
      line number in the new file and its content without the leading '+'
      marker. They are empty if the file is added, copied or renamed, or if
      the diff cannot be retrieved.
    """
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added (not modified), treat it as if it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from URL, and if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned object to line diff info. Where b64decode returns
    # bytes that are not str, decode them so the prefix checks below work.
    diff_text = base64.b64decode(data)
    if not isinstance(diff_text, str):
      diff_text = diff_text.decode('utf-8', 'replace')

    # Iterate through the lines in diff. Current line is -1 until the first
    # chunk starts, so that the file header lines are skipped.
    current_line = -1
    for line in diff_text.splitlines():
      # Only strip trailing whitespace: the first column is the diff marker
      # (' ', '+' or '-') and must be kept to classify the line correctly.
      line = line.rstrip()

      # If line starts with @@, a new chunk starts. The header looks like
      # '@@ -a,b +c,d @@'; the ',d' part may be omitted.
      if line.startswith('@@'):
        new_range = line.split('+', 1)[1].split()[0]
        current_line = int(new_range.split(',')[0])

      # If we are in a chunk.
      elif current_line != -1:
        # '\ No newline at end of file' is a marker, not a line of the file.
        if line.startswith('\\'):
          continue

        # If line is either added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[1:])

        # Do not increment current line if the change is 'delete'.
        if not line.startswith('-'):
          current_line += 1

    # Return url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Retrieves the blame information of a line of a file.

    Args:
      component: Path of the component, a key of the parsed deps.
      file_path: Path of the file.
      line: Line number to get the blame information of.
      revision: Revision at which the file is blamed.

    Returns:
      A tuple (content, revision, author, revision_url, message, time) for
      the region that contains the line, where content is always None and
      message and time are None if the changelog of the blamed revision
      cannot be retrieved. None if the blame data cannot be retrieved or no
      region contains the line.
    """
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve blame JSON file from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return None

    # Parse JSON object from the string, skipping the leading prefix.
    annotation = crash_utils.LoadJSON(json_string[self._JSON_PREFIX_LENGTH:])
    if not annotation:
      return None

    # Go through the regions, which is a list of consecutive lines with same
    # author/revision.
    for region in annotation['regions']:
      start = region['start']
      count = region['count']

      # For each region, check if the line we want the blame info of is in
      # this region.
      if not start <= line <= start + count - 1:
        continue

      # We are in the right region, get the information from it.
      blamed_revision = region['commit']
      author = region['author']['name']
      revision_url_parts = self.url_parts_map['revision_url'] % blamed_revision
      revision_url = base_url + revision_url_parts
      # TODO(jeun): Add a way to get content from JSON object.
      content = None

      # The changelog lookup returns an empty map on failure; report missing
      # message and time as None instead of raising KeyError.
      (revision_info, _) = self.ParseChangelog(component, blamed_revision,
                                               blamed_revision)
      blamed_info = revision_info.get(blamed_revision, {})
      message = blamed_info.get('message')
      commit_time = blamed_info.get('time')
      return (content, blamed_revision, author, revision_url, message,
              commit_time)

    # Return none if the region does not exist.
    return None