tools/findit/git_repository_parser.py
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import logging
import os
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface


class GitParser(ParserInterface):
  """Parser for Git repository in googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
        etc.
    url_parts_map: A map from url type to its url parts. These parts are
        appended to the base url to form different urls.
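
    For illustration only, url_parts_map might look like the following
    (the keys are the ones this parser uses; the values are hypothetical,
    not taken from the real configuration):
        {'changelog_url': '/+log/%s..%s',
         'revision_url': '/+/%s',
         'diff_url': '/+/%s/%s',
         'blame_url': '/+blame/%s/%s'}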
23 """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps on failure. The html url
    # is a url from which the changelog can be parsed as html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
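    # (With the hypothetical url parts shown in the class docstring, html_url
    # would look like <repository>/+log/<range_start>..<range_end>?pretty=fuller.)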
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      logging.error('Failed to retrieve changelog from %s', html_url)
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty maps.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      logging.error('Failed to parse changelog from %s', url)
      return (revision_map, file_to_revision_map)

    # The revision information is in the divs from the third one through the
    # second-to-last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have the same length. If they do not, fall back to parsing
    # the changelog from JSON.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author

      # Retrieve and set message.
      revision['message'] = pre.firstChild.nodeValue

      # Set url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; each is in a <li>.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_action = li.getElementsByTagName('span')[0].getAttribute('class')

        # Normalize the file action so that it matches the SVN parser's.
        if file_action == 'add':
          file_action = 'A'
        elif file_action == 'delete':
          file_action = 'D'
        elif file_action == 'modify':
          file_action = 'M'

        # Add the changed file to the map.
        changed_file = os.path.basename(file_path)
        if changed_file not in file_to_revision_map:
          file_to_revision_map[changed_file] = []
        file_to_revision_map[changed_file].append((githash, file_action,
                                                   file_path))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource
    # does not include the start of the range in the changelog.
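    # (A gitiles log for A..B lists commits reachable from B but not from A,
    # so commit A itself has to be fetched separately.)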
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses changelog by going over the JSON file.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve changelog from.
      revision_url: The url to retrieve individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from file to a git hash in which it occurs.
    """
    # Compute urls from the given range and retrieve the changelog. Stop if
    # that fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      logging.error('Failed to retrieve changelog from %s.', json_url)
      return

    # Parse changelog from the returned object. The returned string should
    # start with ")]}'\n", so start from the 6th character.
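    # (This prefix is gitiles' guard against cross-site script inclusion.)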
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      logging.error('Failed to parse changelog from %s.', json_url)
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource does not
    # include it in the log.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):

    # Retrieve data from the url; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      logging.warning('Failed to retrieve revision from %s.', url)
      return

    # Load the JSON object from the string, skipping the 5-character prefix.
    # If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      logging.warning('Failed to parse revision from %s.', url)
      return

    # Create a map representing this revision and get its githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set author, message and url of this CL.
    revision['author'] = json_revision['author']['name']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_action = diff['type']
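      # (For illustration only, an entry in tree_diff might look like
      # {'new_path': 'ui/views/view.cc', 'type': 'modify', ...}; the keys are
      # the ones used here, but these values are hypothetical.)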

      # Normalize the file action so that it fits with svn_repository_parser.
      if file_action == 'add':
        file_action = 'A'
      elif file_action == 'delete':
        file_action = 'D'
      elif file_action == 'modify':
        file_action = 'M'

      # Add the file to the map.
      changed_file = os.path.basename(file_path)
      if changed_file not in file_to_revision_map:
        file_to_revision_map[changed_file] = []
      file_to_revision_map[changed_file].append(
          (githash, file_action, file_path))

    # Add this CL to the map.
    revision_map[githash] = revision

    return

  def ParseLineDiff(self, path, component, file_action, githash):
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added (not modified), treat it as if it is not changed.
    if file_action == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the url; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      logging.error('Failed to get diff from %s.', url)
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned base64-encoded object into lines of the diff.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is -1 while we are
    # not yet inside a diff chunk.
    current_line = -1
    for line in diff:
      line = line.strip()

      # If a line starts with @@, a new chunk starts; parse the starting line
      # number of the new file's side from the hunk header.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])
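        # For example, for the hunk header '@@ -10,7 +12,9 @@', the text
        # after '+' is '12,9 @@', so current_line starts at 12.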

      # If we are in a chunk.
      elif current_line != -1:
        # If the line is either added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[2:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON file from googlesource. If it fails, return
    # None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      logging.error('Failed to retrieve annotation information from %s.',
                    blame_url)
      return

    # Parse the JSON object from the string. The returned string should
    # start with ")]}'\n", so start from the 6th character.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      logging.error('Failed to parse annotation information from %s.',
                    blame_url)
      return

    # Go through the regions; each region is a run of consecutive lines with
    # the same author/revision.
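    # (For illustration only, a region might look like
    # {'start': 120, 'count': 5, 'commit': <hash>, 'author': {...}}, covering
    # lines 120 through 124; these field values are hypothetical.)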
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check whether the line we want the blame info of
      # falls within this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get content from the JSON object.
        content = None

        return (content, revision, author, revision_url)

    # Return None if no region contains the line.
    return None
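

# The block below is not part of the original module: it is a minimal usage
# sketch showing the shapes of the inputs GitParser expects. The repository
# url, url parts and hash values are hypothetical illustrations, not real
# configuration.
if __name__ == '__main__':
  example_deps = {
      'src/': {'repository': 'https://chromium.googlesource.com/chromium/src'},
  }
  example_url_parts = {
      'changelog_url': '/+log/%s..%s',
      'revision_url': '/+/%s',
      'diff_url': '/+/%s/%s',
      'blame_url': '/+blame/%s/%s',
  }
  parser = GitParser(example_deps, example_url_parts)
  # With real git hashes this would fetch and parse the changelog between the
  # two revisions; with these placeholder values it logs an error and returns
  # empty maps.
  revision_map, file_map = parser.ParseChangelog(
      'src/', 'hash_at_range_start', 'hash_at_range_end')
  print 'Parsed %d revisions' % len(revision_map)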