# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import logging
import os
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface


class GitParser(ParserInterface):
  """Parser for Git repository in googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map

  def ParseChangelog(self, component_path, range_start, range_end):
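    """Parses the changelog of a component over the given revision range.

    Args:
      component_path: Path of the component to parse the changelog for.
      range_start: Starting revision of the regression range.
      range_end: Ending revision of the regression range.

    Returns:
      A tuple (revision_map, file_to_revision_map).
    """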
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. The html url
    # is a url from which the changelog can be parsed.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      logging.error('Failed to retrieve changelog from %s', html_url)
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty maps.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      logging.error('Failed to parse changelog from %s', url)
      return (revision_map, file_to_revision_map)

    # The revision information is in the divs, from the third one to the
    # second-to-last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have the same length. If not, fall back to the JSON parser.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author

      # Retrieve and set message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li>.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_action = li.getElementsByTagName('span')[0].getAttribute('class')

        # Normalize the file action so that it is the same as the SVN parser's.
        if file_action == 'add':
          file_action = 'A'
        elif file_action == 'delete':
          file_action = 'D'
        elif file_action == 'modify':
          file_action = 'M'

        # Add the changed file to the map.
        changed_file = os.path.basename(file_path)
        if changed_file not in file_to_revision_map:
          file_to_revision_map[changed_file] = []
        file_to_revision_map[changed_file].append((githash, file_action,
                                                   file_path))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource does
    # not include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses changelog by going over the JSON file.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from a file to a git hash in which it occurs.
    """
    # Compute urls from the given range and retrieve the changelog. Stop if it
    # fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      logging.error('Failed to retrieve changelog from %s.', json_url)
      return

    # Parse the changelog from the returned object. The returned string should
    # start with the anti-XSSI prefix ")]}'\n", so start from the 6th
    # character.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      logging.error('Failed to parse changelog from %s.', json_url)
      return

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource ignores
    # the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):
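    """Parses a single revision and adds its data to the maps.

    Args:
      revision_url: The url to retrieve an individual revision from.
      githash: Git hash of the revision to parse.
      revision_map: A map from a git hash to its revision information.
      file_to_revision_map: A map from a file to a git hash in which it occurs.
    """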
    # Retrieve data from the url; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      logging.warning('Failed to retrieve revision from %s.', url)
      return

    # Load the JSON object from the string. If it fails, terminate the
    # function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      logging.warning('Failed to parse revision from %s.', url)
      return

    # Create a map representing the revision, and get the githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, message and url of this CL.
    revision['author'] = json_revision['author']['name']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_action = diff['type']

      # Normalize the file action so that it fits with svn_repository_parser.
      if file_action == 'add':
        file_action = 'A'
      elif file_action == 'delete':
        file_action = 'D'
      elif file_action == 'modify':
        file_action = 'M'

      # Add the file to the map.
      changed_file = os.path.basename(file_path)
      if changed_file not in file_to_revision_map:
        file_to_revision_map[changed_file] = []
      file_to_revision_map[changed_file].append(
          (githash, file_action, file_path))

    # Add this CL to the map.
    revision_map[githash] = revision

  def ParseLineDiff(self, path, component, file_action, githash):
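    """Parses the line diff of a file in the given revision.

    Args:
      path: Path of the changed file within the repository.
      component: Component that the file belongs to.
      file_action: Normalized file action, e.g. 'A' for an added file.
      githash: Git hash of the revision to get the diff of.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents).
    """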
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added (not modified), treat it as if it is not changed.
    if file_action == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the url; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      logging.error('Failed to get diff from %s.', url)
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned object into line diff info.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is set to -1 so that
    # we know whether the current line is inside a diff chunk.
    current_line = -1
    for line in diff:
      # If the line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])
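        # Illustrative example (not from the original source): a chunk header
        # such as '@@ -10,7 +12,9 @@' splits on '+' into '12,9 @@', then on
        # ',' into '12', so current_line starts at line 12 of the new file.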

      # If we are inside a chunk.
      elif current_line != -1:
        # If the line is either added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[2:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
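    """Parses blame (annotation) information for a line of a file.

    Args:
      component: Component that the file belongs to.
      file_path: Path of the file within the repository.
      line: Line number to get the blame information of.
      revision: Revision of the file to annotate.

    Returns:
      A tuple (content, revision, author, revision_url), or None if no blame
      region covers the line.
    """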
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON file from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      logging.error('Failed to retrieve annotation information from %s.',
                    blame_url)
      return

    # Parse the JSON object from the string. The returned string should start
    # with the anti-XSSI prefix ")]}'\n", so start from the 6th character.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      logging.error('Failed to parse annotation information from %s.',
                    blame_url)
      return

    # Go through the regions, which is a list of consecutive lines with the
    # same author and revision.
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check if the line we want the blame info of is in
      # this region.
      if start <= line and line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts

        # TODO(jeun): Add a way to get content from the JSON object.
        content = None

        return (content, revision, author, revision_url)

    # Return None if the region does not exist.
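

# The sketch below is illustrative only and is not part of the original
# module: a minimal example of constructing a GitParser, assuming hypothetical
# DEPS data and gitiles-style url parts. The real values come from the DEPS
# parser and the config that supplies url_parts_map.
if __name__ == '__main__':
  hypothetical_deps = {
      'src/': {'repository': 'https://chromium.googlesource.com/chromium/src'},
  }
  hypothetical_url_parts = {
      # Filled with (range_start, range_end).
      'changelog_url': '/+log/%s..%s',
      # Filled with a single git hash.
      'revision_url': '/+/%s',
      # Filled with (githash, file path).
      'diff_url': '/+/%s/%s',
      # Filled with (revision, file path).
      'blame_url': '/+blame/%s/%s',
  }
  parser = GitParser(hypothetical_deps, hypothetical_url_parts)
  revision_map, file_to_revision_map = parser.ParseChangelog(
      'src/', 'hash_at_range_start', 'hash_at_range_end')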