Popular sites on the NTP: Favicon improvements
[chromium-blink-merge.git] / tools / findit / svn_repository_parser.py
blob64a4503d49586552da121c524dce13140ca72ac1
1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import xml.dom.minidom as minidom
6 from xml.parsers.expat import ExpatError
8 import crash_utils
9 from repository_parser_interface import ParserInterface
# Expected number of <table> elements in a src.chromium.org linediff page:
#   1. the table with the revision number,
#   2. the table with the actual diff,
#   3. the table with the dropdown menu,
#   4. the table with the legend,
#   5. a border table,
#   6. the table containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6

# Expected number of <td> cells in a row that carries linediff info: one for
# the changed line number, one for the line content before the change and
# one for the line content after it.
NUM_TDS_IN_LINEDIFF_PAGE = 3
class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Each public method looks up the url templates of the requested component
  and signals failure with an empty or None result when the component is
  unknown, when no data is retrieved, or when the retrieved page cannot be
  parsed.

  Attributes:
    component_to_urls_map: A map from component to the urls, where urls are
        for changelog, revision, line diff and annotation.
  """

  def __init__(self, url_map):
    """Stores the component-to-urls map used by all parse methods.

    Args:
      url_map: A map from component to a map of url templates. The methods
          of this class read the keys 'changelog_url', 'revision_url',
          'diff_url' and 'blame_url' from the per-component map.
    """
    self.component_to_urls_map = url_map

  @staticmethod
  def _GetNodeText(node, default=''):
    """Returns the value of the first child of node, or default.

    Args:
      node: A DOM node, or None.
      default: Value returned when node is None, has no children, or its
          first child carries no value (e.g. it is an element).

    Returns:
      The nodeValue of the first child of node, or default.
    """
    if node is None or node.firstChild is None:
      return default
    value = node.firstChild.nodeValue
    if value is None:
      return default
    return value

  def ParseChangelog(self, component, range_start, range_end):
    """Parses the changelog of a component for a range of revisions.

    Args:
      component: Key into component_to_urls_map.
      range_start: Start of the revision range, formatted into the url.
      range_end: End of the revision range, formatted into the url.

    Returns:
      A tuple (revision_map, file_to_revision_map). revision_map maps a
      revision number (int) to a dict with the keys 'author', 'message' and
      'url'. file_to_revision_map maps a changed file path, with a leading
      '/trunk/' removed, to a list of (revision number, action) tuples.
      Both maps are empty if the component is not supported, no data is
      retrieved, or the retrieved data is not well-formed XML.
    """
    file_to_revision_map = {}
    revision_map = {}
    empty_result = (revision_map, file_to_revision_map)

    # Check if the current component is supported by reading the components
    # parsed from config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return empty_result

    # Retrieve data from the url, return empty maps if it fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return empty_result

    # Parse xml out of the returned string. If it fails, return empty maps.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return empty_result

    trunk_prefix = '/trunk/'
    for revision in xml_revisions.getElementsByTagName('logentry'):
      # A log entry without a usable revision number cannot be keyed, so it
      # is skipped instead of aborting the whole changelog.
      try:
        revision_number = int(revision.getAttribute('revision'))
      except ValueError:
        continue

      # Record every path changed by this CL.
      paths = revision.getElementsByTagName('paths')
      if paths:
        for changed_path in paths[0].getElementsByTagName('path'):
          file_path = self._GetNodeText(changed_path, None)
          if not file_path:
            continue
          file_change_type = changed_path.getAttribute('action')

          if file_path.startswith(trunk_prefix):
            file_path = file_path[len(trunk_prefix):]

          file_to_revision_map.setdefault(file_path, []).append(
              (revision_number, file_change_type))

      # Author and message may be absent or empty in the log (for example a
      # CL committed with an empty message); fall back to an empty string.
      authors = revision.getElementsByTagName('author')
      messages = revision.getElementsByTagName('msg')
      revision_map[revision_number] = {
          'author': self._GetNodeText(authors[0] if authors else None),
          'message': self._GetNodeText(messages[0] if messages else None),
          'url': url_map['revision_url'] % revision_number,
      }

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    """Parses the line diff page of a file at a given revision.

    Args:
      path: Path of the file, formatted into the diff url.
      component: Key into component_to_urls_map.
      file_change_type: Change type of the file; 'A' means added.
      revision_number: Revision (int) whose diff against the previous
          revision is parsed.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents). It is
      (None, None, None) if the component is not supported. If the file was
      added, no data is retrieved, the page cannot be parsed, the page does
      not have the expected layout, or the page reports no change, url is
      the revision url and both lists are empty. Otherwise url is the diff
      url, changed_line_numbers holds the line numbers (int) of modified
      lines and changed_line_contents the stripped new content of each.
    """
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    backup_url = url_map['revision_url'] % revision_number
    no_change = (backup_url, changed_line_numbers, changed_line_contents)

    # If the file is added (not modified), treat it as if it is not changed.
    if file_change_type == 'A':
      return no_change

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return no_change

    # The page is parsed as XML; a page that is not well-formed is handled
    # the same way as a failed retrieval.
    try:
      line_diff_html = minidom.parseString(data)
    except ExpatError:
      return no_change

    # If there are not NUM_TABLES tables in the html page, there should be an
    # error in the html page.
    tables = line_diff_html.getElementsByTagName('table')
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return no_change

    # Diff content is in the second table. Each line of the diff content
    # is in <tr>. Class names are compared after dropping their first
    # len('vc_diff_') characters.
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunk with contents.
    filtered_trs = []
    for tr in tables[1].getElementsByTagName('tr'):
      tr_class = tr.getAttribute('class')[prefix_len:]

      # Do not have to add header.
      if tr_class in ('header', 'chunk_header'):
        continue

      # If the class of tr is empty, this page does not have any change.
      if tr_class == 'empty':
        return no_change

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this row does not contain a line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in a hyperlink, or is not a
      # number, ignore this line.
      anchors = tds[0].getElementsByTagName('a')
      if not anchors:
        continue
      try:
        line_num = int(self._GetNodeText(anchors[0]))
      except ValueError:
        continue

      left_diff_type = tds[1].getAttribute('class')[prefix_len:]
      right_diff_type = tds[2].getAttribute('class')[prefix_len:]

      # Treat the line as modified only if both left and right diff have
      # type 'change' or they have different change types, and if the change
      # is not a deletion.
      is_modified = (left_diff_type != right_diff_type or
                     left_diff_type == 'change')
      is_deletion = (left_diff_type == 'remove' and
                     right_diff_type == 'empty')
      if is_modified and not is_deletion:
        changed_line_numbers.append(line_num)
        # An empty cell yields an empty line content.
        changed_line_contents.append(self._GetNodeText(tds[2]).strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses the blame (annotation) page for one line of a file.

    Args:
      component: Key into component_to_urls_map.
      file_path: Path of the file, formatted into the blame url.
      line: Index (int) of the <tr> in the blame page that holds the line.
      revision: Revision at which the file is annotated, formatted into the
          blame url.

    Returns:
      A tuple (line_content, revision, author, revision_url, message) where
      revision is the text of the revision that last changed the line, or
      None if the component is not supported, no data is retrieved, the page
      cannot be parsed or does not have the expected layout, or the
      changelog of the blamed revision cannot be retrieved.
    """
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    try:
      blame_html = minidom.parseString(data)
    except ExpatError:
      return None

    # If the returned html page is an exception page, return None.
    titles = blame_html.getElementsByTagName('title')
    if titles and self._GetNodeText(titles[0]) == 'ViewVC Exception':
      return None

    # Each of the blame result is in <tr>. A negative index is rejected
    # explicitly so that it cannot wrap around to the end of the page.
    blame_results = blame_html.getElementsByTagName('tr')
    if line < 0 or line >= len(blame_results):
      return None

    # There must be 4 <td> for each <tr>. If not, this page is wrong.
    tds = blame_results[line].getElementsByTagName('td')
    if len(tds) != 4:
      return None

    # The fourth <td> has the line content, separated by <span>s. Combine
    # those to get a string of changed line. If it has nothing, the line
    # is empty.
    pieces = []
    for content in tds[3].childNodes:
      if content.nodeType == minidom.Node.TEXT_NODE:
        pieces.append(content.nodeValue)
      else:
        pieces.append(self._GetNodeText(content))
    line_content = ''.join(pieces).strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info,
    # giving up at the top of the page or at a row with unexpected layout.
    info_line = line
    while not tds[1].firstChild:
      info_line -= 1
      if info_line < 0:
        return None
      tds = blame_results[info_line].getElementsByTagName('td')
      if len(tds) != 4:
        return None
    author = tds[1].firstChild.nodeValue

    # Revision can either be in hyperlink or plain text.
    anchors = tds[2].getElementsByTagName('a')
    blamed_revision = self._GetNodeText(anchors[0] if anchors else tds[2],
                                        None)
    if blamed_revision is None:
      return None
    try:
      blamed_revision_number = int(blamed_revision)
    except ValueError:
      return None

    # The changelog lookup returns an empty map on failure; without the
    # entry of the blamed revision there is no commit message to report.
    (revision_info, _) = self.ParseChangelog(component, blamed_revision,
                                             blamed_revision)
    revision_object = revision_info.get(blamed_revision_number)
    if not revision_object:
      return None
    message = revision_object['message']

    # Return the parsed information.
    revision_url = url_map['revision_url'] % blamed_revision_number
    return (line_content, blamed_revision, author, revision_url, message)