# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface


# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each linediff row should contain 3 tds, one for the changed line number
# and two for the line contents before/after.
NUM_TDS_IN_LINEDIFF_PAGE = 3


class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Attributes:
    url_map: A map from component to the urls, where urls are for changelog,
      revision, line diff and annotation.
  """

  def __init__(self, url_map):
    self.component_to_urls_map = url_map

  def ParseChangelog(self, component, range_start, range_end):
    file_to_revision_map = {}
    revision_map = {}

    # Check if the current component is supported by reading the components
    # parsed from the config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (revision_map, file_to_revision_map)

    # Retrieve data from the url, return empty maps if it fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty maps.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return (revision_map, file_to_revision_map)

    # Iterate through the returned XML object.
    revisions = xml_revisions.getElementsByTagName('logentry')
    for revision in revisions:
      # Create a new revision object for each revision.
      revision_object = {}

      # Set author of the CL.
      revision_object['author'] = revision.getElementsByTagName(
          'author')[0].firstChild.nodeValue

      # Get the revision number from xml.
      revision_number = int(revision.getAttribute('revision'))

      # Iterate through the changed paths in the CL.
      paths = revision.getElementsByTagName('paths')

      for changed_path in paths[0].getElementsByTagName('path'):
        # Get path and file change type from the xml.
        file_path = changed_path.firstChild.nodeValue
        file_change_type = changed_path.getAttribute('action')

        if file_path.startswith('/trunk/'):
          file_path = file_path[len('/trunk/'):]

        # Add file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append(
            (revision_number, file_change_type))

      # Set commit message of the CL.
      revision_object['message'] = revision.getElementsByTagName('msg')[
          0].firstChild.nodeValue

      # Set url of the CL.
      revision_url = url_map['revision_url'] % revision_number
      revision_object['url'] = revision_url

      # Add this CL to the revision map.
      revision_map[revision_number] = revision_object

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    # If the file is added (not modified), treat it as if it is not changed.
    backup_url = url_map['revision_url'] % revision_number
    if file_change_type == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    line_diff_html = minidom.parseString(data)
    tables = line_diff_html.getElementsByTagName('table')
    # If there are not NUM_TABLES_IN_LINEDIFF_PAGE tables in the html page,
    # there should be an error in the html page.
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Diff content is in the second table. Each line of the diff content
    # is in a <tr>.
    trs = tables[1].getElementsByTagName('tr')
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunks with contents.
    filtered_trs = []
    for tr in trs:
      tr_class = tr.getAttribute('class')

      # Check for the classes of the <tr>s.
      if tr_class:
        tr_class = tr_class[prefix_len:]

        # Do not have to add the header.
        if tr_class == 'header' or tr_class == 'chunk_header':
          continue

        # If the class of tr is empty, this page does not have any change.
        if tr_class == 'empty':
          return (backup_url, changed_line_numbers, changed_line_contents)

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this line should not contain a line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in a hyperlink, ignore this line.
      try:
        line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
        left_diff_type = tds[1].getAttribute('class')[prefix_len:]
        right_diff_type = tds[2].getAttribute('class')[prefix_len:]
      except IndexError:
        continue

      # Treat the line as modified only if the left and right diff types
      # differ or both are 'change', and if the change is not a deletion.
      if (left_diff_type != right_diff_type) or (
          left_diff_type == 'change' and right_diff_type == 'change'):

        # Check if the line content is not empty.
        try:
          new_line = tds[2].firstChild.nodeValue
        except AttributeError:
          new_line = ''

        if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
          changed_line_numbers.append(int(line_num))
          changed_line_contents.append(new_line.strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if it fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    blame_html = minidom.parseString(data)

    title = blame_html.getElementsByTagName('title')
    # If the returned html page is an exception page, return None.
    if title[0].firstChild.nodeValue == 'ViewVC Exception':
      return None

    # Each of the blame results is in a <tr>.
    blame_results = blame_html.getElementsByTagName('tr')
    blame_result = blame_results[line]

    # There must be 4 <td>s for each <tr>. If not, this page is wrong.
    tds = blame_result.getElementsByTagName('td')
    if len(tds) != 4:
      return None

    # The third <td> has the line content, separated by <span>s. Combine
    # those to get a string of the changed line. If it has nothing, the line
    # is empty.
    line_content = ''
    if tds[3].hasChildNodes():
      contents = tds[3].childNodes

      for content in contents:
        # Node type 3 means it is a text node.
        if content.nodeType == minidom.Node.TEXT_NODE:
          line_content += content.nodeValue
        else:
          line_content += content.firstChild.nodeValue

      line_content = line_content.strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info.
    while not tds[1].firstChild:
      line -= 1
      blame_result = blame_results[line]
      tds = blame_result.getElementsByTagName('td')
    author = tds[1].firstChild.nodeValue

    # Revision can either be in a hyperlink or in plain text.
    try:
      revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
    except IndexError:
      revision = tds[2].firstChild.nodeValue

    (revision_info, _) = self.ParseChangelog(component, revision, revision)
    message = revision_info[int(revision)]['message']

    # Return the parsed information.
    revision_url = url_map['revision_url'] % int(revision)
    return (line_content, revision, author, revision_url, message)