2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
12 # Same name as the aggregator module name.
13 import webforms_aggregator
# Reuse the logger registered under the aggregator module's name so that the
# messages it emits are visible while the tests run.
logger = logging.getLogger(webforms_aggregator.__name__)
# Removing the following statement leaves the logger at its default level
# (WARNING).
logger.setLevel(logging.INFO)
# Route the log records to the console through a stream handler.
console = logging.StreamHandler()
logger.addHandler(console)
23 class WebformsAggregatorTest(unittest
.TestCase
):
24 """Unit tests for the webforms_aggregator module."""
29 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
30 "http://www.w3.org/TR/html4/loose.dtd">
33 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
38 <p>This is a mock site. Its mere purpose is to contribute towards testing \
39 the aggregator crawler.</p>
41 <li><a href="%s">page1</a></li>
42 <li><a href="%s">page2</a></li>
43 <li><a href="%s">page3</a></li>
47 <a href="%s">sign in</a>
53 SIMPLE_PAGE_CONTENT
= """
54 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
55 "http://www.w3.org/TR/html4/loose.dtd">
58 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
65 <li><a href="%s">%s</a></li>
66 <li><a href="%s">%s</a></li>
70 <a href="%s">return to home page</a>
77 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
78 "http://www.w3.org/TR/html4/loose.dtd">
81 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
88 <label>User name: </label><input type="text"><br><br>
89 <label>password: </label><input type="password"><br><br>
90 <input type="submit" value="Sign in">
93 <p><a href="%s">return to home page</a></p>
99 <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" \
100 "http://www.w3.org/TR/html4/loose.dtd">
103 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
107 <h1>Create a user account!</h1>
109 <h3>Enter your data below:</h3>
111 <label>First name: </label><input type="text"><br><br>
112 <label>Surname: </label><input type="text"><br><br>
113 <label>User name: </label><input type="text"><br><br>
114 <label>password: </label><input type="password"><br><br>
115 <label>retype password: </label><input type="password"><br><br>
116 <input type="submit" value="Register">
119 <p><a href="%s">return to home page</a></p>
124 def CreateMockSiteOne(self
):
125 """Site One has a registration form.
127 self
.files
['site1_home'] = 'site1_index.html'
128 self
.files
['site1_page1'] = 'site1_page1.html'
129 self
.files
['site1_page2'] = 'site1_page2.html'
130 self
.files
['site1_page3'] = 'site1_page3.html'
131 self
.files
['site1_signin'] = 'site1_signin.html'
132 self
.files
['site1_reg'] = 'site1_register.html'
135 file_content
[self
.files
['site1_home']] = self
.HOME_CONTENT
% (
136 'Site One home page', 'Welcome to site one. It has a reg page!',
137 self
.files
['site1_page1'], self
.files
['site1_page2'],
138 self
.files
['site1_page3'], self
.files
['site1_signin'])
140 file_content
[self
.files
['site1_page1']] = self
.SIMPLE_PAGE_CONTENT
% (
142 'Page 1!', 'This is a useless page. It does almost nothing.',
143 self
.files
['site1_page2'], 'page 2', self
.files
['site1_page3'],
144 'page 3', self
.files
['site1_home'])
146 file_content
[self
.files
['site1_page2']] = self
.SIMPLE_PAGE_CONTENT
% (
147 'Site One page 2', 'Page 2!',
148 'This is another useless page. It does almost what the page 1 does.',
149 self
.files
['site1_page1'], 'page 1', self
.files
['site1_page3'],
150 'page 3', self
.files
['site1_home'])
152 file_content
[self
.files
['site1_page3']] = self
.SIMPLE_PAGE_CONTENT
% (
153 'Site One page 3', 'Page 3!',
154 "This is the last useless page. It doesn't do anything useful at all.",
155 self
.files
['site1_page1'], 'page 1', self
.files
['site1_page2'],
156 'page 2', self
.files
['site1_home'])
158 file_content
[self
.files
['site1_signin']] = self
.SIGNIN_CONTENT
% (
160 'If you don\'t have a user account click <a href="%s">here</a>.' \
161 % self
.files
['site1_reg'],
162 self
.files
['site1_home'])
164 file_content
[self
.files
['site1_reg']] = self
.REG_CONTENT
% (
165 'Site One signin', self
.files
['site1_home'])
167 for filename
, content
in file_content
.iteritems():
168 f
= open(filename
, 'w')
174 def CreateMockSiteTwo(self
):
175 """ Site Two has no registration page."""
177 self
.files
['site2_home'] = 'site2_index.html'
178 self
.files
['site2_page1'] = 'site2_page1.html'
179 self
.files
['site2_page2'] = 'site2_page2.html'
180 self
.files
['site2_page3'] = 'site2_page3.html'
181 self
.files
['site2_signin'] = 'site2_signin.html'
184 file_content
[self
.files
['site2_home']] = self
.HOME_CONTENT
% (
185 'Site Two home page', 'Welcome to site two. It has no reg page!',
186 self
.files
['site2_page1'], self
.files
['site2_page2'],
187 self
.files
['site2_page3'], self
.files
['site2_signin'])
189 file_content
[self
.files
['site2_page1']] = self
.SIMPLE_PAGE_CONTENT
% (
191 'Page 1!', 'This is a useless page. It does almost nothing.',
192 self
.files
['site2_page2'], 'page 2', self
.files
['site2_page3'],
193 'page 3', self
.files
['site2_home'])
195 file_content
[self
.files
['site2_page2']] = self
.SIMPLE_PAGE_CONTENT
% (
196 'Site Two page 2', 'Page 2!',
197 'This is another useless page. It does almost what the page 1 does.',
198 self
.files
['site2_page1'], 'page 1', self
.files
['site2_page3'],
199 'page 3', self
.files
['site2_home'])
201 file_content
[self
.files
['site2_page3']] = self
.SIMPLE_PAGE_CONTENT
% (
202 'Site Two page 3', 'Page 3!',
203 "This is the last useless page. It doesn't do anything useful at all.",
204 self
.files
['site2_page1'], 'page 1', self
.files
['site2_page2'],
205 'page 2', self
.files
['site2_home'])
207 file_content
[self
.files
['site2_signin']] = self
.SIGNIN_CONTENT
% (
208 'Site Two signin', 'You cannot register online with this site.',
209 self
.files
['site2_home'])
211 for filename
, content
in file_content
.iteritems():
212 f
= open(filename
, 'w')
219 self
.cwd
= os
.getcwdu()
220 self
.temp_dir
= tempfile
.mkdtemp()
221 os
.chdir(self
.temp_dir
)
225 self
.CreateMockSiteOne()
226 self
.CreateMockSiteTwo()
227 self
.files
['cookie'] = 'test.cookie'
228 self
.url1
= 'http://localhost:%s/%s' % (self
.PORT1
,
229 self
.files
['site1_home'])
230 self
.url2
= 'http://localhost:%s/%s' % (self
.PORT2
,
231 self
.files
['site2_home'])
232 self
.domain1
= 'localhost:%s' %self
.PORT1
233 self
.files
['url'] = 'urls.txt'
234 url_file_handler
= open(self
.files
['url'], 'w')
236 url_file_handler
.write('URLs to crawl:')
237 url_file_handler
.write(os
.linesep
)
238 for url
in (self
.url1
, self
.url2
):
239 url_file_handler
.write(url
)
240 url_file_handler
.write(os
.linesep
)
242 url_file_handler
.close()
244 command_line
= 'python -u -m SimpleHTTPServer %s' % self
.PORT1
245 args
= command_line
.split()
246 self
.server1
= subprocess
.Popen(
247 args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
)
248 self
.server1
.stdout
.readline() # Needed in order for the server to start up
250 command_line
= 'python -u -m SimpleHTTPServer %s' % self
.PORT2
251 args
= command_line
.split()
252 self
.server2
= subprocess
.Popen(
253 args
, stdout
=subprocess
.PIPE
, stderr
=subprocess
.STDOUT
)
254 self
.server2
.stdout
.readline() # Needed in order for the server to start up
257 self
.server1
.terminate()
258 self
.server2
.terminate()
260 for filename
in self
.files
.values():
261 if os
.path
.isfile(filename
):
264 os
.rmdir(self
.temp_dir
)
def testRetrieverDownloadsPage(self):
    """Verify that a Retriever is able to download the page at self.url1."""
    cookie_file = self.files['cookie']
    retriever = webforms_aggregator.Retriever(
        self.url1, self.domain1, cookie_file)
    # Perform the download first, then build the message, so the order of
    # evaluation matches a single inline assertTrue(...) call.
    downloaded = retriever.Download()
    failure_msg = 'Retriever could not download "%s"' % self.url1
    self.assertTrue(downloaded, msg=failure_msg)
273 def testCrawlerFindsRegPageFromUrl(self
):
274 """Verify that the crawler is able to find a reg page from the given URL."""
275 c
= webforms_aggregator
.Crawler(self
.url1
)
277 c
.Run(), msg
='Crawler could not find the reg page of "%s"' % self
.url1
)
def testCrawlerCannotFindNonExistentRegPageFromUrl(self):
    """Verify that the crawler won't find a non existent reg page.

    The crawler is started from self.url2, so the failure message reports
    that same URL rather than self.url1.
    """
    c = webforms_aggregator.Crawler(self.url2)
    # NOTE(review): the assertion call is not fully visible in this view of
    # the file; assertFalse is assumed from the failure message -- confirm.
    self.assertFalse(
        c.Run(),
        msg='Crawler found a non existent reg page of "%s"' % self.url2)
287 def testThreadedCrawlerFindsRegPageFromUrlsFile(self
):
288 """Verify the threaded crawler finds reg page from a file of URLs."""
289 c
= webforms_aggregator
.ThreadedCrawler(self
.files
['url'])
292 msg
='Threaded crawler could not find the reg page from the URLs file')
if __name__ == '__main__':
    # Collect every test method of the test case into one suite, then run
    # the suite with verbose (level 2) reporting.
    loader = unittest.TestLoader()
    suite = loader.loadTestsFromTestCase(WebformsAggregatorTest)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite)