Remove old about scheme URL constants.
[chromium-blink-merge.git] / chrome / tools / webforms_aggregator.py
blob16e52732c497177166005ef0405496254773ee0f
1 #!/usr/bin/env python
2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
6 """Downloads web pages with fillable forms after parsing through a set of links.
8 Used for collecting web pages with forms. Used as a standalone script.
9 This script assumes that it's run from within the same directory in which it's
10 checked into. If this script were to be run elsewhere then the path for
11 REGISTER_PAGE_DIR needs to be changed.
13 This script assumes that third party modules are installed:
14 httplib2, lxml, pycurl.
16 Usage: webforms_aggregator.py [options] [single url or file containing urls]
18 Options:
19 -l LOG_LEVEL, --log_level LOG_LEVEL
20 LOG_LEVEL: debug, info, warning or error [default: error]
21 -h, --help show this help message and exit
22 """
24 import datetime
25 import errno
26 import logging
27 import optparse
28 import os
29 import re
30 # Needed in Linux so that PyCurl does not throw a segmentation fault.
31 import signal
32 import sys
33 import tempfile
34 import threading
35 import time
36 import urlparse
38 import httplib2
39 from lxml import html, etree
40 import pycurl
# Directory, relative to the current working directory, in which pages that
# contain a registration form (and the "not found" report) are saved.
REGISTER_PAGE_DIR = os.path.join(
    os.pardir, 'test', 'data', 'autofill', 'heuristics', 'input')
# Name of the report, written inside REGISTER_PAGE_DIR, that lists the sites
# for which no registration page was found.
NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'

# Text of the comment inserted into each saved page; %s is the page's URL.
FORM_LOCATION_COMMENT = 'Form Location: %s'
HTML_FILE_PREFIX = 'grabber-'

MAX_REDIRECTIONS = 10

# Strings in a webpage that are indicative of a registration link.
LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']

MAX_SAME_DOMAIN_URLS_NO = 30
MAX_TOTAL_URLS_PER_DOMAIN = 300
MAX_OPEN_FILES_NO = 500

# URLs are selected for downloading with the following rules from the link
# lists, giving more weight to the links that contain a link clue.
# Floor division keeps every quota an integer regardless of whether true
# division is in effect; the quotas are used as loop counts.
CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3 // 10
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3 // 10
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2 // 10
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2 // 10

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO // MAX_SAME_DOMAIN_URLS_NO + 1
class Retriever(object):
  """Downloads, parses and checks one web page for a registration form.

  There is a one to one relation between objects of this class and web pages:
  one Retriever is created for each page that is downloaded and parsed. Each
  Retriever creates its own curl object, which the crawler adds to its curl
  multi object so that the corresponding page gets downloaded.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, url, domain, cookie_file):
    """Initializes a Retriever object.

    Args:
      url: url to download page from.
      domain: only links with this domain will be retrieved.
      cookie_file: the name of a cookie file, needed for pages that use session
        cookies to change their contents.
    """
    self._url = url
    self._domain = domain
    self._html_content = ''

    # Http links without clues from LINK_CLUES.
    self._general_links = []
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Https links that do not contain any clues from LINK_CLUES.
    self._secure_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    self._cookie_file = cookie_file
    self._curl_object = None

  def __del__(self):
    """Cleans up before this object is destroyed.

    Closes the curl object that does the downloading, if one was created.
    """
    # getattr() keeps a partially constructed object from raising here.
    curl_object = getattr(self, '_curl_object', None)
    if curl_object is not None:
      curl_object.close()

  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.
    Links whose network location does not contain the domain are ignored.

    Args:
      link: the url that is inserted to the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    # Checks that the link is within the domain.
    if self._domain not in link_parsed[1]:
      return
    link_lists = (self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links)
    if any(link in known_links for known_links in link_lists):
      return
    is_secure = link_parsed[0].startswith('https')
    link_lower = link.lower()
    if any(clue in link_lower for clue in LINK_CLUES):
      target = (self._clues_secure_links if is_secure
                else self._clues_general_links)
    else:  # No clues found in the link.
      target = self._secure_links if is_secure else self._general_links
    target.append(link)

  def _HasRegistrationForm(self, body):
    """Tells whether two password inputs share the same enclosing form.

    Two password fields in one form (Password and Confirm Password) are what
    distinguishes a registration form from a login form.

    Args:
      body: the parsed BODY element of the page.

    Returns:
      True if some form contains at least two password inputs.
    """
    seen_forms = []
    for password_elem in body.iterfind('.//input[@type="password"]'):
      form_elem = password_elem.xpath('ancestor::form[1]')
      if not form_elem:
        continue
      if form_elem[0] in seen_forms:
        return True
      seen_forms.append(form_elem[0])
    return False

  def _SaveRegistrationPage(self, tree):
    """Saves the page as 'grabber-' + domain + '.html' in REGISTER_PAGE_DIR.

    The FORM_LOCATION_COMMENT is inserted as the first child of the HTML tag
    before the page is written.

    Args:
      tree: the parsed page.

    Raises:
      OSError: When REGISTER_PAGE_DIR can't be created.
      IOError: When can't write to the file.
    """
    # Several crawler threads may get here at once, so create the directory
    # without a separate existence check and tolerate "already exists".
    try:
      os.makedirs(REGISTER_PAGE_DIR)
    except OSError as e:
      if e.errno != errno.EEXIST:
        raise
    # Locate the HTML tag and insert the form location comment after it.
    html_tag = next(tree.iter('html'))
    html_tag.insert(0, etree.Comment(FORM_LOCATION_COMMENT % self._url))
    file_name = os.path.join(
        REGISTER_PAGE_DIR, '%s%s.html' % (HTML_FILE_PREFIX, self._domain))
    try:
      with open(file_name, 'w') as f:
        f.write(html.tostring(tree, pretty_print=True))
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise

  def ParseAndGetLinks(self):
    """Parses downloaded page and gets url link for non registration page.

    Checks if current page contains a registration page and if not it gets
    the url links. If it is a registration page, it saves it in a file as
    'grabber-' + domain + '.html' after it has added the FORM_LOCATION_COMMENT
    and it returns True. Otherwise it returns False.

    Returns:
      True if current page contains a registration form, and False otherwise.

    Raises:
      IOError: When can't write to the file.
    """
    if not self._domain:
      self.logger.error('Error: self._domain was not set')
      sys.exit(1)
    # Quoted absolute or scheme-relative URLs found anywhere in the page
    # source, not only in anchors.
    match_list = re.findall(
        r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1', self._html_content)
    for _, link in match_list:
      if link.startswith('//'):
        link = urlparse.urljoin(self._url, link)
      self._AddLink(link)
    try:
      tree = html.fromstring(self._html_content, parser=html.HTMLParser())
    except etree.LxmlError:
      self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
                       self._url)
      return False
    body = next(tree.iter('body'), None)
    if body is None:
      self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
                       self._url)
      return False

    if self._HasRegistrationForm(body):
      self._SaveRegistrationPage(tree)
      return True  # Registration page found.
    # Page is not a registration page, so its links must be collected.
    for link_elem in body.iter('a'):
      link = link_elem.get('href')
      if not link or '#' == link[0]:
        continue
      link = urlparse.urljoin(self._url, link)
      if not urlparse.urlparse(link)[0].startswith('http'):
        continue
      self._AddLink(link)
    return False  # Registration page not found.

  def InitRequestHead(self):
    """Initializes curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers if this is
    a valid HTML file. If it is not a valid HTML file, then we do not initiate a
    GET request, saving any unnecessary downloadings.
    """
    self._curl_object = pycurl.Curl()
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    # NOTE(review): SSLv3 is obsolete and may be refused by current TLS
    # libraries -- confirm this pin is still wanted.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    # Peer certificate verification is switched off for these downloads.
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)

  def _AppendHtmlContent(self, buff):
    """Appends a downloaded chunk |buff| to the page content."""
    self._html_content += buff

  def InitRequestGet(self):
    """Initializes curl object for a GET request.

    This is called only for valid HTML files. Pycurl makes a GET request and,
    since the data of a page does not arrive all at once, hands each downloaded
    chunk to the write callback, which appends it to the end of the page
    content until everything is downloaded.
    """
    self._curl_object.setopt(pycurl.NOBODY, False)
    self._curl_object.setopt(pycurl.WRITEFUNCTION, self._AppendHtmlContent)

  def Download(self):
    """Downloads the self._url page.

    It first does a HEAD request and then it proceeds to a GET request.
    It uses a curl object for a single download. This function is called only
    once for the initial url of a site when we still don't have more urls from a
    domain.

    Returns:
      True, if the downloaded page is valid HTML code, or False otherwise.
    """
    self.InitRequestHead()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
    # Redirections may have been followed; continue with the final URL.
    self._url = urlparse.urljoin(
        self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
    content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
    if not (content_type and 'text/html' in content_type.lower()):
      self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
      return False
    self.InitRequestGet()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
    return True

  def Run(self):
    """Called only once for the initial url when we do not have more urls.

    Downloads the originally-specified site url, parses it and gets the links.
    If no domain was given, it is derived from the downloaded url's network
    location, without its leading 'www...' label.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    if not self.Download():
      return False
    if not self._domain:
      domain = urlparse.urlparse(self._url)[1]
      labels = domain.split('.')
      # Drop a leading 'www'/'www2'/... label, but only when a dotted name
      # remains; otherwise a host such as 'wwwfoo.com' would be reduced to
      # 'com', which matches nearly every link.
      if labels[0].startswith('www') and len(labels) > 2:
        domain = '.'.join(labels[1:])
      self._domain = domain
    return bool(self.ParseAndGetLinks())
class Crawler(object):
  """Crawls a site until a registration page is found or max level is reached.

  Creates, uses and destroys Retriever objects. Creates a cookie temp file
  needed for session cookies. It keeps track of 'visited links' and
  'links to visit' of the site. To do this it uses the links discovered from
  each Retriever object. Use Run() to crawl the site.
  """
  # SIGPIPE is ignored once, when the class is defined. Platforms without
  # SIGPIPE raise AttributeError and defining the class outside the main
  # thread raises ValueError; in both cases the handler is simply not set.
  try:
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
  except (AttributeError, ImportError, ValueError):
    pass
  logger = logging.getLogger(__name__)

  def __init__(self, url, logging_level=None):
    """Init crawler URL, links lists, logger, and creates a cookie temp file.

    The cookie temp file is needed for session cookies. When |url| is not a
    valid http(s) url, self.url_error is set to True and nothing else is
    initialized.

    Args:
      url: the initial "seed" url of the site.
      logging_level: the desired verbosity level, default is None.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self.url_error = False
    url_parsed = urlparse.urlparse(url)
    if not url_parsed[0].startswith('http'):
      self.logger.error(
          'Error: "%s" does not begin with http:// or https://', url)
      self.url_error = True
      return
    # Example: if url is 'http://www.example.com?name=john' then value [1] or
    # network location is 'www.example.com'.
    if not url_parsed[1]:
      self.logger.error('Error: "%s" is not a valid url', url)
      self.url_error = True
      return
    self._url = url
    self._domain = ''
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Http links that do not contain any clue from LINK_CLUES.
    self._general_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    # Https links that do not contain any clue from LINK_CLUES.
    self._secure_links = []
    # All links downloaded and parsed so far.
    self._links_visited = []
    self._retrievers_list = []
    cookie_file = tempfile.NamedTemporaryFile(suffix='.cookie', delete=False)
    cookie_file.close()
    self._cookie_file = cookie_file.name  # Keep only the filename.

  def __del__(self):
    """Deletes cookie file when Crawler instances are destroyed."""
    # The attribute is missing when __init__ returned early on a bad url.
    if hasattr(self, '_cookie_file'):
      self.logger.info('Deleting cookie file %s ...', self._cookie_file)
      os.unlink(self._cookie_file)

  def _MultiPerform(self, curl_multi_object):
    """Performs concurrent downloads using a CurlMulti object.

    Args:
      curl_multi_object: a curl object that downloads multiple pages
        concurrently. The class of this object is |pycurl.CurlMulti|.
    """
    # Following code uses the example from section for the CurlMulti object
    # at http://pycurl.sourceforge.net/doc/curlmultiobject.html.
    while True:
      # Call perform() until it no longer asks to be called again.
      while True:
        ret, no_handles = curl_multi_object.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
          break
      if not no_handles:
        break
      # Transfers are still active: wait for activity, then perform again.
      curl_multi_object.select(1.0)

  def _GetLinksPages(self, curl_multi_object):
    """Downloads many pages concurrently using a CurlMulti Object.

    Creates many Retriever objects and adds them to a list. The constant
    MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
    concurrently from the same domain using the pycurl multi object. It's
    currently set to 30 URLs. These URLs are taken from the links lists, which
    are csl, cgl, sl, and gl (c: clues, s: secure, g: general, l: list). The
    rules define how many URLs are taken from each list during each iteration.

    Example of the rules:
      3/10 from csl results in 9 URLs
      3/10 from cgl results in 9 URLs
      2/10 from sl results in 6 URLs
      2/10 from gl results in 6 URLs

    Adding up the above URLs gives 30 URLs that can be downloaded concurrently.
    If these lists have fewer items than the defined rules, such as if a site
    does not contain any secure links, then csl and sl lists will be of 0 length
    and only 15 pages would be downloaded concurrently from the same domain.

    Since 30 URLs can be handled concurrently, the number of links taken from
    other lists can be increased. This means that we can take 24 links from the
    cgl list so that 24 from cgl + 6 from gl = 30 URLs. If the cgl list has less
    than 24 links, e.g. there are only 21 links, then 9 links may be taken
    from gl so 0 + 21 + 0 + 9 = 30.

    Pages are first requested with HEAD; only those reported as 'text/html'
    are then downloaded with GET and kept in self._retrievers_list.

    Args:
      curl_multi_object: Each Retriever object has a curl object which is
        added to the CurlMulti Object.
    """
    self._retrievers_list = []

    # Lists and their quotas in top-up priority order: csl, cgl, sl, gl.
    link_lists = [self._clues_secure_links, self._clues_general_links,
                  self._secure_links, self._general_links]
    quotas = [CLUE_SECURE_LINKS_NO, CLUE_GENERAL_LINKS_NO,
              SECURE_LINKS_NO, GENERAL_LINKS_NO]
    counts = [min(quota, len(links))
              for quota, links in zip(quotas, link_lists)]
    # If some lists have fewer items than their quota, the missing links are
    # taken from the other lists following the priority order above.
    for index, links in enumerate(link_lists):
      spare_links = MAX_SAME_DOMAIN_URLS_NO - sum(counts)
      if spare_links <= 0:
        break
      counts[index] = min(counts[index] + spare_links, len(links))
    csl_no, cgl_no, sl_no, gl_no = counts

    # The requests are created in this order: csl, sl, cgl, gl.
    for no_of_links, links in [
        (csl_no, self._clues_secure_links),
        (sl_no, self._secure_links),
        (cgl_no, self._clues_general_links),
        (gl_no, self._general_links)]:
      batch = links[:no_of_links]
      del links[:no_of_links]
      for url in batch:
        self._links_visited.append(url)
        r = Retriever(url, self._domain, self._cookie_file)
        r.InitRequestHead()
        curl_multi_object.add_handle(r._curl_object)
        self._retrievers_list.append(r)

    if not self._retrievers_list:
      return
    try:
      self._MultiPerform(curl_multi_object)
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
    finally:
      for r in self._retrievers_list:
        curl_multi_object.remove_handle(r._curl_object)

    # Keep only the retrievers whose HEAD response announced an HTML page and
    # prepare those for the GET request.
    html_retrievers = []
    for r in self._retrievers_list:
      r._url = urlparse.urljoin(
          r._url, r._curl_object.getinfo(pycurl.EFFECTIVE_URL))
      content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
      if content_type and 'text/html' in content_type.lower():
        r.InitRequestGet()
        curl_multi_object.add_handle(r._curl_object)
        html_retrievers.append(r)
      else:
        self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
    self._retrievers_list = html_retrievers

    if not self._retrievers_list:
      return
    try:
      self._MultiPerform(curl_multi_object)
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
    finally:
      for r in self._retrievers_list:
        curl_multi_object.remove_handle(r._curl_object)
        self.logger.info('Downloaded: %s', r._url)

  def _LogRegPageFound(self, retriever):
    """Display logging for registration page found.

    Args:
      retriever: The object that has retrieved the page.
    """
    separator = '\t##############################################'
    self.logger.info(separator)
    self.logger.info('\t### %s ###', retriever._domain)
    self.logger.info(separator)
    self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!')
    self.logger.info('\t%s', retriever._url)
    self.logger.info(separator)

  def _GetNewLinks(self, retriever):
    """Appends new links discovered by each retriever to the appropriate lists.

    Links are copied to the links list of the crawler object, which holds all
    the links found from all retrievers that the crawler object created. The
    Crawler object exists as far as a specific site is examined and the
    Retriever object exists as far as a page of this site is examined. A link
    is skipped if it is already queued in the same list or already visited.

    Args:
      retriever: a temporary object that downloads a specific page, parses the
        content and gets the page's href link.
    """
    list_pairs = [
        (retriever._clues_secure_links, self._clues_secure_links),
        (retriever._secure_links, self._secure_links),
        (retriever._clues_general_links, self._clues_general_links),
        (retriever._general_links, self._general_links)]
    for new_links, queued_links in list_pairs:
      for link in new_links:
        if link not in queued_links and link not in self._links_visited:
          queued_links.append(link)

  def Run(self):
    """Runs the Crawler.

    Creates a Retriever object and calls its run method to get the first links,
    and then uses CurlMulti object and creates many Retriever objects to get
    the subsequent pages.

    The number of pages (=Retriever objs) created each time is restricted by
    MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever objects download
    and parse their pages, we do the same again. The number of total pages
    visited is kept in urls_visited.
    If no registration page is found, the Crawler object will give up its try
    after MAX_TOTAL_URLS_PER_DOMAIN is reached.

    Returns:
      True is returned if registration page is found, or False otherwise.
    """
    if self.url_error:
      return False
    r = Retriever(self._url, self._domain, self._cookie_file)
    if r.Run():
      self._LogRegPageFound(r)
      return True

    self._url = r._url
    self._domain = r._domain
    self.logger.info('url to crawl: %s', self._url)
    self.logger.info('domain: %s', self._domain)
    self._links_visited.append(r._url)
    self._GetNewLinks(r)
    reg_page_found = False
    urls_visited = 1
    while (urls_visited < MAX_TOTAL_URLS_PER_DOMAIN and
           (self._clues_secure_links or self._secure_links or
            self._clues_general_links or self._general_links)):
      m = pycurl.CurlMulti()
      try:
        self._GetLinksPages(m)
        urls_visited += len(self._retrievers_list)
        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
                         self._domain, urls_visited)
        for r in self._retrievers_list:
          if r.ParseAndGetLinks():
            self._LogRegPageFound(r)
            reg_page_found = True
            break
          self.logger.info('parsed: %s', r._url)
          self._GetNewLinks(r)
      finally:
        # Close the multi object even if parsing or saving a page raised.
        m.close()
      if reg_page_found:
        break
    # Drop the references to the retrievers of the last round.
    del self._retrievers_list[:]
    return reg_page_found
class WorkerThread(threading.Thread):
  """Thread of execution that crawls a single site."""

  def __init__(self, url):
    """Stores the url to crawl and initializes the crawl result.

    The _url and page_found attributes are read after the thread's termination
    to create the file listing the urls for which a registration page wasn't
    found.

    Args:
      url: will be used as an argument to create a Crawler object later.
    """
    super(WorkerThread, self).__init__()
    self.page_found = False
    self._url = url

  def run(self):
    """Creates a Crawler object for the url and runs it.

    The crawl result is stored in page_found.

    Caution: this function must keep the name 'run' because it overrides the
    'run' method of the 'threading.Thread' class; under any other name, such
    as 'Run', it would never be called.
    """
    self.page_found = Crawler(self._url).Run()
class ThreadedCrawler(object):
  """Starts WorkerThread objects, each of which creates and runs a Crawler.

  The crawler objects run concurrently, examining one site each.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, urls_file, logging_level=None):
    """Reads the urls to crawl from a file.

    Lines that do not start with an http(s) scheme are skipped.

    Args:
      urls_file: a text file containing a URL in each line.
      logging_level: verbosity level, default is None.

    Raises:
      IOError: If the file cannot be read or contains no usable URL.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self._urls_list = []
    try:
      with open(urls_file) as f:
        for line in f:
          url = line.strip()
          if not urlparse.urlparse(url)[0].startswith('http'):
            self.logger.info(
                '%s: skipping this (does not begin with "http://")', url)
            continue
          self._urls_list.append(url)
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise
    if not self._urls_list:
      error_msg = 'No URLs were found.'
      self.logger.error('ERROR: %s', error_msg)
      raise IOError(error_msg)

  def Run(self):
    """Runs Crawler objects using python threads.

    Number of concurrent threads is restricted to MAX_ALLOWED_THREADS. When all
    threads have finished, the urls for which no registration page was found
    are written, sorted, to NOT_FOUND_REG_PAGE_SITES_FILENAME inside
    REGISTER_PAGE_DIR.

    Returns:
      The number of registration pages found. -1 if no URLs are given.

    Raises:
      OSError: When the output directory cannot be created (a directory that
        already exists is not an error).
      IOError: When the output file cannot be opened.
    """
    if not self._urls_list:
      self.logger.error('Error: no URLs were found.')
      return -1

    all_threads = []
    running = []
    for url in self._urls_list:
      self.logger.info('URL fed to a crawler thread: %s', url)
      t = WorkerThread(url)
      t.start()
      all_threads.append(t)
      running.append(t)
      # Throttle on this crawler's own live workers only, so that unrelated
      # threads in the process cannot distort the count.
      while True:
        running = [worker for worker in running if worker.is_alive()]
        if len(running) < MAX_ALLOWED_THREADS:
          break
        time.sleep(.4)
    for t in all_threads:
      t.join()
    self.logger.info('----------------')
    self.logger.info('--- FINISHED ---')
    self.logger.info('----------------')

    # The totals are computed before writing so that a write error cannot
    # truncate them.
    urls_no = len(all_threads)
    not_found_urls = sorted(t._url for t in all_threads if not t.page_found)
    urls_not_found_no = len(not_found_urls)

    not_file_name = os.path.join(
        REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
    try:
      os.makedirs(os.path.dirname(not_file_name))
    except OSError as e:
      if e.errno != errno.EEXIST:
        raise
    with open(not_file_name, 'wb') as fnot:
      try:
        for url in not_found_urls:
          fnot.write('%s' % url)
          fnot.write(os.linesep)
      except IOError as e:
        self.logger.error('Error: %s', e)
    self.logger.info('Total number of URLs given: %d\n', urls_no)
    self.logger.info(
        'Registration pages found: %d\n', (urls_no - urls_not_found_no))
    self.logger.info(
        'URLs that did not return a registration page: %d\n',
        urls_not_found_no)
    return urls_no - urls_not_found_no
def main():
  """Parses the command line, runs the crawler(s) and logs the elapsed time.

  The single positional argument is treated as a file of urls when it names an
  existing file (ThreadedCrawler is used), and as a single url otherwise
  (Crawler is used).

  Returns:
    0 on success; 1 when the log level is not recognized or when the argument
    is neither an existing file nor a valid url.
  """
  usage = 'usage: %prog [options] single_url_or_urls_filename'
  parser = optparse.OptionParser(usage)
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')

  (options, args) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ('DEBUG', 'INFO', 'WARNING', 'ERROR'):
    sys.stdout.write('Wrong log_level argument.\n')
    parser.print_help()
    return 1
  # Translate the level name into the numeric constant of the logging module.
  options.log_level = getattr(logging, options.log_level)

  if len(args) != 1:
    parser.error('Wrong number of arguments.')

  logger = logging.getLogger(__name__)
  logger.addHandler(logging.StreamHandler())
  logger.setLevel(options.log_level)

  target = args[0]
  arg_is_a_file = os.path.isfile(target)
  CrawlerClass = ThreadedCrawler if arg_is_a_file else Crawler
  exit_code = 0
  t0 = datetime.datetime.now()
  c = CrawlerClass(target, options.log_level)
  c.Run()
  # Only the single-url Crawler exposes url_error.
  if not arg_is_a_file and c.url_error:
    logger.error(
        'ERROR: "%s" is neither a valid filename nor a valid URL', target)
    # Report the invalid argument through the exit status as well.
    exit_code = 1
  t1 = datetime.datetime.now()
  delta_t = t1 - t0
  logger.info('Started at: %s\n', t0)
  logger.info('Ended at: %s\n', t1)
  logger.info('Total execution time: %s\n', delta_t)
  return exit_code
# Run the aggregator only when this file is executed as a script; the value
# returned by main() becomes the exit status of the process.
if __name__ == '__main__':
  exit_status = main()
  sys.exit(exit_status)