chrome/tools/webforms_aggregator.py

   1 #!/usr/bin/env python
   2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """Downloads web pages with fillable forms after parsing through a set of links.
   7
   8 Used for collecting web pages with forms. Used as a standalone script.
   9 This script assumes that it's run from within the same directory in which it's
  10 checked into. If this script were to be run elsewhere then the path for
  11 REGISTER_PAGE_DIR needs to be changed.
  12
  13 This script assumes that third party modules are installed:
  14 httplib2, lxml, pycurl.
  15
  16 Usage: webforms_aggregator.py [options] [single url or file containing urls]
  17
  18 Options:
  19   -l LOG_LEVEL, --log_level LOG_LEVEL
  20     LOG_LEVEL: debug, info, warning or error [default: error]
  21   -h, --help  show this help message and exit
  22 """
  23
  24 import datetime
  25 import errno
  26 import logging
  27 import optparse
  28 import os
  29 import re
  30 # Needed in Linux so that PyCurl does not throw a segmentation fault.
  31 import signal
  32 import sys
  33 import tempfile
  34 import threading
  35 import time
  36 import urlparse
  37
  38 import httplib2
  39 from lxml import html, etree
  40 import pycurl
  41
  42 REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
  43                                  'heuristics', 'input')
  44 NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'
  45
  46 FORM_LOCATION_COMMENT = 'Form Location: %s'
  47 HTML_FILE_PREFIX = 'grabber-'
  48
  49 MAX_REDIRECTIONS = 10
  50
  51 # Strings in a webpage that are indicative of a registration link.
  52 LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']
  53
  54 MAX_SAME_DOMAIN_URLS_NO = 30
  55 MAX_TOTAL_URLS_PER_DOMAIN = 300
  56 MAX_OPEN_FILES_NO = 500
  57
  58 # URLs are selected for downloading with the following rules from the link
  59 # lists, giving more weight to the links that contain a link clue.
  60 CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
  61 CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
  62 SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
  63 GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
  64
  65 MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
  66
  67
  68 class Retriever(object):
  69   """Download, parse, and check if the web page contains a registration form.
  70
  71   The objects of this class has a one to one relation with the web pages. For
  72   each page that is downloaded and parsed an object of this class is created.
  73   Each Retriever object creates a curl object. This object is added to the curl
  74   multi object of the crawler object so that the corresponding pages gets
  75   downloaded.
  76   """
  77   logger = logging.getLogger(__name__)
  78
  79   def __init__(self, url, domain, cookie_file):
  80     """Initializes a Retriever object.
  81
  82     Args:
  83       url: url to download page from.
  84       domain: only links with this domain will be retrieved.
  85       cookie_file: the name of a cookie file, needed for pages that use session
  86           cookies to change their contents.
  87     """
  88     self._url = url
  89     self._domain = domain
  90     self._html_content = ''
  91
  92     # Http links without clues from LINK_CLUES.
  93     self._general_links = []
  94     # Http links that contain a clue from LINK_CLUES.
  95     self._clues_general_links = []
  96     # Https links that do not contain any clues from LINK_CLUES.
  97     self._secure_links = []
  98     # Https links that contain a clue from LINK_CLUES.
  99     self._clues_secure_links = []
 100     self._cookie_file = cookie_file
 101     self._curl_object = None
 102
 103   def __del__(self):
 104     """Cleans up before this object is destroyed.
 105
 106     The function closes the corresponding curl object that does the downloading.
 107     """
 108     if self._curl_object:
 109       self._curl_object.close()
 110
 111   def _AddLink(self, link):
 112     """Adds url |link|, if not already present, to the appropriate list.
 113
 114     The link only gets added to the single list that is appopriate for it:
 115     _secure_links, _general_links, _clues_secure_links or _clues_general_links.
 116
 117     Args:
 118       link: the url that is inserted to the appropriate links list.
 119     """
 120     # Handles sites with unicode URLs.
 121     if isinstance(link, unicode):
 122       # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
 123       link = httplib2.iri2uri(link).encode('utf-8')
 124     link_parsed = urlparse.urlparse(link)
 125     link_lists = [self._clues_secure_links, self._secure_links,
 126                   self._clues_general_links, self._general_links]
 127     # Checks that the registration page is within the domain.
 128     if (self._domain in link_parsed[1] and
 129         all(link not in x for x in link_lists)):
 130       for clue in LINK_CLUES:
 131         if clue in link.lower():
 132           if link_parsed[0].startswith('https'):
 133             self._clues_secure_links.append(link)
 134             return
 135           else:
 136             self._clues_general_links.append(link)
 137             return
 138       if link_parsed[0].startswith('https'):  # No clues found in the link.
 139         self._secure_links.append(link)
 140       else:
 141         self._general_links.append(link)
 142
 143   def ParseAndGetLinks(self):
 144     """Parses downloaded page and gets url link for non registration page.
 145
 146     Checks if current page contains a registration page and if not it gets
 147     the url links. If it is a registration page, it saves it in a file as
 148     'grabber-' + domain + '.html' after it has added the FORM_LOCATION_COMMENT
 149     and it returns True. Otherwise it returns False.
 150
 151     Returns:
 152       True if current page contains a registration form, and False otherwise.
 153
 154     Raises:
 155       IOError: When can't write to the file.
 156     """
 157     if not self._domain:
 158       self.logger.error('Error: self._domain was not set')
 159       sys.exit(1)
 160     match_list = re.findall(r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1',
 161                              self._html_content)
 162     for group_list in match_list:
 163       link = group_list[1]
 164       if link.startswith('//'):
 165         link = urlparse.urljoin(self._url, link)
 166       self._AddLink(link)
 167     try:
 168       tree = html.fromstring(self._html_content, parser=html.HTMLParser())
 169     except etree.LxmlError:
 170       self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
 171                        self._url)
 172       return False
 173     try:
 174       body = tree.iter('body').next()
 175     except StopIteration:
 176       self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
 177                        self._url)
 178       return False
 179
 180     # Get a list of all input elements with attribute type='password'
 181     password_elements = list(body.iterfind('.//input[@type="password"]'))
 182     # Check for multiple password elements to distinguish between a login form
 183     # and a registration form (Password field and Confirm Password field).
 184     if password_elements and len(password_elements) >= 2:
 185       form_elements = []
 186       for password_elem in password_elements:
 187         form_elem = password_elem.xpath('ancestor::form[1]')
 188         if not form_elem:
 189           continue
 190         if not form_elem[0] in form_elements:
 191           form_elements.append(form_elem[0])
 192         else:
 193           # Confirms that the page contains a registration form if two passwords
 194           # are contained in the same form for form_elem[0].
 195           if not os.path.isdir(REGISTER_PAGE_DIR):
 196             os.makedirs(REGISTER_PAGE_DIR)
 197           # Locate the HTML tag and insert the form location comment after it.
 198           html_tag = tree.iter('html').next()
 199           comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
 200           html_tag.insert(0, comment)
 201           # Create a new file and save the HTML registration page code.
 202           f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX,
 203                                      self._domain), 'w')
 204           try:
 205             f.write(html.tostring(tree, pretty_print=True))
 206           except IOError as e:
 207             self.logger.error('Error: %s', e)
 208             raise
 209           finally:
 210             f.close()
 211           return True  # Registration page found.
 212     # Indicates page is not a registration page and links must be parsed.
 213     link_elements = list(body.iter('a'))
 214     for link_elem in link_elements:
 215       link = link_elem.get('href')
 216       if not link or '#' == link[0]:
 217         continue
 218       link = urlparse.urljoin(self._url, link)
 219       link_parsed = urlparse.urlparse(link)
 220       if not link_parsed[0].startswith('http'):
 221         continue
 222       self._AddLink(link)
 223     return False  # Registration page not found.
 224
 225   def InitRequestHead(self):
 226     """Initializes curl object for a HEAD request.
 227
 228     A HEAD request is initiated so that we can check from the headers if this is
 229     a valid HTML file. If it is not a valid HTML file, then we do not initiate a
 230     GET request, saving any unnecessary downloadings.
 231     """
 232     self._curl_object = pycurl.Curl()
 233     self._curl_object.setopt(pycurl.URL, self._url)
 234     # The following line fixes the GnuTLS package error that pycurl depends
 235     # on for getting https pages.
 236     self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
 237     self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
 238     self._curl_object.setopt(pycurl.NOBODY, True)
 239     self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False);
 240     self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
 241     self._curl_object.setopt(pycurl.FAILONERROR, False)
 242     self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
 243     self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
 244     self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
 245     self._curl_object.setopt(pycurl.TIMEOUT, 300)
 246     self._curl_object.setopt(pycurl.NOSIGNAL, 1)
 247
 248   def InitRequestGet(self):
 249     """Initializes curl object for a GET request.
 250
 251     This is called only for valid HTML files. The Pycurl makes a GET request.
 252     The page begins to download, but since not all the data of the pages comes
 253     at once. When some of the data on the page is downloaded Pycurl will put
 254     this data in the buffer. The data is appended to the end of the page until
 255     everything is downloaded.
 256     """
 257     self._curl_object.setopt(pycurl.NOBODY, False)
 258     self._curl_object.setopt(
 259         pycurl.WRITEFUNCTION, lambda buff: setattr(
 260             self, '_html_content', self._html_content + buff))
 261
 262   def Download(self):
 263     """Downloads the self._url page.
 264
 265     It first does a HEAD request and then it proceeds to a GET request.
 266     It uses a curl object for a single download. This function is called only
 267     once for the initial url of a site when we still don't have more urls from a
 268     domain.
 269
 270     Returns:
 271       True, if the downloaded page is valid HTML code, or False otherwise.
 272     """
 273     self.InitRequestHead()
 274     try:
 275       self._curl_object.perform()
 276     except pycurl.error as e:
 277       self.logger.error('Error: %s, url: %s', e, self._url)
 278       return False
 279     self._url = urlparse.urljoin(
 280         self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
 281     content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
 282     if content_type and ('text/html' in content_type.lower()):
 283       self.InitRequestGet()
 284       try:
 285         self._curl_object.perform()
 286       except pycurl.error as e:
 287         self.logger.error('Error: %s, url: %s', e, self._url)
 288         return False
 289       return True
 290     else:
 291       self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
 292       return False
 293
 294   def Run(self):
 295     """Called only once for the initial url when we do not have more urls.
 296
 297     Downloads the originally-specified site url, parses it and gets the links.
 298
 299     Returns:
 300       True, if a registration page is found, and False otherwise.
 301     """
 302     if self.Download():
 303       if not self._domain:
 304         url_parsed = urlparse.urlparse(self._url)
 305         self._domain = url_parsed[1]
 306         if self._domain.startswith('www'):
 307           self._domain = '.'.join(self._domain.split('.')[1:])
 308       if self.ParseAndGetLinks():
 309         return True
 310     return False
 311
 312
 313 class Crawler(object):
 314   """Crawls a site until a registration page is found or max level is reached.
 315
 316   Creates, uses and destroys Retriever objects. Creates a cookie temp file
 317   needed for session cookies. It keeps track of 'visited links' and
 318   'links to visit' of the site. To do this it uses the links discovered from
 319   each Retriever object. Use Run() to crawl the site.
 320   """
 321   try:
 322     signal.signal(signal.SIGPIPE, signal.SIG_IGN)
 323   except ImportError:
 324     pass
 325   logger = logging.getLogger(__name__)
 326
 327   def __init__(self, url, logging_level=None):
 328     """Init crawler URL, links lists, logger, and creates a cookie temp file.
 329
 330     The cookie temp file is needed for session cookies.
 331
 332     Args:
 333       url: the initial "seed" url of the site.
 334       logging_level: the desired verbosity level, default is None.
 335     """
 336     if logging_level:
 337       self.logger.setLevel(logging_level)
 338
 339     self.url_error = False
 340     url_parsed = urlparse.urlparse(url)
 341     if not url_parsed[0].startswith('http'):
 342       self.logger.error(
 343           'Error: "%s" does not begin with http:// or https://', url)
 344       self.url_error = True
 345       return
 346     # Example: if url is 'http://www.example.com?name=john' then value [1] or
 347     # network location is 'www.example.com'.
 348     if not url_parsed[1]:
 349       self.logger.error('Error: "%s" is not a valid url', url)
 350       self.url_error = True
 351       return
 352     self._url = url
 353     self._domain = ''
 354     # Http links that contain a clue from LINK_CLUES.
 355     self._clues_general_links = []
 356     # Http links that do not contain any clue from LINK_CLUES.
 357     self._general_links = []
 358     # Https links that contain a clue from LINK_CLUES.
 359     self._clues_secure_links = []
 360     # Https links that do not contain any clue from LINK_CLUES.
 361     self._secure_links = []
 362     # All links downloaded and parsed so far.
 363     self._links_visited = []
 364     self._retrievers_list = []
 365     self._cookie_file = tempfile.NamedTemporaryFile(
 366         suffix='.cookie', delete=False)
 367     self._cookie_file.close()
 368     self._cookie_file = self._cookie_file.name  # Keep only the filename.
 369
 370   def __del__(self):
 371     """Deletes cookie file when Crawler instances are destroyed."""
 372     if hasattr(self, '_cookie_file'):
 373       self.logger.info('Deleting cookie file %s ...', self._cookie_file)
 374       os.unlink(self._cookie_file)
 375
 376   def _MultiPerform(self, curl_multi_object):
 377     """Performs concurrent downloads using a CurlMulti object.
 378
 379     Args:
 380       curl_multi_object: a curl object that downloads multiple pages
 381           concurrently. The class of this object is |pycurl.CurlMulti|.
 382     """
 383     # Following code uses the example from section for the CurlMulti object
 384     # at http://pycurl.sourceforge.net/doc/curlmultiobject.html.
 385     while True:
 386       ret, no_handles = curl_multi_object.perform()
 387       if ret != pycurl.E_CALL_MULTI_PERFORM:
 388         break
 389     while no_handles:
 390       curl_multi_object.select(1.0)
 391       while True:
 392         ret, no_handles = curl_multi_object.perform()
 393         if ret != pycurl.E_CALL_MULTI_PERFORM:
 394           break
 395
 396   def _GetLinksPages(self, curl_multi_object):
 397     """Downloads many pages concurrently using a CurlMulti Object.
 398
 399     Creates many Retriever objects and adds them to a list. The constant
 400     MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
 401     concurrently from the same domain using the pycurl multi object. It's
 402     currently set to 30 URLs. These URLs are taken from the links lists, which
 403     are from csl, gcl, sl, and gl. The rules define how many URLs are taken from
 404     each list during each iteration.
 405
 406     Example of the rules:
 407       3/10 from csl results in 9 URLs
 408       3/10 from cgl results in 9 URLs
 409       2/10 from sl results in 6 URLs
 410       2/10 from gl results in 6 URLs
 411
 412     Adding up the above URLs gives 30 URLs that can be downloaded concurrently.
 413     If these lists have fewer items than the defined rules, such as if a site
 414     does not contain any secure links, then csl and sl lists will be of 0 length
 415     and only 15 pages would be downloaded concurrently from the same domain.
 416
 417     Since 30 URLs can be handled concurrently, the number of links taken from
 418     other lists can be increased. This means that we can take 24 links from the
 419     cgl list so that 24 from gfl + 6 from gl = 30 URLs. If the cgl list has less
 420     than 24 links, e.g. there are only 21 links, then only 9 links may be taken
 421     from gl so ) + 21 + 0 + 9 = 30.
 422
 423     Args:
 424       curl_multi_object: Each Retriever object has a curl object which is
 425           added to the CurlMulti Object.
 426     """
 427     self._retrievers_list = []
 428
 429     csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
 430     cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
 431     sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
 432     gl_no = min(GENERAL_LINKS_NO, len(self._general_links))
 433
 434     # If some links within the list have fewer items than needed, the missing
 435     # links will be taken by the following priority: csl, cgl, sl, gl.
 436     # c: clues, s: secure, g: general, l: list.
 437     spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
 438     if spare_links > 0:
 439       csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
 440       spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
 441     if spare_links > 0:
 442       cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
 443       spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
 444     if spare_links > 0:
 445       sl_no = min(sl_no + spare_links, len(self._secure_links))
 446       spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
 447     if spare_links > 0:
 448       gl_no = min(gl_no + spare_links, len(self._general_links))
 449
 450     for no_of_links, links in [
 451         (csl_no, self._clues_secure_links),
 452         (sl_no, self._secure_links),
 453         (cgl_no, self._clues_general_links),
 454         (gl_no, self._general_links)]:
 455       for i in xrange(no_of_links):
 456         if not links:
 457           break
 458         url = links.pop(0)
 459         self._links_visited.append(url)
 460         r = Retriever(url, self._domain, self._cookie_file)
 461         r.InitRequestHead()
 462         curl_multi_object.add_handle(r._curl_object)
 463         self._retrievers_list.append(r)
 464
 465     if self._retrievers_list:
 466       try:
 467         self._MultiPerform(curl_multi_object)
 468       except pycurl.error as e:
 469         self.logger.error('Error: %s, url: %s', e, self._url)
 470       finally:
 471         for r in self._retrievers_list:
 472           curl_multi_object.remove_handle(r._curl_object)
 473       # |_retrievers_list[:]| is a copy of |_retrievers_list| to avoid removing
 474       # items from the iterated list.
 475       for r in self._retrievers_list[:]:
 476         r._url = urlparse.urljoin(r._url, r._curl_object.getinfo(
 477             pycurl.EFFECTIVE_URL))
 478         content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
 479         if content_type and ('text/html' in content_type.lower()):
 480           r.InitRequestGet()
 481           curl_multi_object.add_handle(r._curl_object)
 482         else:
 483           self._retrievers_list.remove(r)
 484           self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
 485       if self._retrievers_list:
 486         try:
 487           self._MultiPerform(curl_multi_object)
 488         except pycurl.error as e:
 489           self.logger.error('Error: %s, url: %s', e, self._url)
 490         finally:
 491           for r in self._retrievers_list:
 492             curl_multi_object.remove_handle(r._curl_object)
 493             self.logger.info('Downloaded: %s', r._url)
 494
 495   def _LogRegPageFound(self, retriever):
 496     """Display logging for registration page found.
 497
 498     Args:
 499       retriever: The object that has retrieved the page.
 500     """
 501     self.logger.info('\t##############################################')
 502     self.logger.info('\t### %s ###', retriever._domain)
 503     self.logger.info('\t##############################################')
 504     self.logger.info('\t!!!!!!!!!  registration page FOUND !!!!!!!!!!!')
 505     self.logger.info('\t%s', retriever._url)
 506     self.logger.info('\t##############################################')
 507
 508   def _GetNewLinks(self, retriever):
 509     """Appends new links discovered by each retriever to the appropriate lists.
 510
 511     Links are copied to the links list of the crawler object, which holds all
 512     the links found from all retrievers that the crawler object created. The
 513     Crawler object exists as far as a specific site is examined and the
 514     Retriever object exists as far as a page of this site is examined.
 515
 516     Args:
 517       retriever: a temporary object that downloads a specific page, parses the
 518           content and gets the page's href link.
 519     """
 520     for link in retriever._clues_secure_links:
 521       if (not link in self._clues_secure_links and
 522           not link in self._links_visited):
 523         self._clues_secure_links.append(link)
 524     for link in retriever._secure_links:
 525       if (not link in self._secure_links and
 526           not link in self._links_visited):
 527         self._secure_links.append(link)
 528     for link in retriever._clues_general_links:
 529       if (not link in self._clues_general_links and
 530           not link in self._links_visited):
 531         self._clues_general_links.append(link)
 532     for link in retriever._general_links:
 533       if (not link in self._general_links and
 534           not link in self._links_visited):
 535         self._general_links.append(link)
 536
 537   def Run(self):
 538     """Runs the Crawler.
 539
 540     Creates a Retriever object and calls its run method to get the first links,
 541     and then uses CurlMulti object and creates many Retriever objects to get
 542     the subsequent pages.
 543
 544     The number of pages (=Retriever objs) created each time is restricted by
 545     MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever objects download
 546     and parse their pages, we do the same again. The number of total pages
 547     visited is kept in urls_visited.
 548     If no registration page is found, the Crawler object will give up its try
 549     after MAX_TOTAL_URLS_PER_DOMAIN is reached.
 550
 551     Returns:
 552       True is returned if registration page is found, or False otherwise.
 553     """
 554     reg_page_found = False
 555     if self.url_error:
 556       return False
 557     r = Retriever(self._url, self._domain, self._cookie_file)
 558     if r.Run():
 559       self._LogRegPageFound(r)
 560       reg_page_found = True
 561     else:
 562       self._url = r._url
 563       self._domain = r._domain
 564       self.logger.info('url to crawl: %s', self._url)
 565       self.logger.info('domain: %s', self._domain)
 566       self._links_visited.append(r._url)
 567       self._GetNewLinks(r)
 568       urls_visited = 1
 569       while True:
 570         if (not (self._clues_secure_links or self._secure_links or
 571                 self._clues_general_links or self._general_links) or
 572             urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
 573           break  # Registration page not found.
 574         m = pycurl.CurlMulti()
 575         self._GetLinksPages(m)
 576         urls_visited += len(self._retrievers_list)
 577         self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
 578                          self._domain, urls_visited)
 579         for r in self._retrievers_list:
 580           if r.ParseAndGetLinks():
 581             self._LogRegPageFound(r)
 582             reg_page_found = True
 583             break
 584           else:
 585             self.logger.info('parsed: %s', r._url)
 586             self._GetNewLinks(r)
 587         m.close()
 588         if reg_page_found:
 589           break
 590     while self._retrievers_list:
 591       r = self._retrievers_list.pop()
 592     return reg_page_found
 593
 594
 595 class WorkerThread(threading.Thread):
 596   """Creates a new thread of execution."""
 597   def __init__(self, url):
 598     """Creates _url and page_found attri to populate urls_with_no_reg_page file.
 599
 600     Used after thread's termination for the creation of a file with a list of
 601     the urls for which a registration page wasn't found.
 602
 603     Args:
 604       url: will be used as an argument to create a Crawler object later.
 605     """
 606     threading.Thread.__init__(self)
 607     self._url = url
 608     self.page_found = False
 609
 610   def run(self):
 611     """Execution of thread creates a Crawler object and runs it.
 612
 613     Caution: this function name should not be changed to 'Run' or any other
 614     names because it is overriding the 'run' method of the 'threading.Thread'
 615     class. Otherwise it will never be called.
 616     """
 617     self.page_found = Crawler(self._url).Run()
 618
 619
 620 class ThreadedCrawler(object):
 621   """Calls the Run function of WorkerThread which creates & runs a Crawler obj.
 622
 623   The crawler object runs concurrently, examining one site each.
 624   """
 625   logger = logging.getLogger(__name__)
 626
 627   def __init__(self, urls_file, logging_level=None):
 628     """Creates threaded Crawler objects.
 629
 630     Args:
 631       urls_file: a text file containing a URL in each line.
 632       logging_level: verbosity level, default is None.
 633
 634     Raises:
 635       IOError: If cannot find URLs from the list.
 636     """
 637     if logging_level:
 638       self.logger.setLevel(logging_level)
 639
 640     self._urls_list = []
 641     f = open(urls_file)
 642     try:
 643       for url in f.readlines():
 644         url = url.strip()
 645         if not urlparse.urlparse(url)[0].startswith('http'):
 646           self.logger.info(
 647               '%s: skipping this (does not begin with "http://")', url)
 648           continue
 649         self._urls_list.append(url)
 650     except IOError as e:
 651       self.logger.error('Error: %s', e)
 652       raise
 653     finally:
 654       f.close()
 655     if not self._urls_list:
 656       error_msg = 'No URLs were found.'
 657       self.logger.error('ERROR: %s', error_msg)
 658       raise IOError(error_msg)
 659
 660   def Run(self):
 661     """Runs Crawler objects using python threads.
 662
 663     Number of concurrent threads is restricted to MAX_ALLOWED_THREADS.
 664
 665     Returns:
 666       The number of registration pages found. -1 if no URLs are given.
 667
 668     Raises:
 669       OSError: When creating the same directory that already exists.
 670     """
 671     if self._urls_list:
 672       allThreads = []
 673       # originalNumThreads is the number of threads just before the
 674       # ThreadedCrawler starts creating new threads. As a standalone script it
 675       # will be 1.
 676       originalNumThreads = threading.active_count()
 677       for url in self._urls_list:
 678         self.logger.info('URL fed to a crawler thread: %s', url)
 679         t = WorkerThread(url)
 680         t.start()
 681         allThreads.append(t)
 682         while threading.active_count() >= (
 683             MAX_ALLOWED_THREADS + originalNumThreads):
 684           time.sleep(.4)
 685       while threading.active_count() > originalNumThreads:
 686         time.sleep(.4)
 687       self.logger.info('----------------')
 688       self.logger.info('--- FINISHED ---')
 689       self.logger.info('----------------')
 690       urls_no = 0
 691       urls_not_found_no = 0
 692       not_file_name = os.path.join(
 693           REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
 694       not_file_dir = os.path.dirname(not_file_name)
 695       try:
 696         os.makedirs(not_file_dir)
 697       except OSError as e:
 698         if e.errno != errno.EEXIST:
 699           raise
 700       fnot = open(not_file_name, 'wb')
 701       try:
 702         for t in sorted(allThreads, key=lambda t: t._url):
 703           urls_no += 1
 704           if not t.page_found:
 705             urls_not_found_no += 1
 706             fnot.write('%s' % t._url)
 707             fnot.write(os.linesep)
 708       except IOError as e:
 709         self.logger.error('Error: %s', e)
 710       finally:
 711         fnot.close()
 712       self.logger.info('Total number of URLs given: %d\n', urls_no)
 713       self.logger.info(
 714           'Registration pages found: %d\n', (urls_no - urls_not_found_no))
 715       self.logger.info(
 716           'URLs that did not return a registration page: %d\n',
 717           urls_not_found_no)
 718       return urls_no - urls_not_found_no
 719     else:
 720       self.logger.error('Error: no URLs were found.')
 721       return -1
 722
 723
 724 def main():
 725   usage = 'usage: %prog [options] single_url_or_urls_filename'
 726   parser = optparse.OptionParser(usage)
 727   parser.add_option(
 728       '-l', '--log_level', metavar='LOG_LEVEL', default='error',
 729       help='LOG_LEVEL: debug, info, warning or error [default: %default]')
 730
 731   (options, args) = parser.parse_args()
 732   options.log_level = options.log_level.upper()
 733   if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
 734     print 'Wrong log_level argument.'
 735     parser.print_help()
 736     return 1
 737   options.log_level = getattr(logging, options.log_level)
 738
 739   if len(args) != 1:
 740     parser.error('Wrong number of arguments.')
 741
 742   logger = logging.getLogger(__name__)
 743   if options.log_level:
 744     console = logging.StreamHandler()
 745     logger.addHandler(console)
 746     logger.setLevel(options.log_level)
 747
 748   arg_is_a_file = os.path.isfile(args[0])
 749   if arg_is_a_file:
 750     CrawlerClass = ThreadedCrawler
 751   else:
 752     CrawlerClass = Crawler
 753   t0 = datetime.datetime.now()
 754   c = CrawlerClass(args[0], options.log_level)
 755   c.Run()
 756   if not arg_is_a_file and c.url_error:
 757     logger.error(
 758         'ERROR: "%s" is neither a valid filename nor a valid URL' % args[0])
 759   t1 = datetime.datetime.now()
 760   delta_t = t1 - t0
 761   logger.info('Started at: %s\n', t0)
 762   logger.info('Ended at: %s\n', t1)
 763   logger.info('Total execution time: %s\n', delta_t)
 764   return 0
 765
 766
 767 if __name__ == "__main__":
 768   sys.exit(main())