# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Downloads web pages with fillable forms after parsing through a set of links.

Used for collecting web pages with forms. Used as a standalone script.
This script assumes that it's run from within the same directory in which it's
checked into. If this script were to be run elsewhere then the path for
REGISTER_PAGE_DIR needs to be changed.

This script assumes that the following third party modules are installed:
httplib2, lxml, pycurl.

Usage: webforms_aggregator.py [options] [single url or file containing urls]

Options:
  -l LOG_LEVEL, --log_level LOG_LEVEL
      LOG_LEVEL: debug, info, warning or error [default: error]
  -h, --help  show this help message and exit
"""
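
# Example invocations (illustrative; the URL and file name are placeholders):
#   python webforms_aggregator.py --log_level info http://www.example.com
#   python webforms_aggregator.py -l debug url_list.txt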

import datetime
import errno
import logging
import optparse
import os
import re
# Needed in Linux so that PyCurl does not throw a segmentation fault.
import signal
import tempfile
import threading
import time
import urlparse

import httplib2
import pycurl
from lxml import html, etree

REGISTER_PAGE_DIR = os.path.join(os.pardir, 'test', 'data', 'autofill',
                                 'heuristics', 'input')
NOT_FOUND_REG_PAGE_SITES_FILENAME = 'notFoundRegPageSites.txt'

FORM_LOCATION_COMMENT = 'Form Location: %s'
HTML_FILE_PREFIX = 'grabber-'

# Maximum number of redirections curl may follow (assumed value; used with
# pycurl.MAXREDIRS below).
MAX_REDIRECTIONS = 10

# Strings in a webpage that are indicative of a registration link.
LINK_CLUES = ['regist', 'user', 'sign', 'login', 'account']

MAX_SAME_DOMAIN_URLS_NO = 30
MAX_TOTAL_URLS_PER_DOMAIN = 300
MAX_OPEN_FILES_NO = 500

# URLs are selected for downloading with the following rules from the link
# lists, giving more weight to the links that contain a link clue.
CLUE_SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
CLUE_GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 3/10
SECURE_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10
GENERAL_LINKS_NO = MAX_SAME_DOMAIN_URLS_NO * 2/10

MAX_ALLOWED_THREADS = MAX_OPEN_FILES_NO / MAX_SAME_DOMAIN_URLS_NO + 1
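
# For illustration (not part of the original script): with Python 2 integer
# division, the rule constants above evaluate to
#   CLUE_SECURE_LINKS_NO = CLUE_GENERAL_LINKS_NO = 30 * 3 / 10 = 9
#   SECURE_LINKS_NO = GENERAL_LINKS_NO = 30 * 2 / 10 = 6
# so up to 9 + 9 + 6 + 6 = 30 URLs are fetched concurrently per domain, and
# MAX_ALLOWED_THREADS = 500 / 30 + 1 = 17 crawler threads may run at once.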


class Retriever(object):
  """Download, parse, and check if the web page contains a registration form.

  The objects of this class have a one-to-one relation with the web pages. For
  each page that is downloaded and parsed an object of this class is created.
  Each Retriever object creates a curl object. This object is added to the curl
  multi object of the crawler object so that the corresponding pages get
  downloaded.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, url, domain, cookie_file):
    """Initializes a Retriever object.

    Args:
      url: url to download page from.
      domain: only links with this domain will be retrieved.
      cookie_file: the name of a cookie file, needed for pages that use session
          cookies to change their contents.
    """
    self._url = url
    self._domain = domain
    self._html_content = ''
    # Http links without clues from LINK_CLUES.
    self._general_links = []
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Https links that do not contain any clues from LINK_CLUES.
    self._secure_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    self._cookie_file = cookie_file
    self._curl_object = None

  def __del__(self):
    """Cleans up before this object is destroyed.

    The function closes the corresponding curl object that does the downloading.
    """
    if self._curl_object:
      self._curl_object.close()

  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted to the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Checks that the registration page is within the domain.
    if (self._domain in link_parsed[1] and
        all(link not in x for x in link_lists)):
      for clue in LINK_CLUES:
        if clue in link.lower():
          if link_parsed[0].startswith('https'):
            self._clues_secure_links.append(link)
            return
          else:
            self._clues_general_links.append(link)
            return
      if link_parsed[0].startswith('https'):  # No clues found in the link.
        self._secure_links.append(link)
      else:
        self._general_links.append(link)
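
  # Illustrative examples (not from the original script): for a Retriever whose
  # _domain is 'example.com', _AddLink() files links as follows:
  #   'https://example.com/register' -> _clues_secure_links  ('regist' clue)
  #   'https://example.com/news'     -> _secure_links        (no clue)
  #   'http://example.com/login'     -> _clues_general_links ('login' clue)
  #   'http://example.com/about'     -> _general_links       (no clue)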

  def ParseAndGetLinks(self):
    """Parses the downloaded page and gets url links for a non-registration page.

    Checks if the current page contains a registration form and if not it gets
    the url links. If it is a registration page, it saves it in a file as
    'grabber-' + domain + '.html' after it has added the FORM_LOCATION_COMMENT
    and it returns True. Otherwise it returns False.

    Returns:
      True if the current page contains a registration form, and False otherwise.

    Raises:
      IOError: When the registration page cannot be written to a file.
    """
    if not self._domain:
      self.logger.error('Error: self._domain was not set')
      return False
    match_list = re.findall(r'(?P<quote>[\'\"])(?P<link>(?:https?:)?//.*?)\1',
                            self._html_content)
    for group_list in match_list:
      link = group_list[1]
      if link.startswith('//'):
        link = urlparse.urljoin(self._url, link)
      self._AddLink(link)
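    # Illustrative note (not from the original script): for markup such as
    # <a href="https://example.com/signup">, the pattern above yields the tuple
    # ('"', 'https://example.com/signup'); a protocol-relative value like
    # "//example.com/login" is first joined with self._url before being added.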
    try:
      tree = html.fromstring(self._html_content, parser=html.HTMLParser())
    except etree.LxmlError:
      self.logger.info('\t\tSkipping: not valid HTML code in this page <<< %s',
                       self._url)
      return False
    try:
      body = tree.iter('body').next()
    except StopIteration:
      self.logger.info('\t\tSkipping: no "BODY" tag in this page <<< %s',
                       self._url)
      return False
    # Get a list of all input elements with attribute type='password'.
    password_elements = list(body.iterfind('.//input[@type="password"]'))
    # Check for multiple password elements to distinguish between a login form
    # and a registration form (Password field and Confirm Password field).
    if password_elements and len(password_elements) >= 2:
      form_elements = []
      for password_elem in password_elements:
        form_elem = password_elem.xpath('ancestor::form[1]')
        if not form_elem:
          continue
        if not form_elem[0] in form_elements:
          form_elements.append(form_elem[0])
        else:
          # Confirms that the page contains a registration form if two passwords
          # are contained in the same form for form_elem[0].
          if not os.path.isdir(REGISTER_PAGE_DIR):
            os.makedirs(REGISTER_PAGE_DIR)
          # Locate the HTML tag and insert the form location comment after it.
          html_tag = tree.iter('html').next()
          comment = etree.Comment(FORM_LOCATION_COMMENT % self._url)
          html_tag.insert(0, comment)
          # Create a new file and save the HTML registration page code.
          f = open('%s/%s%s.html' % (REGISTER_PAGE_DIR, HTML_FILE_PREFIX,
                                     self._domain), 'w')
          try:
            f.write(html.tostring(tree, pretty_print=True))
          except IOError as e:
            self.logger.error('Error: %s', e)
            raise
          finally:
            f.close()
          return True  # Registration page found.
    # Indicates page is not a registration page and links must be parsed.
    link_elements = list(body.iter('a'))
    for link_elem in link_elements:
      link = link_elem.get('href')
      if not link or '#' == link[0]:
        continue
      link = urlparse.urljoin(self._url, link)
      link_parsed = urlparse.urlparse(link)
      if not link_parsed[0].startswith('http'):
        continue
      self._AddLink(link)
    return False  # Registration page not found.

  def InitRequestHead(self):
    """Initializes the curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers if this is
    a valid HTML file. If it is not a valid HTML file, then we do not initiate a
    GET request, saving any unnecessary downloading.
    """
    self._curl_object = pycurl.Curl()
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)

  def InitRequestGet(self):
    """Initializes the curl object for a GET request.

    This is called only for valid HTML files. Pycurl issues a GET request and
    the page starts to download, but not all of the page's data arrives at
    once. As each chunk of data is downloaded, pycurl hands it to the write
    callback, which appends it to the page content until everything has been
    downloaded.
    """
    self._curl_object.setopt(pycurl.NOBODY, False)
    self._curl_object.setopt(
        pycurl.WRITEFUNCTION, lambda buff: setattr(
            self, '_html_content', self._html_content + buff))

  def Download(self):
    """Downloads the self._url page.

    It first does a HEAD request and then it proceeds to a GET request.
    It uses a curl object for a single download. This function is called only
    once for the initial url of a site when we still don't have more urls from
    the site's domain.

    Returns:
      True, if the downloaded page is valid HTML code, or False otherwise.
    """
    self.InitRequestHead()
    try:
      self._curl_object.perform()
    except pycurl.error as e:
      self.logger.error('Error: %s, url: %s', e, self._url)
      return False
    self._url = urlparse.urljoin(
        self._url, self._curl_object.getinfo(pycurl.EFFECTIVE_URL))
    content_type = self._curl_object.getinfo(pycurl.CONTENT_TYPE)
    if content_type and ('text/html' in content_type.lower()):
      self.InitRequestGet()
      try:
        self._curl_object.perform()
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
        return False
      return True
    else:
      self.logger.info('\tSkipping: Not an HTML page <<< %s', self._url)
      return False

  def Run(self):
    """Called only once for the initial url when we do not have more urls.

    Downloads the originally-specified site url, parses it and gets the links.

    Returns:
      True, if a registration page is found, and False otherwise.
    """
    if not self.Download():
      return False
    url_parsed = urlparse.urlparse(self._url)
    self._domain = url_parsed[1]
    if self._domain.startswith('www'):
      self._domain = '.'.join(self._domain.split('.')[1:])
    if self.ParseAndGetLinks():
      return True
    return False


class Crawler(object):
  """Crawls a site until a registration page is found or max level is reached.

  Creates, uses and destroys Retriever objects. Creates a cookie temp file
  needed for session cookies. It keeps track of 'visited links' and
  'links to visit' of the site. To do this it uses the links discovered from
  each Retriever object. Use Run() to crawl the site.
  """
  # Ignore SIGPIPE so that writing to a broken connection does not kill the
  # process.
  signal.signal(signal.SIGPIPE, signal.SIG_IGN)

  logger = logging.getLogger(__name__)

  def __init__(self, url, logging_level=None):
    """Init crawler URL, links lists, logger, and creates a cookie temp file.

    The cookie temp file is needed for session cookies.

    Args:
      url: the initial "seed" url of the site.
      logging_level: the desired verbosity level, default is None.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self.url_error = False
    url_parsed = urlparse.urlparse(url)
    if not url_parsed[0].startswith('http'):
      self.logger.error(
          'Error: "%s" does not begin with http:// or https://', url)
      self.url_error = True
      return
    # Example: if url is 'http://www.example.com?name=john' then value [1] or
    # network location is 'www.example.com'.
    if not url_parsed[1]:
      self.logger.error('Error: "%s" is not a valid url', url)
      self.url_error = True
      return
    self._url = url
    self._domain = ''
    # Http links that contain a clue from LINK_CLUES.
    self._clues_general_links = []
    # Http links that do not contain any clue from LINK_CLUES.
    self._general_links = []
    # Https links that contain a clue from LINK_CLUES.
    self._clues_secure_links = []
    # Https links that do not contain any clue from LINK_CLUES.
    self._secure_links = []
    # All links downloaded and parsed so far.
    self._links_visited = []
    self._retrievers_list = []
    self._cookie_file = tempfile.NamedTemporaryFile(
        suffix='.cookie', delete=False)
    self._cookie_file.close()
    self._cookie_file = self._cookie_file.name  # Keep only the filename.

  def __del__(self):
    """Deletes cookie file when Crawler instances are destroyed."""
    if hasattr(self, '_cookie_file'):
      self.logger.info('Deleting cookie file %s ...', self._cookie_file)
      os.unlink(self._cookie_file)

  def _MultiPerform(self, curl_multi_object):
    """Performs concurrent downloads using a CurlMulti object.

    Args:
      curl_multi_object: a curl object that downloads multiple pages
          concurrently. The class of this object is |pycurl.CurlMulti|.
    """
    # The following code uses the example from the section for the CurlMulti
    # object at http://pycurl.sourceforge.net/doc/curlmultiobject.html.
    while True:
      ret, no_handles = curl_multi_object.perform()
      if ret != pycurl.E_CALL_MULTI_PERFORM:
        break
    while no_handles:
      curl_multi_object.select(1.0)
      while True:
        ret, no_handles = curl_multi_object.perform()
        if ret != pycurl.E_CALL_MULTI_PERFORM:
          break

  def _GetLinksPages(self, curl_multi_object):
    """Downloads many pages concurrently using a CurlMulti object.

    Creates many Retriever objects and adds them to a list. The constant
    MAX_SAME_DOMAIN_URLS_NO defines the number of pages that can be downloaded
    concurrently from the same domain using the pycurl multi object. It's
    currently set to 30 URLs. These URLs are taken from the four links lists
    (csl, cgl, sl and gl). The rules define how many URLs are taken from
    each list during each iteration.

    Example of the rules:
      3/10 from csl results in 9 URLs
      3/10 from cgl results in 9 URLs
      2/10 from sl results in 6 URLs
      2/10 from gl results in 6 URLs

    Adding up the above URLs gives 30 URLs that can be downloaded concurrently.
    If these lists have fewer items than the defined rules, such as if a site
    does not contain any secure links, then the csl and sl lists will have
    length 0 and only 15 pages would be downloaded concurrently from the same
    domain.

    Since 30 URLs can be handled concurrently, the number of links taken from
    the other lists can be increased. This means that we can take 24 links from
    the cgl list so that 24 from cgl + 6 from gl = 30 URLs. If the cgl list has
    fewer than 24 links, e.g. only 21 links, then 9 links may be taken from gl
    so that 0 + 21 + 0 + 9 = 30.

    Args:
      curl_multi_object: Each Retriever object has a curl object which is
          added to the CurlMulti object.
    """
    self._retrievers_list = []

    csl_no = min(CLUE_SECURE_LINKS_NO, len(self._clues_secure_links))
    cgl_no = min(CLUE_GENERAL_LINKS_NO, len(self._clues_general_links))
    sl_no = min(SECURE_LINKS_NO, len(self._secure_links))
    gl_no = min(GENERAL_LINKS_NO, len(self._general_links))

    # If some of the lists have fewer items than needed, the missing
    # links will be taken by the following priority: csl, cgl, sl, gl.
    # c: clues, s: secure, g: general, l: list.
    spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links:
      csl_no = min(csl_no + spare_links, len(self._clues_secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links:
      cgl_no = min(cgl_no + spare_links, len(self._clues_general_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links:
      sl_no = min(sl_no + spare_links, len(self._secure_links))
      spare_links = MAX_SAME_DOMAIN_URLS_NO - (csl_no + sl_no + cgl_no + gl_no)
    if spare_links:
      gl_no = min(gl_no + spare_links, len(self._general_links))

    for no_of_links, links in [
        (csl_no, self._clues_secure_links),
        (sl_no, self._secure_links),
        (cgl_no, self._clues_general_links),
        (gl_no, self._general_links)]:
      for i in xrange(no_of_links):
        if not links:
          break
        url = links.pop(0)
        self._links_visited.append(url)
        r = Retriever(url, self._domain, self._cookie_file)
        r.InitRequestHead()
        curl_multi_object.add_handle(r._curl_object)
        self._retrievers_list.append(r)

    if self._retrievers_list:
      try:
        self._MultiPerform(curl_multi_object)
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
      finally:
        for r in self._retrievers_list:
          curl_multi_object.remove_handle(r._curl_object)
    # |_retrievers_list[:]| is a copy of |_retrievers_list| to avoid removing
    # items from the iterated list.
    for r in self._retrievers_list[:]:
      r._url = urlparse.urljoin(r._url, r._curl_object.getinfo(
          pycurl.EFFECTIVE_URL))
      content_type = r._curl_object.getinfo(pycurl.CONTENT_TYPE)
      if content_type and ('text/html' in content_type.lower()):
        r.InitRequestGet()
        curl_multi_object.add_handle(r._curl_object)
      else:
        self._retrievers_list.remove(r)
        self.logger.info('\tSkipping: Not an HTML page <<< %s', r._url)
    if self._retrievers_list:
      try:
        self._MultiPerform(curl_multi_object)
      except pycurl.error as e:
        self.logger.error('Error: %s, url: %s', e, self._url)
      finally:
        for r in self._retrievers_list:
          curl_multi_object.remove_handle(r._curl_object)
          self.logger.info('Downloaded: %s', r._url)

  def _LogRegPageFound(self, retriever):
    """Displays logging for a registration page that was found.

    Args:
      retriever: The Retriever object that has retrieved the page.
    """
    self.logger.info('\t##############################################')
    self.logger.info('\t### %s ###', retriever._domain)
    self.logger.info('\t##############################################')
    self.logger.info('\t!!!!!!!!! registration page FOUND !!!!!!!!!!!')
    self.logger.info('\t%s', retriever._url)
    self.logger.info('\t##############################################')

  def _GetNewLinks(self, retriever):
    """Appends new links discovered by each retriever to the appropriate lists.

    Links are copied to the links lists of the crawler object, which hold all
    the links found from all retrievers that the crawler object created. The
    Crawler object exists as long as a specific site is examined and the
    Retriever object exists as long as a single page of this site is examined.

    Args:
      retriever: a temporary object that downloads a specific page, parses the
          content and gets the page's href links.
    """
    for link in retriever._clues_secure_links:
      if (not link in self._clues_secure_links and
          not link in self._links_visited):
        self._clues_secure_links.append(link)
    for link in retriever._secure_links:
      if (not link in self._secure_links and
          not link in self._links_visited):
        self._secure_links.append(link)
    for link in retriever._clues_general_links:
      if (not link in self._clues_general_links and
          not link in self._links_visited):
        self._clues_general_links.append(link)
    for link in retriever._general_links:
      if (not link in self._general_links and
          not link in self._links_visited):
        self._general_links.append(link)

  def Run(self):
    """Crawls the site and checks if a registration page can be found.

    Creates a Retriever object and calls its Run method to get the first links,
    and then uses a CurlMulti object and creates many Retriever objects to get
    the subsequent pages.

    The number of pages (= Retriever objects) created each time is restricted
    by MAX_SAME_DOMAIN_URLS_NO. After this number of Retriever objects download
    and parse their pages, we do the same again. The number of total pages
    visited is kept in urls_visited.
    If no registration page is found, the Crawler object will give up its try
    after MAX_TOTAL_URLS_PER_DOMAIN is reached.

    Returns:
      True is returned if a registration page is found, or False otherwise.
    """
    reg_page_found = False
    if self.url_error:
      return reg_page_found
    r = Retriever(self._url, self._domain, self._cookie_file)
    if r.Run():
      self._LogRegPageFound(r)
      reg_page_found = True
    else:
      self._url = r._url
      self._domain = r._domain
      self.logger.info('url to crawl: %s', self._url)
      self.logger.info('domain: %s', self._domain)
      self._links_visited.append(r._url)
      self._GetNewLinks(r)
      urls_visited = 1
      while True:
        if (not (self._clues_secure_links or self._secure_links or
                 self._clues_general_links or self._general_links) or
            urls_visited >= MAX_TOTAL_URLS_PER_DOMAIN):
          break  # Registration page not found.
        m = pycurl.CurlMulti()
        self._GetLinksPages(m)
        urls_visited += len(self._retrievers_list)
        self.logger.info('\t<----- URLs visited for domain "%s": %d ----->',
                         self._domain, urls_visited)
        for r in self._retrievers_list:
          if r.ParseAndGetLinks():
            self._LogRegPageFound(r)
            reg_page_found = True
            break
          else:
            self.logger.info('parsed: %s', r._url)
            self._GetNewLinks(r)
        if reg_page_found:
          break
    # Release the Retriever objects (and their curl handles) that are no
    # longer needed.
    while self._retrievers_list:
      r = self._retrievers_list.pop()
    return reg_page_found
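
  # Illustrative usage (not from the original script):
  #   crawler = Crawler('http://www.example.com', logging.INFO)
  #   if crawler.Run():
  #     print 'registration page saved under %s' % REGISTER_PAGE_DIR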


class WorkerThread(threading.Thread):
  """Creates a new thread of execution."""

  def __init__(self, url):
    """Creates _url and page_found attributes to populate the urls_with_no_reg_page file.

    Used after the thread's termination for the creation of a file with a list
    of the urls for which a registration page wasn't found.

    Args:
      url: will be used as an argument to create a Crawler object later.
    """
    threading.Thread.__init__(self)
    self._url = url
    self.page_found = False

  def run(self):
    """Execution of the thread creates a Crawler object and runs it.

    Caution: this function name should not be changed to 'Run' or any other
    name, because it overrides the 'run' method of the 'threading.Thread'
    class. Otherwise it will never be called.
    """
    self.page_found = Crawler(self._url).Run()


class ThreadedCrawler(object):
  """Calls the Run function of WorkerThread which creates & runs a Crawler obj.

  The Crawler objects run concurrently, each examining one site.
  """
  logger = logging.getLogger(__name__)

  def __init__(self, urls_file, logging_level=None):
    """Creates threaded Crawler objects.

    Args:
      urls_file: a text file containing a URL in each line.
      logging_level: verbosity level, default is None.

    Raises:
      IOError: If no URLs can be found in the given file.
    """
    if logging_level:
      self.logger.setLevel(logging_level)

    self._urls_list = []
    try:
      f = open(urls_file)
      for url in f.readlines():
        url = url.strip()
        if not urlparse.urlparse(url)[0].startswith('http'):
          self.logger.info(
              '%s: skipping this (does not begin with "http://")', url)
          continue
        self._urls_list.append(url)
      f.close()
    except IOError as e:
      self.logger.error('Error: %s', e)
      raise
    if not self._urls_list:
      error_msg = 'No URLs were found.'
      self.logger.error('ERROR: %s', error_msg)
      raise IOError(error_msg)

  def Run(self):
    """Runs Crawler objects using python threads.

    The number of concurrent threads is restricted to MAX_ALLOWED_THREADS.

    Returns:
      The number of registration pages found. -1 if no URLs are given.

    Raises:
      OSError: If the output directory cannot be created.
    """
    if self._urls_list:
      allThreads = []
      # originalNumThreads is the number of threads just before the
      # ThreadedCrawler starts creating new threads. As a standalone script it
      # will normally be 1 (the main thread).
      originalNumThreads = threading.active_count()
      for url in self._urls_list:
        self.logger.info('URL fed to a crawler thread: %s', url)
        t = WorkerThread(url)
        t.start()
        allThreads.append(t)
        # Wait for a thread slot to free up before starting another crawler.
        while threading.active_count() >= (
            MAX_ALLOWED_THREADS + originalNumThreads):
          time.sleep(1)
      # Wait for all crawler threads to finish.
      while threading.active_count() > originalNumThreads:
        time.sleep(1)
      self.logger.info('----------------')
      self.logger.info('--- FINISHED ---')
      self.logger.info('----------------')
      urls_no = len(self._urls_list)
      urls_not_found_no = 0
      not_file_name = os.path.join(
          REGISTER_PAGE_DIR, NOT_FOUND_REG_PAGE_SITES_FILENAME)
      not_file_dir = os.path.dirname(not_file_name)
      try:
        os.makedirs(not_file_dir)
      except OSError as e:
        if e.errno != errno.EEXIST:
          raise
      fnot = open(not_file_name, 'wb')
      try:
        for t in sorted(allThreads, key=lambda t: t._url):
          if not t.page_found:
            urls_not_found_no += 1
            fnot.write('%s' % t._url)
            fnot.write(os.linesep)
      except IOError as e:
        self.logger.error('Error: %s', e)
      finally:
        fnot.close()
      self.logger.info('Total number of URLs given: %d\n', urls_no)
      self.logger.info(
          'Registration pages found: %d\n', (urls_no - urls_not_found_no))
      self.logger.info(
          'URLs that did not return a registration page: %d\n',
          urls_not_found_no)
      return urls_no - urls_not_found_no
    else:
      self.logger.error('Error: no URLs were found.')
      return -1


def main():
  usage = 'usage: %prog [options] single_url_or_urls_filename'
  parser = optparse.OptionParser(usage)
  parser.add_option(
      '-l', '--log_level', metavar='LOG_LEVEL', default='error',
      help='LOG_LEVEL: debug, info, warning or error [default: %default]')

  (options, args) = parser.parse_args()
  options.log_level = options.log_level.upper()
  if options.log_level not in ['DEBUG', 'INFO', 'WARNING', 'ERROR']:
    print 'Wrong log_level argument.'
    parser.print_help()
    return
  options.log_level = getattr(logging, options.log_level)

  if len(args) != 1:
    parser.error('Wrong number of arguments.')

  logger = logging.getLogger(__name__)
  if options.log_level:
    console = logging.StreamHandler()
    logger.addHandler(console)
    logger.setLevel(options.log_level)

  arg_is_a_file = os.path.isfile(args[0])
  if arg_is_a_file:
    CrawlerClass = ThreadedCrawler
  else:
    CrawlerClass = Crawler
  t0 = datetime.datetime.now()
  c = CrawlerClass(args[0], options.log_level)
  c.Run()
  if not arg_is_a_file and c.url_error:
    parser.error(
        'ERROR: "%s" is neither a valid filename nor a valid URL' % args[0])
  t1 = datetime.datetime.now()
  delta_t = t1 - t0
  logger.info('Started at: %s\n', t0)
  logger.info('Ended at: %s\n', t1)
  logger.info('Total execution time: %s\n', delta_t)


if __name__ == "__main__":
  main()