# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
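
For example, to check such a local tree directly through the file
system (this path is the default and may differ on your machine):

    webchecker.py file:/usr/local/etc/httpd/htdocs/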

When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file, and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
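
For example, an interrupted run can be resumed, or its reports
reprinted, like this (example.com is a placeholder):

    webchecker.py http://www.example.com/
    webchecker.py -R
    webchecker.py -Rq

If the first run used -d to name the checkpoint file, pass the same
-d value together with -R.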

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better.)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
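
Example invocations (the URL is a placeholder):

    webchecker.py http://www.example.com/
    webchecker.py -x -v http://www.example.com/
    webchecker.py -m 500000 -r 100 http://www.example.com/

The Checker class defined below can also be driven from Python
directly; roughly (a sketch, not a supported API -- run() and report()
are the methods main() itself uses):

    import webchecker
    c = webchecker.Checker()
    c.setflags(checkext=0, verbose=1)
    c.addroot("http://www.example.com/")
    c.run()
    c.report()
    c.save_pickle("@webchecker.pickle")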
"""

__version__ = "$Revision$"

import sys, os, string, getopt, pickle
import urllib, urlparse, sgmllib, mimetypes, robotparser, StringIO
from types import *


# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)

DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking
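
# Note: the assignments above are only defaults; main() overrides them from
# the command-line options documented in the docstring (-m sets maxpage,
# -r sets roundsize, -d the checkpoint file, -x toggles checkext, -a toggles
# nonames, and -v/-q raise or lower verbosity).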


def main():
    roundsize = ROUNDSIZE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__ % globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []

    for o, a in opts:
        if o == '-m': maxpage = string.atoi(a)
        if o == '-r': roundsize = string.atoi(a)
        if o == '-t': extra_roots.append(a)
        if o == '-a': nonames = not nonames
        if o == '-v': verbose = verbose + 1
        if o == '-x': checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames)

    if not restart and not args:
        args.append(DEFROOT)

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:
        c.run()
    except KeyboardInterrupt:
        print "[run interrupted]"

    try:
        c.report()
    except KeyboardInterrupt:
        print "[report interrupted]"

    if c.save_pickle(dumpfile):
        if dumpfile == DUMPFILE:
            print "Use ``%s -R'' to restart." % sys.argv[0]
        else:
            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                       dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", string.join(c.roots, "\n ")
    return c


class Checker:

    roundsize = ROUNDSIZE

    validflags = tuple(dir())

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.

        # The following are not pickled:
        self.urlopener = MyURLopener()

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format % args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format % args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link
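
    # For example (illustrative values), a (URL, fragment) pair like
    # ("http://host/page.html", "sec2") is formatted as
    # "http://host/page.html#sec2"; an empty fragment yields the bare URL.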

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)
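
    # For example, with a root of "http://host/dir/", the robots table is
    # keyed by "http://host/" (filled in by addrobot() above), and can_fetch()
    # decides whether the "webchecker" agent may fetch the given url.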

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        if url[:7] == 'mailto:' or url[:5] == 'news:':
            self.note(1, " Not checking mailto/news URL")
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)
        if text:
            if nurl != url:
                self.note(1, " Redirected to %s", nurl)
                url = nurl
            return Page(text, url, maxpage=self.maxpage, checker=self)
        return None

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0
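
    # For example, mimetypes.guess_type("http://host/logo.gif") returns
    # ('image/gif', None), so such a link is opened to verify that it exists
    # but its contents are never parsed for further links.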

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        for source, rawlink in origins:
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                del f
                return
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains. Stored the parser in an instance variable. Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg % args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # File reading is done in __init__() routine. Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
        return infos
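
    # Each triple is (resolved absolute link, link as written minus any
    # fragment, fragment); e.g. <A HREF="other.html#top"> on a page at
    # http://host/dir/page.html yields roughly
    # ("http://host/dir/other.html", "other.html", "top").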


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        StringIO.StringIO.__init__(self)


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None
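
    # For example, do_img() above calls link_attr(attributes, 'src', 'lowsrc'),
    # so whichever of those attributes is present and non-blank is recorded
    # in self.links as a link to be checked.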

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()


if __name__ == '__main__':
    main()