# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

This utility is handy to check a subweb of the world-wide web for
errors.  A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned.  Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked.  In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

When done, it reports pages with bad links within the subweb.  When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase.  By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered.  Use the -q option to disable
this output.

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed).  Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only.  In this case, the checkpoint file is not written
again.  The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle.  Remember that
Python's pickle module is currently quite slow.  Give it the time it
needs to load and save the checkpoint file.  When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

- You may find the (Tk-based) GUI version easier to use.  See wcgui.py.

- Webchecker honors the "robots.txt" convention.  Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker".  URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped.  The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix.  The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature.  External links are now checked during normal
processing.  (XXX The status of a checked link could be categorized
better.)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.
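
The Checker class defined below can also be driven from Python code
instead of the command line.  A minimal sketch (the root URL is only
illustrative, and this assumes the module is importable as webchecker):

    import webchecker
    c = webchecker.Checker()
    c.setflags(checkext=0, verbose=0)
    c.addroot("http://www.example.com/")
    c.run()
    c.report()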

Usage: webchecker.py [option] ... [rooturl] ...

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
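
Example invocations (the URL and checkpoint filename are only illustrative):

webchecker.py -x -v http://www.example.com/index.html
webchecker.py -R -d mycheck.pickle -q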


__version__ = "$Revision$"


# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]


DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking


def main():
    roundsize = ROUNDSIZE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print __doc__%globals()

    # The extra_roots variable collects extra roots.

    for o, a in opts:
        if o == '-m': maxpage = string.atoi(a)
        if o == '-r': roundsize = string.atoi(a)
        if o == '-t': extra_roots.append(a)
        if o == '-a': nonames = not nonames
        if o == '-v': verbose = verbose + 1
        if o == '-x': checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

        c = load_pickle(dumpfile=dumpfile, verbose=verbose)

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames)

    if not restart and not args:

    # The -t flag is only needed if external links are not to be
    # checked.  So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

            except KeyboardInterrupt:
                print "[run interrupted]"
        except KeyboardInterrupt:
            print "[report interrupted]"

        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    print "Root:", string.join(c.roots, "\n      ")
    return c


class Checker:

    roundsize = ROUNDSIZE

    validflags = tuple(dir())
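
    # Note: tuple(dir()) above snapshots the names bound so far in the class
    # body (the tunable flag defaults), which is what setflags() below checks
    # keyword arguments against.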

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

        # Add a name table, so that the name URLs can be checked.  Also
        # serves as an implicit cache for which URLs are done.

        # The following are not pickled:
        self.urlopener = MyURLopener()
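        # MyURLopener (defined below) adds a Python-webchecker User-agent
        # header and lets HTTP errors raise IOError rather than handing back
        # the server's error page.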

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)
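
    # Only the crawl state returned above is pickled; the robots cache and
    # the error table (marked "not pickled" above) are rebuilt when a
    # checkpoint is loaded -- see __setstate__() below.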

    def __setstate__(self, state):
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        for source in sources:
            triples = self.errors[source]
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs.  The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message("  HREF %s%s\n    msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          "  from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to remember that.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, "  Done link %s", self.format_url(url))

        # Make sure that if it's bad, that the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, "  Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, "  New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)
        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        f, url = self.openhtml(url_pair)

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            self.show(" HREF ", url, "  from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list.  The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        for source, rawlink in origins:
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # before all the data has been read; drain them first.
                f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains.  Stored the parser in an instance variable.  Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        return self.parser.names

    def getlinkinfos(self):
        # File reading is done in __init__() routine.  Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned.  See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
        return infos


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default
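
    # Rebinding http_error_default to the plain URLopener version (above)
    # makes HTTP errors raise IOError, so openpage() can record them, instead
    # of FancyURLopener's default of handing back the server's error page.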

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = string.split(string.lower(value))
                if (  parts == ["stylesheet"]
                      or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, "  Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()


if __name__ == '__main__':
    main()