# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.
This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.
In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
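
For example (illustrative invocation; adjust the path to your own
document root), to check such a local tree in place without following
external links:

    webchecker.py -x file:/usr/local/etc/httpd/htdocs/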
When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.
Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.
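
For example (illustrative session; the URL is hypothetical), a first run
such as

    webchecker.py http://www.example.com/

writes its checkpoint to @webchecker.pickle, and a later

    webchecker.py -R

resumes from that checkpoint, or simply reprints the reports if the
first run had already finished.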
The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.
- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.
- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better.)
- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.
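
For example (illustrative URLs), to skip external links in general but
still treat one extra tree as internal:

    webchecker.py -x -t http://www.example.com/docs/ http://www.example.com/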
Usage: webchecker.py [option] ... [rooturl] ...

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
__version__ = "$Revision$"


# Extract real version number if necessary
if __version__[0] == '$':
    _v = __version__.split()
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking
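
# Note: these defaults are interpolated into the usage text above (the
# %(DUMPFILE)s, %(MAXPAGE)d, %(ROUNDSIZE)d and %(DEFROOT)s placeholders)
# and can be overridden per run with the corresponding command-line options.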
    roundsize = ROUNDSIZE

        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print __doc__ % globals()
    # The extra_roots variable collects extra roots.

            extra_roots.append(a)
            nonames = not nonames
            verbose = verbose + 1
            checkext = not checkext

        print AGENTNAME, "version", __version__
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,

    if not restart and not args:
    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    for root in extra_roots:
        # Make sure it's terminated by a slash,
        # so that addroot doesn't discard the last
        # directory component.
        c.addroot(root, add_to_do=0)

    except KeyboardInterrupt:
            print "[run interrupted]"

    except KeyboardInterrupt:
            print "[report interrupted]"

    if c.save_pickle(dumpfile):
        if dumpfile == DUMPFILE:
            print "Use ``%s -R'' to restart." % sys.argv[0]
            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
        print "Root:", "\n ".join(c.roots)


    roundsize = ROUNDSIZE

    validflags = tuple(dir())
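    # dir() evaluated here in the class body lists the class-level attributes
    # defined so far, so validflags captures exactly the flag names that
    # setflags() below is willing to accept.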
    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.

        # The following are not pickled:
        self.urlopener = MyURLopener()
    def note(self, level, format, *args):
        if self.verbose > level:

    def message(self, format, *args):

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)
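
    # Only the link-checking state (roots, todo, done, bad and the round
    # counter) is pickled; transient helpers such as the URL opener and the
    # per-root robots.txt parsers are rebuilt rather than restored.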
    def __setstate__(self, state):
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
        for url in self.bad.keys():

    def addroot(self, root, add_to_do=1):
        if root not in self.roots:
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = path.rfind("/") + 1
            if 0 < i < len(path):
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
                self.newlink((root, ""), ("<root>", root))
    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            del urls[self.roundsize:]

        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),

        if not self.todo: s = "Final"
        self.message("%s Report (%s)", s, self.status())
    def report_errors(self):
            self.message("\nNo errors")
        self.message("\nError Report:")
        sources = self.errors.keys()
        for source in sources:
            triples = self.errors[source]
                self.message("%d Errors in %s", len(triples), source)
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                self.message("  HREF %s%s\n  msg %s",
                             self.format_url(url), s, msg)
    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
            self.show("Check ", self.format_url(url_pair),
                      "  from", self.todo[url_pair])
            self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            page = self.getpage(url_pair)
        except sgmllib.SGMLParseError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error parsing %s: %s",
                      self.format_url(url_pair), msg)
            # Don't actually mark the URL as bad - it exists, just

            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
            # If no page has been created yet, we want to
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)
    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, "  Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)
    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, "  Seen todo link %s", self.format_url(url))
            self.todo[url] = [origin]
            self.note(3, "  New todo link %s", self.format_url(url))

    def format_url(self, url):
        if fragment: return link + "#" + fragment

    def markdone(self, url):
        self.done[url] = self.todo[url]
    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)
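
    # Since inroots() returns the robots.txt verdict for URLs under a known
    # root, pages disallowed by robots.txt fall outside the subweb and are
    # reported as external URLs, as noted in the module docstring.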
    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
            if not self.checkext:
                self.note(1, " Not checking ext link")
            f = self.openpage(url_pair)
        text, nurl = self.readhtml(url_pair)
            self.note(1, " Redirected to %s", nurl)
            return Page(text, url, maxpage=self.maxpage, checker=self)
    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        f, url = self.openhtml(url_pair)

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
            if not self.checkforhtml(info, url):
    def openpage(self, url_pair):
        url, fragment = url_pair
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
                self.show(" HREF  ", url, "  from", self.todo[url_pair])
            self.setbad(url_pair, msg)
    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = cgi.parse_header(info['content-type'])[0].lower()
                # handle content-type: text/html; charset=iso8859-1 :
                ctype = ctype.split(';', 1)[0].strip()
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            self.note(1, " Not HTML, mime type %s", ctype)
    def setgood(self, url):
        if self.bad.has_key(url):
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
    def markerror(self, url):
            origins = self.todo[url]
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
            self.errors[url] = [triple]
    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        for source, rawlink in origins:
            if rawlink != link: s = " (%s)" % rawlink
            self.message("%s %s%s", p2, source, s)
    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]

    def safeclose(self, f):
        except AttributeError:
        if url[:4] == 'ftp:' or url[:7] == 'file://':
            # Apparently ftp connections don't like to be closed
    def save_pickle(self, dumpfile=DUMPFILE):
            self.note(0, "\nNo need to save checkpoint")
            self.note(0, "No dumpfile, won't save checkpoint")
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The page is parsed here in __init__() in order to initialize the
        # list of names the file contains; the parser is stored in an
        # instance variable and the URL is passed to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
    def note(self, level, msg, *args):
            apply(self.checker.note, (level, msg) + args)
            if self.verbose >= level:

    # Method to retrieve names.
            return self.parser.names
    def getlinkinfos(self):
        # Parsing was done in the __init__() routine; the parser stored there
        # indicates whether parsing succeeded.
        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        StringIO.StringIO.__init__(self)


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

        apply(urllib.FancyURLopener.__init__, args)
            ('User-agent', 'Python-webchecker/%s' % __version__),

    def http_error_401(self, url, fp, errcode, errmsg, headers):
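
    # open_file() below implements the "file:" URL extension described in the
    # module docstring: for a directory it serves index.html when present and
    # otherwise synthesizes a small HTML directory listing via MyStringIO.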
    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
        return urllib.FancyURLopener.open_file(self, url)
class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME attributes from the anchor, in order to
        # cache the internal anchors which are made available in the page.
        for name, value in attributes:
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                else: self.names.append(value)

    def end_a(self): pass
    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_link(self, attributes):
        for name, value in attributes:
                parts = value.lower().split()
                if ( parts == ["stylesheet"]
                     or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
                if value: value = value.strip()
                if value: self.links[value] = None
    def do_base(self, attributes):
        for name, value in attributes:
                if value: value = value.strip()
                    self.checker.note(1, "  Base %s", value)

        return self.links.keys()


if __name__ == '__main__':