# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.
This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of its URL.
In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
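For example, to check such a local document tree in place, point it at a
``file:'' root (the path below is just the built-in default):

    webchecker.py file:/usr/local/etc/httpd/htdocs/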
When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked so far.
In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.
Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.
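For example, a first run followed by a later error-only report might look
like this (the URL is purely illustrative):

    webchecker.py http://www.example.com/
    webchecker.py -Rq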
The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. If the program is
interrupted while writing the checkpoint file, the old checkpoint file
is not overwritten, but all work done in the current run is lost.
- You may find the (Tk-based) GUI version easier to use. See wcgui.py.
- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.
- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.
- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.
- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.
- We now check internal NAME anchor links, as well as toplevel links.
- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better.)
- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.
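For example (with hypothetical URLs), to skip external links in general but
still treat a mirror site as internal:

    webchecker.py -x -t http://mirror.example.org/ http://www.example.com/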
Usage: webchecker.py [option] ... [rooturl] ...

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
__version__ = "$Revision$"

# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)

DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking
    roundsize = ROUNDSIZE

        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print __doc__%globals()

    # The extra_roots variable collects extra roots.

            maxpage = string.atoi(a)
            roundsize = string.atoi(a)
            extra_roots.append(a)
            nonames = not nonames
            verbose = verbose + 1
            checkext = not checkext

        print AGENTNAME, "version", __version__

        c = load_pickle(dumpfile=dumpfile, verbose=verbose)

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,

    if not restart and not args:

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            c.addroot(root, add_to_do = 0)

            except KeyboardInterrupt:
                    print "[run interrupted]"

        except KeyboardInterrupt:
                print "[report interrupted]"

        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
        print "Root:", string.join(c.roots, "\n ")
    roundsize = ROUNDSIZE

    validflags = tuple(dir())
    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)
        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.

        # The following are not pickled:
        self.urlopener = MyURLopener()
    def note(self, level, format, *args):
        if self.verbose > level:

    def message(self, format, *args):
    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)
    def __setstate__(self, state):
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
        for url in self.bad.keys():
    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
                self.newlink((root, ""), ("<root>", root))
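    # For illustration (hypothetical URL): addroot("http://host/dir/page.html")
    # records the prefix "http://host/dir/" in self.roots, so that any URL
    # starting with that prefix is treated as part of the subweb.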
    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            del urls[self.roundsize:]
353 return "%d total, %d to do, %d done, %d bad" % (
354 len(self
.todo
)+len(self
.done
),
355 len(self
.todo
), len(self
.done
),
        if not self.todo: s = "Final"
        self.message("%s Report (%s)", s, self.status())
    def report_errors(self):
            self.message("\nNo errors")
        self.message("\nError Report:")
        sources = self.errors.keys()
        for source in sources:
            triples = self.errors[source]
                self.message("%d Errors in %s", len(triples), source)
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)
    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
        page = self.getpage(url_pair)
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
            # If no page has been created yet, we want to
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)
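    # For illustration (hypothetical URL): the pair ("http://host/doc.html",
    # "intro") only checks out if the fetched page defines an anchor named
    # "intro" (e.g. <A NAME="intro">); otherwise setbad() records a
    # "Missing name anchor" error for it.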
    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
            self.newtodolink(url, origin)
    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, that the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)
    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))
    def format_url(self, url):
        if fragment: return link + "#" + fragment
    def markdone(self, url):
        self.done[url] = self.todo[url]
    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)
    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
            if not self.checkext:
                self.note(1, " Not checking ext link")
            f = self.openpage(url_pair)
        text, nurl = self.readhtml(url_pair)
            self.note(1, " Redirected to %s", nurl)
            return Page(text, url, maxpage=self.maxpage, checker=self)
    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        f, url = self.openhtml(url_pair)
    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
            if not self.checkforhtml(info, url):
    def openpage(self, url_pair):
        url, fragment = url_pair
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(cgi.parse_header(info['content-type'])[0])
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            self.note(1, " Not HTML, mime type %s", ctype)
    def setgood(self, url):
        if self.bad.has_key(url):
            self.note(0, "(Clear previously seen error)")
    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
    def markerror(self, url):
            origins = self.todo[url]
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)
    def seterror(self, url, triple):
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
            self.errors[url] = [triple]
    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        for source, rawlink in origins:
            if rawlink != link: s = " (%s)" % rawlink
            self.message("%s %s%s", p2, source, s)
    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
    def safeclose(self, f):
        except AttributeError:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
    def save_pickle(self, dumpfile=DUMPFILE):
            self.note(0, "\nNo need to save checkpoint")
            self.note(0, "No dumpfile, won't save checkpoint")
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker
        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains. Stored the parser in an instance variable. Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
    def note(self, level, msg, *args):
            apply(self.checker.note, (level, msg) + args)
            if self.verbose >= level:
    # Method to retrieve names.
            return self.parser.names
    def getlinkinfos(self):
        # File reading is done in __init__() routine. Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        StringIO.StringIO.__init__(self)
class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

        apply(urllib.FancyURLopener.__init__, args)
            ('User-agent', 'Python-webchecker/%s' % __version__),

    def http_error_401(self, url, fp, errcode, errmsg, headers):
    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
        return urllib.FancyURLopener.open_file(self, url)
class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        sgmllib.SGMLParser.__init__(self)
, attributes
):
778 self
.link_attr(attributes
, 'href')
780 # We must rescue the NAME
781 # attributes from the anchor, in order to
782 # cache the internal anchors which are made
783 # available in the page.
784 for name
, value
in attributes
:
786 if value
in self
.names
:
787 self
.checker
.message("WARNING: duplicate name %s in %s",
789 else: self
.names
.append(value
)
792 def end_a(self
): pass
794 def do_area(self
, attributes
):
795 self
.link_attr(attributes
, 'href')
797 def do_body(self
, attributes
):
798 self
.link_attr(attributes
, 'background', 'bgsound')
800 def do_img(self
, attributes
):
801 self
.link_attr(attributes
, 'src', 'lowsrc')
803 def do_frame(self
, attributes
):
804 self
.link_attr(attributes
, 'src', 'longdesc')
806 def do_iframe(self
, attributes
):
807 self
.link_attr(attributes
, 'src', 'longdesc')
809 def do_link(self
, attributes
):
810 for name
, value
in attributes
:
812 parts
= string
.split(string
.lower(value
))
813 if ( parts
== ["stylesheet"]
814 or parts
== ["alternate", "stylesheet"]):
815 self
.link_attr(attributes
, "href")
818 def do_object(self
, attributes
):
819 self
.link_attr(attributes
, 'data', 'usemap')
821 def do_script(self
, attributes
):
822 self
.link_attr(attributes
, 'src')
824 def do_table(self
, attributes
):
825 self
.link_attr(attributes
, 'background')
827 def do_td(self
, attributes
):
828 self
.link_attr(attributes
, 'background')
830 def do_th(self
, attributes
):
831 self
.link_attr(attributes
, 'background')
833 def do_tr(self
, attributes
):
834 self
.link_attr(attributes
, 'background')
836 def link_attr(self
, attributes
, *args
):
837 for name
, value
in attributes
:
839 if value
: value
= string
.strip(value
)
840 if value
: self
.links
[value
] = None
842 def do_base(self
, attributes
):
843 for name
, value
in attributes
:
845 if value
: value
= string
.strip(value
)
848 self
.checker
.note(1, " Base %s", value
)
        return self.links.keys()


if __name__ == '__main__':