#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors.  A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.
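For example, if ``http://www.example.com/docs/'' is given as a root
URL, then ``http://www.example.com/docs/intro/index.html'' belongs to
the subweb, while ``http://www.example.com/other.html'' does not.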

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned.  Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked.  In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
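For example, a root of ``file:/usr/local/etc/httpd/htdocs/'' is
checked as the index.html file in that directory if one exists, and as
a generated listing of the directory otherwise.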

Report printed:

When done, it reports pages with bad links within the subweb.  When
interrupted, it reports on the pages that it has checked already.

In verbose mode, additional messages are printed during the
information gathering phase.  By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered.  Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed).  Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only.  In this case, the checkpoint file is not written
again.  The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle.  Remember that
Python's pickle module is currently quite slow.  Give it the time it
needs to load and save the checkpoint file.  When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use.  See wcgui.py.

- Webchecker honors the "robots.txt" convention.  Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker".  URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped.  The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix.  The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature.  External links are now checked during normal
processing.  (XXX The status of a checked link could be categorized
better.  Later...)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
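
Examples (illustrative invocations; the URLs and the pickle filename
below are placeholders, not values built into this script):

  webchecker.py http://www.example.com/docs/
  webchecker.py -x -t http://www.example.com/extra/ http://www.example.com/docs/
  webchecker.py -R -d mysite.pickle -q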

"""


__version__ = "$Revision$"


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib
import cgi

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]


# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking


# Global variables


def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked.  So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", string.join(c.roots, "\n ")
    return c


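# The Checker class drives the whole process.  It keeps the list of
# root URLs, one robots.txt parser per root, and three tables keyed on
# (URL, fragment) pairs -- todo, done and bad -- plus an errors table
# mapping each origin page to the bad links found on it.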
class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES

    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked.  Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except (OSError, IOError), msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs.  The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, that the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except (OSError, IOError), msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(cgi.parse_header(info['content-type'])[0])
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list.  The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


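# A Page object wraps one fetched HTML document: unless the text is
# larger than maxpage it is run through MyHTMLParser, and the NAME
# anchors and outgoing links are then available via getnames() and
# getlinkinfos(), the latter as (absolute URL, raw link, fragment)
# triples.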
class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains.  Stored the parser in an instance variable.  Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # File reading is done in __init__() routine.  Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned.  See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos


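# MyStringIO is a StringIO that also carries a URL and an info()
# mapping, so the directory listings synthesized by
# MyURLopener.open_file() below look like ordinary urllib responses.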
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url


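# MyURLopener customizes urllib.FancyURLopener: it sends a
# Python-webchecker User-agent header, restores URLopener's default
# error handling, disables FancyURLopener's interactive handling of
# 401 responses, and implements the ``file:'' extension described in
# the module docstring (index.html or a generated listing for
# directory paths).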
class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


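# MyHTMLParser collects the link-bearing attributes (HREF, SRC,
# BACKGROUND, ...) of the tags it recognizes into a dictionary of raw
# links, records NAME anchors so fragment references can be verified,
# and honors the <BASE> tag.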
class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = string.split(string.lower(value))
                if ( parts == ["stylesheet"]
                     or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")
                break

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base


if __name__ == '__main__':
    main()