#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.
6 """Web tree checker.
8 This utility is handy to check a subweb of the world-wide web for
9 errors. A subweb is specified by giving one or more ``root URLs''; a
10 page belongs to the subweb if one of the root URLs is an initial
11 prefix of it.
13 File URL extension:
15 In order to easy the checking of subwebs via the local file system,
16 the interpretation of ``file:'' URLs is extended to mimic the behavior
17 of your average HTTP daemon: if a directory pathname is given, the
18 file index.html in that directory is returned if it exists, otherwise
19 a directory listing is returned. Now, you can point webchecker to the
20 document tree in the local file system of your HTTP daemon, and have
21 most of it checked. In fact the default works this way if your local
22 web tree is located at /usr/local/etc/httpd/htdpcs (the default for
23 the NCSA HTTP daemon and probably others).
25 Report printed:
27 When done, it reports pages with bad links within the subweb. When
28 interrupted, it reports for the pages that it has checked already.
30 In verbose mode, additional messages are printed during the
31 information gathering phase. By default, it prints a summary of its
32 work status every 50 URLs (adjustable with the -r option), and it
33 reports errors as they are encountered. Use the -q option to disable
34 this output.
36 Checkpoint feature:
38 Whether interrupted or not, it dumps its state (a Python pickle) to a
39 checkpoint file and the -R option allows it to restart from the
40 checkpoint (assuming that the pages on the subweb that were already
41 processed haven't changed). Even when it has run till completion, -R
42 can still be useful -- it will print the reports again, and -Rq prints
43 the errors only. In this case, the checkpoint file is not written
44 again. The checkpoint file can be set with the -d option.
46 The checkpoint file is written as a Python pickle. Remember that
47 Python's pickle module is currently quite slow. Give it the time it
48 needs to load and save the checkpoint file. When interrupted while
49 writing the checkpoint file, the old checkpoint file is not
50 overwritten, but all work done in the current run is lost.
52 Miscellaneous:
54 - You may find the (Tk-based) GUI version easier to use. See wcgui.py.
56 - Webchecker honors the "robots.txt" convention. Thanks to Skip
57 Montanaro for his robotparser.py module (included in this directory)!
58 The agent name is hardwired to "webchecker". URLs that are disallowed
59 by the robots.txt file are reported as external URLs.
61 - Because the SGML parser is a bit slow, very large SGML files are
62 skipped. The size limit can be set with the -m option.
64 - When the server or protocol does not tell us a file's type, we guess
65 it based on the URL's suffix. The mimetypes.py module (also in this
66 directory) has a built-in table mapping most currently known suffixes,
67 and in addition attempts to read the mime.types configuration files in
68 the default locations of Netscape and the NCSA HTTP daemon.
70 - We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
71 honor the <BASE> tag.
73 - We now check internal NAME anchor links, as well as toplevel links.
75 - Checking external links is now done by default; use -x to *disable*
76 this feature. External links are now checked during normal
77 processing. (XXX The status of a checked link could be categorized
78 better. Later...)
80 - If external links are not checked, you can use the -t flag to
81 provide specific overrides to -x.
83 Usage: webchecker.py [option] ... [rooturl] ...
85 Options:
87 -R -- restart from checkpoint file
88 -d file -- checkpoint filename (default %(DUMPFILE)s)
89 -m bytes -- skip HTML pages larger than this size (default %(MAXPAGE)d)
90 -n -- reports only, no checking (use with -R)
91 -q -- quiet operation (also suppresses external links report)
92 -r number -- number of links processed per round (default %(ROUNDSIZE)d)
93 -t root -- specify root dir which should be treated as internal (can repeat)
94 -v -- verbose operation; repeating -v will increase verbosity
95 -x -- don't check external links (these are often slow to check)
96 -a -- don't check name anchors
98 Arguments:
100 rooturl -- URL to start checking
101 (default %(DEFROOT)s)
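
# Illustrative invocations (the URLs are placeholders, not part of this module):
#   webchecker.py -v http://www.example.com/     # check a site, more verbose
#   webchecker.py -x http://www.example.com/     # don't follow external links
#   webchecker.py -R -q                          # reprint only the errors
#                                                # recorded in the checkpoint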

__version__ = "$Revision$"

import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]

# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking


# Global variables
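
# Command-line driver: parse the options documented above, build (or, with
# -R, reload) a Checker, run it, print the report, and always try to save
# a checkpoint on the way out.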
def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)

def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", string.join(c.roots, "\n ")
    return c
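
# The Checker holds all crawl state: the root URLs, the todo/done/bad URL
# maps, the table of parsed pages used for NAME-anchor checks, and one
# robots.txt parser per root.  Instances are picklable (see __getstate__ and
# __setstate__), which is what the checkpoint feature relies on.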
class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES
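
    # tuple(dir()) evaluated here captures the names defined so far in the
    # class body (the flag defaults above, plus standard class attributes),
    # so setflags() can reject unknown keyword arguments.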
    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format
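
    # Pickle support: only the core crawl state is saved.  The robots.txt
    # parsers, the error table and the URL opener are rebuilt when the state
    # is restored (see reset() and the addrobot()/markerror() calls below).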
    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)
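
    # addroot() registers a root URL.  The stored root is truncated at the
    # last "/" of its path, so pages in the same directory count as internal;
    # the root's robots.txt is fetched, and unless add_to_do is 0 the root
    # itself is queued for checking.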
    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)
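
    # Main loop: each round takes up to roundsize URLs (in sorted order) from
    # the todo map and checks them.  Newly discovered links go back into the
    # todo map, so the loop ends only when nothing is left to do.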
    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs.  The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1
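
    # A URL is internal if it extends one of the roots, and even then it is
    # only checked if that root's robots.txt allows AGENTNAME to fetch it;
    # disallowed URLs are treated as external, as the docstring notes.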
    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None
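
    # Decide whether a resource is HTML: trust the Content-Type header when
    # the server sent one, assume a trailing "/" means a directory listing
    # (which is HTML), and otherwise guess from the URL's suffix via mimetypes.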
    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)
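
    # markerror() fans a bad URL out to every page that links to it, so the
    # error report can show each offending HREF next to its source page.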
    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)
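
    # sanitize() strips unpicklable parts out of IOError values: the Message
    # instance attached to "http error" tuples can hold a file object, which
    # would otherwise break checkpoint saving.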
    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()
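
    # The checkpoint is written to a ".new" file first and only renamed over
    # the old dump afterwards; that is why an interrupted save leaves the
    # previous checkpoint intact (as promised in the module docstring).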
    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1
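
# A Page wraps the text of one fetched document.  It runs the SGML parser
# over it (unless the document exceeds maxpage) and exposes the NAME anchors
# and outgoing links that were found.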
class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file contains.
        # The parser is stored in an instance variable, and the URL is
        # passed to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # File reading is done in __init__() routine. Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos
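
# MyStringIO is an in-memory file that also supports the info() and geturl()
# methods of the file-like objects returned by urllib, so the synthesized
# directory listings below can be processed like any other page.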
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url
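
# MyURLopener identifies itself as Python-webchecker, restores URLopener's
# strict default handling of HTTP errors (so they surface as IOError), never
# prompts for a password on 401 responses, and implements the "file:"
# directory behavior described in the module docstring.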
class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None
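
    # "file:" URLs that name a directory are served the way an HTTP daemon
    # would serve them: return index.html if it exists, otherwise synthesize
    # a small HTML listing of the directory in a MyStringIO object.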
    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)
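
# The parser collects candidate links (self.links is a dict used as a set),
# NAME anchors (self.names) and the <BASE> URL from the tag handlers below.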
class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = string.split(string.lower(value))
                if (  parts == ["stylesheet"]
                      or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")
                break

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')
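
    # Record the value of any of the named attributes as a candidate link;
    # empty values are ignored and self.links keeps each link only once.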
    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base

if __name__ == '__main__':
    main()