#! /usr/bin/env python

# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors.  A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

File URL extension:

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned.  Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked.  In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

Report printed:

When done, it reports pages with bad links within the subweb.  When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase.  By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered.  Use the -q option to disable
this output.

Checkpoint feature:

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed).  Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only.  In this case, the checkpoint file is not written
again.  The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle.  Remember that
Python's pickle module is currently quite slow.  Give it the time it
needs to load and save the checkpoint file.  If interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

Miscellaneous:

- You may find the (Tk-based) GUI version easier to use.  See wcgui.py.

- Webchecker honors the "robots.txt" convention.  Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker".  URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped.  The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix.  The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <AREA>, <FRAME> and <IMG> tags.
We also honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature.  External links are now checked during normal
processing.  (XXX The status of a checked link could be categorized
better.  Later...)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)

"""

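# Example invocation (illustrative only -- substitute your own root URL):
#
#     python webchecker.py -v -m 300000 http://www.example.com/
#
# This crawls the subweb rooted at the given URL, prints a status summary
# every ROUNDSIZE URLs, and leaves a checkpoint in @webchecker.pickle that
# a later ``webchecker.py -R'' run can resume from.
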
__version__ = "$Revision$"


import sys
import os
from types import *
import string
import StringIO
import getopt
import pickle

import urllib
import urlparse
import sgmllib

import mimetypes
import robotparser

# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]

# Tunable parameters
DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # If true, skip name anchor checking


# Global variables

def main():
    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    dumpfile = DUMPFILE
    restart = 0
    norun = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__%globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []
    nonames = NONAMES

    for o, a in opts:
        if o == '-R':
            restart = 1
        if o == '-d':
            dumpfile = a
        if o == '-m':
            maxpage = string.atoi(a)
        if o == '-n':
            norun = 1
        if o == '-q':
            verbose = 0
        if o == '-r':
            roundsize = string.atoi(a)
        if o == '-t':
            extra_roots.append(a)
        if o == '-a':
            nonames = not nonames
        if o == '-v':
            verbose = verbose + 1
        if o == '-x':
            checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames
               )

    if not restart and not args:
        args.append(DEFROOT)

    for arg in args:
        c.addroot(arg)

    # The -t flag is only needed if external links are not to be
    # checked.  So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:

        if not norun:
            try:
                c.run()
            except KeyboardInterrupt:
                if verbose > 0:
                    print "[run interrupted]"

        try:
            c.report()
        except KeyboardInterrupt:
            if verbose > 0:
                print "[report interrupted]"

    finally:
        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)

def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", string.join(c.roots, "\n      ")
    return c

class Checker:

    checkext = CHECKEXT
    verbose = VERBOSE
    maxpage = MAXPAGE
    roundsize = ROUNDSIZE
    nonames = NONAMES

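    # Snapshot of the names defined so far in the class body (the flag
    # defaults above); setflags() only accepts keywords from this tuple.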
    validflags = tuple(dir())

    def __init__(self):
        self.reset()

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

    def reset(self):
        self.roots = []
        self.todo = {}
        self.done = {}
        self.bad = {}

        # Add a name table, so that the name URLs can be checked.  Also
        # serves as an implicit cache for which URLs are done.
        self.name_table = {}

        self.round = 0
        # The following are not pickled:
        self.robots = {}
        self.errors = {}
        self.urlopener = MyURLopener()
        self.changed = 0

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format%args
        print format

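    # Only the crawl state (roots, todo, done, bad, round) is pickled;
    # __setstate__() calls reset() and then rebuilds the robots.txt
    # parsers, the error table and the URL opener from that state.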
    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        self.reset()
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

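    # run() drains the todo list in rounds of at most self.roundsize
    # URLs (in sorted order), so the status summary printed above each
    # round appears at regular intervals.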
    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        self.message("")
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            self.message("")
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs.  The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message("  HREF %s%s\n    msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          "  from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, "  Done link %s", self.format_url(url))

        # Make sure that if it's bad, that the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, "  Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, "  New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        if url[:7] == 'mailto:' or url[:5] == 'news:':
            self.note(1, " Not checking mailto/news URL")
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)

        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        if text:
            return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, "  from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list.  The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        i = 0
        for source, rawlink in origins:
            i = i+1
            if i == 2:
                p2 = ' '*len(p2)
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                text = f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            try:
                os.unlink(dumpfile)
            except os.error:
                pass
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1

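# A Page holds the text of one fetched HTML document together with the
# MyHTMLParser run over it; the Checker asks it for outgoing links
# (getlinkinfos) and for the NAME anchors it defines (getnames).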
class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file contains.
        # The parser is stored in an instance variable, and the URL is
        # passed to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)
        self.parser.close()

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # File reading is done in __init__() routine.  Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned.  See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))

        return infos

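# A StringIO that also remembers a URL and an info() mapping, so the
# synthetic directory listings built in MyURLopener.open_file() can be
# treated like ordinary urllib responses.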
class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)

    def info(self):
        return self.__info

    def geturl(self):
        return self.__url

class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

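    # Mimic an HTTP daemon for file: URLs, as described in the module
    # docstring: serve index.html if the directory has one, otherwise
    # synthesize a listing with a <BASE> tag and one <A> per entry.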
    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)

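# The parser collects HREF/SRC links from <A>, <AREA>, <IMG> and <FRAME>
# tags, NAME anchors from <A> tags, and the document's <BASE> URL.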
class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, "  Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()

    def getbase(self):
        return self.base

if __name__ == '__main__':
    main()