# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

This utility is handy to check a subweb of the world-wide web for
errors.  A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned.  Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked.  In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).

When done, it reports pages with bad links within the subweb.  When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase.  By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered.  Use the -q option to disable
this output.

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed).  Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only.  In this case, the checkpoint file is not written
again.  The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle.  Remember that
Python's pickle module is currently quite slow.  Give it the time it
needs to load and save the checkpoint file.  When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.

- You may find the (Tk-based) GUI version easier to use.  See wcgui.py.

- Webchecker honors the "robots.txt" convention.  Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker".  URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped.  The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix.  The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature.  External links are now checked during normal
processing.  (XXX The status of a checked link could be categorized
better.)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.
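
The Checker class defined below can also be driven from Python code
instead of the command line.  A minimal sketch (the root URL is only
illustrative, and this assumes the module is importable as webchecker):

    import webchecker
    c = webchecker.Checker()
    c.setflags(checkext=0, verbose=0)
    c.addroot("http://www.example.com/")
    c.run()
    c.report()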

Usage: webchecker.py [option] ... [rooturl] ...

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
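
Example invocations (the URL and checkpoint filename are only illustrative):

webchecker.py -x -v http://www.example.com/index.html
webchecker.py -R -d mycheck.pickle -q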


__version__ = "$Revision$"


# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]


DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking


def main():
    roundsize = ROUNDSIZE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print __doc__%globals()

    # The extra_roots variable collects extra roots.

    for o, a in opts:
        if o == '-m': maxpage = string.atoi(a)
        if o == '-r': roundsize = string.atoi(a)
        if o == '-t': extra_roots.append(a)
        if o == '-a': nonames = not nonames
        if o == '-v': verbose = verbose + 1
        if o == '-x': checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

        c = load_pickle(dumpfile=dumpfile, verbose=verbose)

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames)

    if not restart and not args:

    # The -t flag is only needed if external links are not to be
    # checked.  So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

            except KeyboardInterrupt:
                print "[run interrupted]"
        except KeyboardInterrupt:
            print "[report interrupted]"

        if c.save_pickle(dumpfile):
            if dumpfile == DUMPFILE:
                print "Use ``%s -R'' to restart." % sys.argv[0]
            else:
                print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                           dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    print "Root:", string.join(c.roots, "\n      ")
    return c


class Checker:

    roundsize = ROUNDSIZE

    validflags = tuple(dir())
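
    # Note: tuple(dir()) above snapshots the names bound so far in the class
    # body (the tunable flag defaults), which is what setflags() below checks
    # keyword arguments against.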

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

        # Add a name table, so that the name URLs can be checked.  Also
        # serves as an implicit cache for which URLs are done.

        # The following are not pickled:
        self.urlopener = MyURLopener()
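        # MyURLopener (defined below) adds a Python-webchecker User-agent
        # header and lets HTTP errors raise IOError rather than handing back
        # the server's error page.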

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format%args
            self.message(format)

    def message(self, format, *args):

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)
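
    # Only the crawl state returned above is pickled; the robots cache and
    # the error table (marked "not pickled" above) are rebuilt when a
    # checkpoint is loaded -- see __setstate__() below.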

    def __setstate__(self, state):
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        for source in sources:
            triples = self.errors[source]
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs.  The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message("  HREF %s%s\n    msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          "  from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to remember that.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, "  Done link %s", self.format_url(url))

        # Make sure that if it's bad, that the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, "  Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, "  New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        scheme, path = urllib.splittype(url)
        if scheme in ('mailto', 'news', 'javascript', 'telnet'):
            self.note(1, " Not checking %s URL" % scheme)
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)
        if nurl != url:
            self.note(1, " Redirected to %s", nurl)
            url = nurl
        return Page(text, url, maxpage=self.maxpage, checker=self)

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        f, url = self.openhtml(url_pair)

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            self.show(" HREF ", url, "  from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list.  The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        for source, rawlink in origins:
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # before all the data has been read; drain them first.
                f.read()
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains.  Stored the parser in an instance variable.  Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg%args
                print msg

    # Method to retrieve names.
    def getnames(self):
        return self.parser.names

    def getlinkinfos(self):
        # File reading is done in __init__() routine.  Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned.  See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
        return infos


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        self.__url = url
        self.__info = info
        StringIO.StringIO.__init__(self)


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default
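
    # Rebinding http_error_default to the plain URLopener version (above)
    # makes HTTP errors raise IOError, so openpage() can record them, instead
    # of FancyURLopener's default of handing back the server's error page.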

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_body(self, attributes):
        self.link_attr(attributes, 'background', 'bgsound')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_iframe(self, attributes):
        self.link_attr(attributes, 'src', 'longdesc')

    def do_link(self, attributes):
        for name, value in attributes:
            if name == "rel":
                parts = string.split(string.lower(value))
                if (  parts == ["stylesheet"]
                      or parts == ["alternate", "stylesheet"]):
                    self.link_attr(attributes, "href")

    def do_object(self, attributes):
        self.link_attr(attributes, 'data', 'usemap')

    def do_script(self, attributes):
        self.link_attr(attributes, 'src')

    def do_table(self, attributes):
        self.link_attr(attributes, 'background')

    def do_td(self, attributes):
        self.link_attr(attributes, 'background')

    def do_th(self, attributes):
        self.link_attr(attributes, 'background')

    def do_tr(self, attributes):
        self.link_attr(attributes, 'background')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, "  Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()


if __name__ == '__main__':
    main()