# Original code by Guido van Rossum; extensive changes by Sam Bayer,
# including code to check URL fragments.

"""Web tree checker.

This utility is handy to check a subweb of the world-wide web for
errors. A subweb is specified by giving one or more ``root URLs''; a
page belongs to the subweb if one of the root URLs is an initial
prefix of it.

In order to ease the checking of subwebs via the local file system,
the interpretation of ``file:'' URLs is extended to mimic the behavior
of your average HTTP daemon: if a directory pathname is given, the
file index.html in that directory is returned if it exists, otherwise
a directory listing is returned. Now, you can point webchecker to the
document tree in the local file system of your HTTP daemon, and have
most of it checked. In fact the default works this way if your local
web tree is located at /usr/local/etc/httpd/htdocs (the default for
the NCSA HTTP daemon and probably others).
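
For example, to check such a local tree directly through the file
system (this path is the default and may differ on your machine):

    webchecker.py file:/usr/local/etc/httpd/htdocs/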

When done, it reports pages with bad links within the subweb. When
interrupted, it reports on the pages that it has checked so far.

In verbose mode, additional messages are printed during the
information gathering phase. By default, it prints a summary of its
work status every 50 URLs (adjustable with the -r option), and it
reports errors as they are encountered. Use the -q option to disable
this output.

Whether interrupted or not, it dumps its state (a Python pickle) to a
checkpoint file, and the -R option allows it to restart from the
checkpoint (assuming that the pages on the subweb that were already
processed haven't changed). Even when it has run till completion, -R
can still be useful -- it will print the reports again, and -Rq prints
the errors only. In this case, the checkpoint file is not written
again. The checkpoint file can be set with the -d option.

The checkpoint file is written as a Python pickle. Remember that
Python's pickle module is currently quite slow. Give it the time it
needs to load and save the checkpoint file. When interrupted while
writing the checkpoint file, the old checkpoint file is not
overwritten, but all work done in the current run is lost.
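
For example, an interrupted run can be resumed, or its reports
reprinted, like this (example.com is a placeholder):

    webchecker.py http://www.example.com/
    webchecker.py -R
    webchecker.py -Rq

If the first run used -d to name the checkpoint file, pass the same
-d value together with -R.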

- You may find the (Tk-based) GUI version easier to use. See wcgui.py.

- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.

- Because the SGML parser is a bit slow, very large SGML files are
skipped. The size limit can be set with the -m option.

- When the server or protocol does not tell us a file's type, we guess
it based on the URL's suffix. The mimetypes.py module (also in this
directory) has a built-in table mapping most currently known suffixes,
and in addition attempts to read the mime.types configuration files in
the default locations of Netscape and the NCSA HTTP daemon.

- We follow links indicated by <A>, <FRAME> and <IMG> tags. We also
honor the <BASE> tag.

- We now check internal NAME anchor links, as well as toplevel links.

- Checking external links is now done by default; use -x to *disable*
this feature. External links are now checked during normal
processing. (XXX The status of a checked link could be categorized
better.)

- If external links are not checked, you can use the -t flag to
provide specific overrides to -x.

Usage: webchecker.py [option] ... [rooturl] ...

Options:

-R        -- restart from checkpoint file
-d file   -- checkpoint filename (default %(DUMPFILE)s)
-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
-n        -- reports only, no checking (use with -R)
-q        -- quiet operation (also suppresses external links report)
-r number -- number of links processed per round (default %(ROUNDSIZE)d)
-t root   -- specify root dir which should be treated as internal (can repeat)
-v        -- verbose operation; repeating -v will increase verbosity
-x        -- don't check external links (these are often slow to check)
-a        -- don't check name anchors

Arguments:

rooturl   -- URL to start checking
             (default %(DEFROOT)s)
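
Example invocations (the URL is a placeholder):

    webchecker.py http://www.example.com/
    webchecker.py -x -v http://www.example.com/
    webchecker.py -m 500000 -r 100 http://www.example.com/

The Checker class defined below can also be driven from Python
directly; roughly (a sketch, not a supported API -- run() and report()
are the methods main() itself uses):

    import webchecker
    c = webchecker.Checker()
    c.setflags(checkext=0, verbose=1)
    c.addroot("http://www.example.com/")
    c.run()
    c.report()
    c.save_pickle("@webchecker.pickle")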
"""

__version__ = "$Revision$"

import sys, os, string, getopt, pickle
import urllib, urlparse, sgmllib, mimetypes, robotparser, StringIO
from types import *


# Extract real version number if necessary
if __version__[0] == '$':
    _v = string.split(__version__)

DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
CHECKEXT = 1                            # Check external references (1 deep)
VERBOSE = 1                             # Verbosity level (0-3)
MAXPAGE = 150000                        # Ignore files bigger than this
ROUNDSIZE = 50                          # Number of links processed per round
DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
AGENTNAME = "webchecker"                # Agent name for robots.txt parser
NONAMES = 0                             # Force name anchor checking
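
# Note: the assignments above are only defaults; main() overrides them from
# the command-line options documented in the docstring (-m sets maxpage,
# -r sets roundsize, -d the checkpoint file, -x toggles checkext, -a toggles
# nonames, and -v/-q raise or lower verbosity).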


def main():
    roundsize = ROUNDSIZE

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
    except getopt.error, msg:
        sys.stdout = sys.stderr
        print msg
        print __doc__ % globals()
        sys.exit(2)

    # The extra_roots variable collects extra roots.
    extra_roots = []

    for o, a in opts:
        if o == '-m': maxpage = string.atoi(a)
        if o == '-r': roundsize = string.atoi(a)
        if o == '-t': extra_roots.append(a)
        if o == '-a': nonames = not nonames
        if o == '-v': verbose = verbose + 1
        if o == '-x': checkext = not checkext

    if verbose > 0:
        print AGENTNAME, "version", __version__

    if restart:
        c = load_pickle(dumpfile=dumpfile, verbose=verbose)
    else:
        c = Checker()

    c.setflags(checkext=checkext, verbose=verbose,
               maxpage=maxpage, roundsize=roundsize,
               nonames=nonames)

    if not restart and not args:
        args.append(DEFROOT)

    # The -t flag is only needed if external links are not to be
    # checked. So -t values are ignored unless -x was specified.
    if not checkext:
        for root in extra_roots:
            # Make sure it's terminated by a slash,
            # so that addroot doesn't discard the last
            # directory component.
            if root[-1] != "/":
                root = root + "/"
            c.addroot(root, add_to_do = 0)

    try:
        c.run()
    except KeyboardInterrupt:
        print "[run interrupted]"

    try:
        c.report()
    except KeyboardInterrupt:
        print "[report interrupted]"

    if c.save_pickle(dumpfile):
        if dumpfile == DUMPFILE:
            print "Use ``%s -R'' to restart." % sys.argv[0]
        else:
            print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
                                                       dumpfile)


def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
    if verbose > 0:
        print "Loading checkpoint from %s ..." % dumpfile
    f = open(dumpfile, "rb")
    c = pickle.load(f)
    f.close()
    if verbose > 0:
        print "Done."
        print "Root:", string.join(c.roots, "\n ")
    return c


class Checker:

    roundsize = ROUNDSIZE

    validflags = tuple(dir())

    def setflags(self, **kw):
        for key in kw.keys():
            if key not in self.validflags:
                raise NameError, "invalid keyword argument: %s" % str(key)
        for key, value in kw.items():
            setattr(self, key, value)

        # Add a name table, so that the name URLs can be checked. Also
        # serves as an implicit cache for which URLs are done.

        # The following are not pickled:
        self.urlopener = MyURLopener()

    def note(self, level, format, *args):
        if self.verbose > level:
            if args:
                format = format % args
            self.message(format)

    def message(self, format, *args):
        if args:
            format = format % args
        print format

    def __getstate__(self):
        return (self.roots, self.todo, self.done, self.bad, self.round)

    def __setstate__(self, state):
        (self.roots, self.todo, self.done, self.bad, self.round) = state
        for root in self.roots:
            self.addrobot(root)
        for url in self.bad.keys():
            self.markerror(url)

    def addroot(self, root, add_to_do = 1):
        if root not in self.roots:
            troot = root
            scheme, netloc, path, params, query, fragment = \
                    urlparse.urlparse(root)
            i = string.rfind(path, "/") + 1
            if 0 < i < len(path):
                path = path[:i]
                troot = urlparse.urlunparse((scheme, netloc, path,
                                             params, query, fragment))
            self.roots.append(troot)
            self.addrobot(root)
            if add_to_do:
                self.newlink((root, ""), ("<root>", root))

    def addrobot(self, root):
        root = urlparse.urljoin(root, "/")
        if self.robots.has_key(root): return
        url = urlparse.urljoin(root, "/robots.txt")
        self.robots[root] = rp = robotparser.RobotFileParser()
        self.note(2, "Parsing %s", url)
        rp.debug = self.verbose > 3
        rp.set_url(url)
        try:
            rp.read()
        except IOError, msg:
            self.note(1, "I/O error parsing %s: %s", url, msg)

    def run(self):
        while self.todo:
            self.round = self.round + 1
            self.note(0, "\nRound %d (%s)\n", self.round, self.status())
            urls = self.todo.keys()
            urls.sort()
            del urls[self.roundsize:]
            for url in urls:
                self.dopage(url)

    def status(self):
        return "%d total, %d to do, %d done, %d bad" % (
            len(self.todo)+len(self.done),
            len(self.todo), len(self.done),
            len(self.bad))

    def report(self):
        if not self.todo: s = "Final"
        else: s = "Interim"
        self.message("%s Report (%s)", s, self.status())
        self.report_errors()

    def report_errors(self):
        if not self.bad:
            self.message("\nNo errors")
            return
        self.message("\nError Report:")
        sources = self.errors.keys()
        sources.sort()
        for source in sources:
            triples = self.errors[source]
            if len(triples) > 1:
                self.message("%d Errors in %s", len(triples), source)
            else:
                self.message("Error in %s", source)
            # Call self.format_url() instead of referring
            # to the URL directly, since the URLs in these
            # triples are now (URL, fragment) pairs. The value
            # of the "source" variable comes from the list of
            # origins, and is a URL, not a pair.
            for url, rawlink, msg in triples:
                if rawlink != self.format_url(url): s = " (%s)" % rawlink
                else: s = ""
                self.message(" HREF %s%s\n msg %s",
                             self.format_url(url), s, msg)

    def dopage(self, url_pair):

        # All printing of URLs uses format_url(); argument changed to
        # url_pair for clarity.
        if self.verbose > 1:
            if self.verbose > 2:
                self.show("Check ", self.format_url(url_pair),
                          " from", self.todo[url_pair])
            else:
                self.message("Check %s", self.format_url(url_pair))
        url, local_fragment = url_pair
        if local_fragment and self.nonames:
            self.markdone(url_pair)
            return
        page = self.getpage(url_pair)
        if page:
            # Store the page which corresponds to this URL.
            self.name_table[url] = page
            # If there is a fragment in this url_pair, and it's not
            # in the list of names for the page, call setbad(), since
            # it's a missing anchor.
            if local_fragment and local_fragment not in page.getnames():
                self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
            for info in page.getlinkinfos():
                # getlinkinfos() now returns the fragment as well,
                # and we store that fragment here in the "todo" dictionary.
                link, rawlink, fragment = info
                # However, we don't want the fragment as the origin, since
                # the origin is logically a page.
                origin = url, rawlink
                self.newlink((link, fragment), origin)
        else:
            # If no page has been created yet, we want to
            # record that fact.
            self.name_table[url_pair[0]] = None
        self.markdone(url_pair)

    def newlink(self, url, origin):
        if self.done.has_key(url):
            self.newdonelink(url, origin)
        else:
            self.newtodolink(url, origin)

    def newdonelink(self, url, origin):
        if origin not in self.done[url]:
            self.done[url].append(origin)

        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        self.note(3, " Done link %s", self.format_url(url))

        # Make sure that if it's bad, the origin gets added.
        if self.bad.has_key(url):
            source, rawlink = origin
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def newtodolink(self, url, origin):
        # Call self.format_url(), since the URL here
        # is now a (URL, fragment) pair.
        if self.todo.has_key(url):
            if origin not in self.todo[url]:
                self.todo[url].append(origin)
            self.note(3, " Seen todo link %s", self.format_url(url))
        else:
            self.todo[url] = [origin]
            self.note(3, " New todo link %s", self.format_url(url))

    def format_url(self, url):
        link, fragment = url
        if fragment: return link + "#" + fragment
        else: return link
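
    # For example (illustrative values), a (URL, fragment) pair like
    # ("http://host/page.html", "sec2") is formatted as
    # "http://host/page.html#sec2"; an empty fragment yields the bare URL.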

    def markdone(self, url):
        self.done[url] = self.todo[url]
        del self.todo[url]
        self.changed = 1

    def inroots(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                return self.isallowed(root, url)
        return 0

    def isallowed(self, root, url):
        root = urlparse.urljoin(root, "/")
        return self.robots[root].can_fetch(AGENTNAME, url)
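
    # For example, with a root of "http://host/dir/", the robots table is
    # keyed by "http://host/" (filled in by addrobot() above), and can_fetch()
    # decides whether the "webchecker" agent may fetch the given url.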

    def getpage(self, url_pair):
        # Incoming argument name is a (URL, fragment) pair.
        # The page may have been cached in the name_table variable.
        url, fragment = url_pair
        if self.name_table.has_key(url):
            return self.name_table[url]

        if url[:7] == 'mailto:' or url[:5] == 'news:':
            self.note(1, " Not checking mailto/news URL")
            return None
        isint = self.inroots(url)

        # Ensure that openpage gets the URL pair to
        # print out its error message and record the error pair
        # correctly.
        if not isint:
            if not self.checkext:
                self.note(1, " Not checking ext link")
                return None
            f = self.openpage(url_pair)
            if f:
                self.safeclose(f)
            return None
        text, nurl = self.readhtml(url_pair)
        if text:
            if nurl != url:
                self.note(1, " Redirected to %s", nurl)
                url = nurl
            return Page(text, url, maxpage=self.maxpage, checker=self)
        return None

    # These next three functions take (URL, fragment) pairs as
    # arguments, so that openpage() receives the appropriate tuple to
    # record error messages.
    def readhtml(self, url_pair):
        url, fragment = url_pair
        text = None
        f, url = self.openhtml(url_pair)
        if f:
            text = f.read()
            f.close()
        return text, url

    def openhtml(self, url_pair):
        url, fragment = url_pair
        f = self.openpage(url_pair)
        if f:
            url = f.geturl()
            info = f.info()
            if not self.checkforhtml(info, url):
                self.safeclose(f)
                f = None
        return f, url

    def openpage(self, url_pair):
        url, fragment = url_pair
        try:
            return self.urlopener.open(url)
        except IOError, msg:
            msg = self.sanitize(msg)
            self.note(0, "Error %s", msg)
            if self.verbose > 0:
                self.show(" HREF ", url, " from", self.todo[url_pair])
            self.setbad(url_pair, msg)
            return None

    def checkforhtml(self, info, url):
        if info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            if url[-1:] == "/":
                return 1
            ctype, encoding = mimetypes.guess_type(url)
        if ctype == 'text/html':
            return 1
        else:
            self.note(1, " Not HTML, mime type %s", ctype)
            return 0
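
    # For example, mimetypes.guess_type("http://host/logo.gif") returns
    # ('image/gif', None), so such a link is opened to verify that it exists
    # but its contents are never parsed for further links.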

    def setgood(self, url):
        if self.bad.has_key(url):
            del self.bad[url]
            self.changed = 1
            self.note(0, "(Clear previously seen error)")

    def setbad(self, url, msg):
        if self.bad.has_key(url) and self.bad[url] == msg:
            self.note(0, "(Seen this error before)")
            return
        self.bad[url] = msg
        self.changed = 1
        self.markerror(url)

    def markerror(self, url):
        try:
            origins = self.todo[url]
        except KeyError:
            origins = self.done[url]
        for source, rawlink in origins:
            triple = url, rawlink, self.bad[url]
            self.seterror(source, triple)

    def seterror(self, url, triple):
        try:
            # Because of the way the URLs are now processed, I need to
            # check to make sure the URL hasn't been entered in the
            # error list. The first element of the triple here is a
            # (URL, fragment) pair, but the URL key is not, since it's
            # from the list of origins.
            if triple not in self.errors[url]:
                self.errors[url].append(triple)
        except KeyError:
            self.errors[url] = [triple]

    # The following used to be toplevel functions; they have been
    # changed into methods so they can be overridden in subclasses.

    def show(self, p1, link, p2, origins):
        self.message("%s %s", p1, link)
        for source, rawlink in origins:
            if rawlink != link: s = " (%s)" % rawlink
            else: s = ""
            self.message("%s %s%s", p2, source, s)

    def sanitize(self, msg):
        if isinstance(IOError, ClassType) and isinstance(msg, IOError):
            # Do the other branch recursively
            msg.args = self.sanitize(msg.args)
        elif isinstance(msg, TupleType):
            if len(msg) >= 4 and msg[0] == 'http error' and \
               isinstance(msg[3], InstanceType):
                # Remove the Message instance -- it may contain
                # a file object which prevents pickling.
                msg = msg[:3] + msg[4:]
        return msg

    def safeclose(self, f):
        try:
            url = f.geturl()
        except AttributeError:
            pass
        else:
            if url[:4] == 'ftp:' or url[:7] == 'file://':
                # Apparently ftp connections don't like to be closed
                # prematurely...
                del f
                return
        f.close()

    def save_pickle(self, dumpfile=DUMPFILE):
        if not self.changed:
            self.note(0, "\nNo need to save checkpoint")
        elif not dumpfile:
            self.note(0, "No dumpfile, won't save checkpoint")
        else:
            self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
            newfile = dumpfile + ".new"
            f = open(newfile, "wb")
            pickle.dump(self, f)
            f.close()
            os.rename(newfile, dumpfile)
            self.note(0, "Done.")
            return 1


class Page:

    def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
        self.text = text
        self.url = url
        self.verbose = verbose
        self.maxpage = maxpage
        self.checker = checker

        # The parsing of the page is done in the __init__() routine in
        # order to initialize the list of names the file
        # contains. Stored the parser in an instance variable. Passed
        # the URL to MyHTMLParser().
        size = len(self.text)
        if size > self.maxpage:
            self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
            self.parser = None
            return
        self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
        self.parser = MyHTMLParser(url, verbose=self.verbose,
                                   checker=self.checker)
        self.parser.feed(self.text)

    def note(self, level, msg, *args):
        if self.checker:
            apply(self.checker.note, (level, msg) + args)
        else:
            if self.verbose >= level:
                if args:
                    msg = msg % args
                print msg

    # Method to retrieve names.
    def getnames(self):
        if self.parser:
            return self.parser.names
        else:
            return []

    def getlinkinfos(self):
        # File reading is done in __init__() routine. Store parser in
        # local variable to indicate success of parsing.

        # If no parser was stored, fail.
        if not self.parser: return []

        rawlinks = self.parser.getlinks()
        base = urlparse.urljoin(self.url, self.parser.getbase() or "")
        infos = []
        for rawlink in rawlinks:
            t = urlparse.urlparse(rawlink)
            # DON'T DISCARD THE FRAGMENT! Instead, include
            # it in the tuples which are returned. See Checker.dopage().
            fragment = t[-1]
            t = t[:-1] + ('',)
            rawlink = urlparse.urlunparse(t)
            link = urlparse.urljoin(base, rawlink)
            infos.append((link, rawlink, fragment))
        return infos
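
    # Each triple is (resolved absolute link, link as written minus any
    # fragment, fragment); e.g. <A HREF="other.html#top"> on a page at
    # http://host/dir/page.html yields roughly
    # ("http://host/dir/other.html", "other.html", "top").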


class MyStringIO(StringIO.StringIO):

    def __init__(self, url, info):
        StringIO.StringIO.__init__(self)


class MyURLopener(urllib.FancyURLopener):

    http_error_default = urllib.URLopener.http_error_default

    def __init__(*args):
        self = args[0]
        apply(urllib.FancyURLopener.__init__, args)
        self.addheaders = [
            ('User-agent', 'Python-webchecker/%s' % __version__),
            ]

    def http_error_401(self, url, fp, errcode, errmsg, headers):
        return None

    def open_file(self, url):
        path = urllib.url2pathname(urllib.unquote(url))
        if os.path.isdir(path):
            if path[-1] != os.sep:
                url = url + '/'
            indexpath = os.path.join(path, "index.html")
            if os.path.exists(indexpath):
                return self.open_file(url + "index.html")
            try:
                names = os.listdir(path)
            except os.error, msg:
                raise IOError, msg, sys.exc_traceback
            names.sort()
            s = MyStringIO("file:"+url, {'content-type': 'text/html'})
            s.write('<BASE HREF="file:%s">\n' %
                    urllib.quote(os.path.join(path, "")))
            for name in names:
                q = urllib.quote(name)
                s.write('<A HREF="%s">%s</A>\n' % (q, q))
            s.seek(0)
            return s
        return urllib.FancyURLopener.open_file(self, url)


class MyHTMLParser(sgmllib.SGMLParser):

    def __init__(self, url, verbose=VERBOSE, checker=None):
        self.myverbose = verbose # now unused
        self.checker = checker
        self.base = None
        self.links = {}
        self.names = []
        self.url = url
        sgmllib.SGMLParser.__init__(self)

    def start_a(self, attributes):
        self.link_attr(attributes, 'href')

        # We must rescue the NAME
        # attributes from the anchor, in order to
        # cache the internal anchors which are made
        # available in the page.
        for name, value in attributes:
            if name == "name":
                if value in self.names:
                    self.checker.message("WARNING: duplicate name %s in %s",
                                         value, self.url)
                else: self.names.append(value)
                break

    def end_a(self): pass

    def do_area(self, attributes):
        self.link_attr(attributes, 'href')

    def do_img(self, attributes):
        self.link_attr(attributes, 'src', 'lowsrc')

    def do_frame(self, attributes):
        self.link_attr(attributes, 'src')

    def link_attr(self, attributes, *args):
        for name, value in attributes:
            if name in args:
                if value: value = string.strip(value)
                if value: self.links[value] = None
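
    # For example, do_img() above calls link_attr(attributes, 'src', 'lowsrc'),
    # so whichever of those attributes is present and non-blank is recorded
    # in self.links as a link to be checked.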

    def do_base(self, attributes):
        for name, value in attributes:
            if name == 'href':
                if value: value = string.strip(value)
                if value:
                    if self.checker:
                        self.checker.note(1, " Base %s", value)
                    self.base = value

    def getlinks(self):
        return self.links.keys()


if __name__ == '__main__':
    main()