Tools/webchecker/webchecker.py

   1 #! /usr/bin/env python
   2
   3 # Original code by Guido van Rossum; extensive changes by Sam Bayer,
   4 # including code to check URL fragments.
   5
   6 """Web tree checker.
   7
   8 This utility is handy to check a subweb of the world-wide web for
   9 errors.  A subweb is specified by giving one or more ``root URLs''; a
  10 page belongs to the subweb if one of the root URLs is an initial
  11 prefix of it.
  12
  13 File URL extension:
  14
  15 In order to easy the checking of subwebs via the local file system,
  16 the interpretation of ``file:'' URLs is extended to mimic the behavior
  17 of your average HTTP daemon: if a directory pathname is given, the
  18 file index.html in that directory is returned if it exists, otherwise
  19 a directory listing is returned.  Now, you can point webchecker to the
  20 document tree in the local file system of your HTTP daemon, and have
  21 most of it checked.  In fact the default works this way if your local
  22 web tree is located at /usr/local/etc/httpd/htdpcs (the default for
  23 the NCSA HTTP daemon and probably others).
  24
  25 Report printed:
  26
  27 When done, it reports pages with bad links within the subweb.  When
  28 interrupted, it reports for the pages that it has checked already.
  29
  30 In verbose mode, additional messages are printed during the
  31 information gathering phase.  By default, it prints a summary of its
  32 work status every 50 URLs (adjustable with the -r option), and it
  33 reports errors as they are encountered.  Use the -q option to disable
  34 this output.
  35
  36 Checkpoint feature:
  37
  38 Whether interrupted or not, it dumps its state (a Python pickle) to a
  39 checkpoint file and the -R option allows it to restart from the
  40 checkpoint (assuming that the pages on the subweb that were already
  41 processed haven't changed).  Even when it has run till completion, -R
  42 can still be useful -- it will print the reports again, and -Rq prints
  43 the errors only.  In this case, the checkpoint file is not written
  44 again.  The checkpoint file can be set with the -d option.
  45
  46 The checkpoint file is written as a Python pickle.  Remember that
  47 Python's pickle module is currently quite slow.  Give it the time it
  48 needs to load and save the checkpoint file.  When interrupted while
  49 writing the checkpoint file, the old checkpoint file is not
  50 overwritten, but all work done in the current run is lost.
  51
  52 Miscellaneous:
  53
  54 - You may find the (Tk-based) GUI version easier to use.  See wcgui.py.
  55
  56 - Webchecker honors the "robots.txt" convention.  Thanks to Skip
  57 Montanaro for his robotparser.py module (included in this directory)!
  58 The agent name is hardwired to "webchecker".  URLs that are disallowed
  59 by the robots.txt file are reported as external URLs.
  60
  61 - Because the SGML parser is a bit slow, very large SGML files are
  62 skipped.  The size limit can be set with the -m option.
  63
  64 - When the server or protocol does not tell us a file's type, we guess
  65 it based on the URL's suffix.  The mimetypes.py module (also in this
  66 directory) has a built-in table mapping most currently known suffixes,
  67 and in addition attempts to read the mime.types configuration files in
  68 the default locations of Netscape and the NCSA HTTP daemon.
  69
  70 - We follow links indicated by <A>, <FRAME> and <IMG> tags.  We also
  71 honor the <BASE> tag.
  72
  73 - We now check internal NAME anchor links, as well as toplevel links.
  74
  75 - Checking external links is now done by default; use -x to *disable*
  76 this feature.  External links are now checked during normal
  77 processing.  (XXX The status of a checked link could be categorized
  78 better.  Later...)
  79
  80 - If external links are not checked, you can use the -t flag to
  81 provide specific overrides to -x.
  82
  83 Usage: webchecker.py [option] ... [rooturl] ...
  84
  85 Options:
  86
  87 -R        -- restart from checkpoint file
  88 -d file   -- checkpoint filename (default %(DUMPFILE)s)
  89 -m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
  90 -n        -- reports only, no checking (use with -R)
  91 -q        -- quiet operation (also suppresses external links report)
  92 -r number -- number of links processed per round (default %(ROUNDSIZE)d)
  93 -t root   -- specify root dir which should be treated as internal (can repeat)
  94 -v        -- verbose operation; repeating -v will increase verbosity
  95 -x        -- don't check external links (these are often slow to check)
  96 -a        -- don't check name anchors
  97
  98 Arguments:
  99
 100 rooturl   -- URL to start checking
 101              (default %(DEFROOT)s)
 102
 103 """
 104
 105
 106 __version__ = "$Revision$"
 107
 108
 109 import sys
 110 import os
 111 from types import *
 112 import string
 113 import StringIO
 114 import getopt
 115 import pickle
 116
 117 import urllib
 118 import urlparse
 119 import sgmllib
 120 import cgi
 121
 122 import mimetypes
 123 import robotparser
 124
 125 # Extract real version number if necessary
 126 if __version__[0] == '$':
 127     _v = string.split(__version__)
 128     if len(_v) == 3:
 129         __version__ = _v[1]
 130
 131
 132 # Tunable parameters
 133 DEFROOT = "file:/usr/local/etc/httpd/htdocs/"   # Default root URL
 134 CHECKEXT = 1                            # Check external references (1 deep)
 135 VERBOSE = 1                             # Verbosity level (0-3)
 136 MAXPAGE = 150000                        # Ignore files bigger than this
 137 ROUNDSIZE = 50                          # Number of links processed per round
 138 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
 139 AGENTNAME = "webchecker"                # Agent name for robots.txt parser
 140 NONAMES = 0                             # Force name anchor checking
 141
 142
 143 # Global variables
 144
 145
 146 def main():
 147     checkext = CHECKEXT
 148     verbose = VERBOSE
 149     maxpage = MAXPAGE
 150     roundsize = ROUNDSIZE
 151     dumpfile = DUMPFILE
 152     restart = 0
 153     norun = 0
 154
 155     try:
 156         opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:nqr:t:vxa')
 157     except getopt.error, msg:
 158         sys.stdout = sys.stderr
 159         print msg
 160         print __doc__%globals()
 161         sys.exit(2)
 162
 163     # The extra_roots variable collects extra roots.
 164     extra_roots = []
 165     nonames = NONAMES
 166
 167     for o, a in opts:
 168         if o == '-R':
 169             restart = 1
 170         if o == '-d':
 171             dumpfile = a
 172         if o == '-m':
 173             maxpage = string.atoi(a)
 174         if o == '-n':
 175             norun = 1
 176         if o == '-q':
 177             verbose = 0
 178         if o == '-r':
 179             roundsize = string.atoi(a)
 180         if o == '-t':
 181             extra_roots.append(a)
 182         if o == '-a':
 183             nonames = not nonames
 184         if o == '-v':
 185             verbose = verbose + 1
 186         if o == '-x':
 187             checkext = not checkext
 188
 189     if verbose > 0:
 190         print AGENTNAME, "version", __version__
 191
 192     if restart:
 193         c = load_pickle(dumpfile=dumpfile, verbose=verbose)
 194     else:
 195         c = Checker()
 196
 197     c.setflags(checkext=checkext, verbose=verbose,
 198                maxpage=maxpage, roundsize=roundsize,
 199                nonames=nonames
 200                )
 201
 202     if not restart and not args:
 203         args.append(DEFROOT)
 204
 205     for arg in args:
 206         c.addroot(arg)
 207
 208     # The -t flag is only needed if external links are not to be
 209     # checked. So -t values are ignored unless -x was specified.
 210     if not checkext:
 211         for root in extra_roots:
 212             # Make sure it's terminated by a slash,
 213             # so that addroot doesn't discard the last
 214             # directory component.
 215             if root[-1] != "/":
 216                 root = root + "/"
 217             c.addroot(root, add_to_do = 0)
 218
 219     try:
 220
 221         if not norun:
 222             try:
 223                 c.run()
 224             except KeyboardInterrupt:
 225                 if verbose > 0:
 226                     print "[run interrupted]"
 227
 228         try:
 229             c.report()
 230         except KeyboardInterrupt:
 231             if verbose > 0:
 232                 print "[report interrupted]"
 233
 234     finally:
 235         if c.save_pickle(dumpfile):
 236             if dumpfile == DUMPFILE:
 237                 print "Use ``%s -R'' to restart." % sys.argv[0]
 238             else:
 239                 print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
 240                                                            dumpfile)
 241
 242
 243 def load_pickle(dumpfile=DUMPFILE, verbose=VERBOSE):
 244     if verbose > 0:
 245         print "Loading checkpoint from %s ..." % dumpfile
 246     f = open(dumpfile, "rb")
 247     c = pickle.load(f)
 248     f.close()
 249     if verbose > 0:
 250         print "Done."
 251         print "Root:", string.join(c.roots, "\n      ")
 252     return c
 253
 254
 255 class Checker:
 256
 257     checkext = CHECKEXT
 258     verbose = VERBOSE
 259     maxpage = MAXPAGE
 260     roundsize = ROUNDSIZE
 261     nonames = NONAMES
 262
 263     validflags = tuple(dir())
 264
 265     def __init__(self):
 266         self.reset()
 267
 268     def setflags(self, **kw):
 269         for key in kw.keys():
 270             if key not in self.validflags:
 271                 raise NameError, "invalid keyword argument: %s" % str(key)
 272         for key, value in kw.items():
 273             setattr(self, key, value)
 274
 275     def reset(self):
 276         self.roots = []
 277         self.todo = {}
 278         self.done = {}
 279         self.bad = {}
 280
 281         # Add a name table, so that the name URLs can be checked. Also
 282         # serves as an implicit cache for which URLs are done.
 283         self.name_table = {}
 284
 285         self.round = 0
 286         # The following are not pickled:
 287         self.robots = {}
 288         self.errors = {}
 289         self.urlopener = MyURLopener()
 290         self.changed = 0
 291
 292     def note(self, level, format, *args):
 293         if self.verbose > level:
 294             if args:
 295                 format = format%args
 296             self.message(format)
 297
 298     def message(self, format, *args):
 299         if args:
 300             format = format%args
 301         print format
 302
 303     def __getstate__(self):
 304         return (self.roots, self.todo, self.done, self.bad, self.round)
 305
 306     def __setstate__(self, state):
 307         self.reset()
 308         (self.roots, self.todo, self.done, self.bad, self.round) = state
 309         for root in self.roots:
 310             self.addrobot(root)
 311         for url in self.bad.keys():
 312             self.markerror(url)
 313
 314     def addroot(self, root, add_to_do = 1):
 315         if root not in self.roots:
 316             troot = root
 317             scheme, netloc, path, params, query, fragment = \
 318                     urlparse.urlparse(root)
 319             i = string.rfind(path, "/") + 1
 320             if 0 < i < len(path):
 321                 path = path[:i]
 322                 troot = urlparse.urlunparse((scheme, netloc, path,
 323                                              params, query, fragment))
 324             self.roots.append(troot)
 325             self.addrobot(root)
 326             if add_to_do:
 327                 self.newlink((root, ""), ("<root>", root))
 328
 329     def addrobot(self, root):
 330         root = urlparse.urljoin(root, "/")
 331         if self.robots.has_key(root): return
 332         url = urlparse.urljoin(root, "/robots.txt")
 333         self.robots[root] = rp = robotparser.RobotFileParser()
 334         self.note(2, "Parsing %s", url)
 335         rp.debug = self.verbose > 3
 336         rp.set_url(url)
 337         try:
 338             rp.read()
 339         except (OSError, IOError), msg:
 340             self.note(1, "I/O error parsing %s: %s", url, msg)
 341
 342     def run(self):
 343         while self.todo:
 344             self.round = self.round + 1
 345             self.note(0, "\nRound %d (%s)\n", self.round, self.status())
 346             urls = self.todo.keys()
 347             urls.sort()
 348             del urls[self.roundsize:]
 349             for url in urls:
 350                 self.dopage(url)
 351
 352     def status(self):
 353         return "%d total, %d to do, %d done, %d bad" % (
 354             len(self.todo)+len(self.done),
 355             len(self.todo), len(self.done),
 356             len(self.bad))
 357
 358     def report(self):
 359         self.message("")
 360         if not self.todo: s = "Final"
 361         else: s = "Interim"
 362         self.message("%s Report (%s)", s, self.status())
 363         self.report_errors()
 364
 365     def report_errors(self):
 366         if not self.bad:
 367             self.message("\nNo errors")
 368             return
 369         self.message("\nError Report:")
 370         sources = self.errors.keys()
 371         sources.sort()
 372         for source in sources:
 373             triples = self.errors[source]
 374             self.message("")
 375             if len(triples) > 1:
 376                 self.message("%d Errors in %s", len(triples), source)
 377             else:
 378                 self.message("Error in %s", source)
 379             # Call self.format_url() instead of referring
 380             # to the URL directly, since the URLs in these
 381             # triples is now a (URL, fragment) pair. The value
 382             # of the "source" variable comes from the list of
 383             # origins, and is a URL, not a pair.
 384             for url, rawlink, msg in triples:
 385                 if rawlink != self.format_url(url): s = " (%s)" % rawlink
 386                 else: s = ""
 387                 self.message("  HREF %s%s\n    msg %s",
 388                              self.format_url(url), s, msg)
 389
 390     def dopage(self, url_pair):
 391
 392         # All printing of URLs uses format_url(); argument changed to
 393         # url_pair for clarity.
 394         if self.verbose > 1:
 395             if self.verbose > 2:
 396                 self.show("Check ", self.format_url(url_pair),
 397                           "  from", self.todo[url_pair])
 398             else:
 399                 self.message("Check %s", self.format_url(url_pair))
 400         url, local_fragment = url_pair
 401         if local_fragment and self.nonames:
 402             self.markdone(url_pair)
 403             return
 404         page = self.getpage(url_pair)
 405         if page:
 406             # Store the page which corresponds to this URL.
 407             self.name_table[url] = page
 408             # If there is a fragment in this url_pair, and it's not
 409             # in the list of names for the page, call setbad(), since
 410             # it's a missing anchor.
 411             if local_fragment and local_fragment not in page.getnames():
 412                 self.setbad(url_pair, ("Missing name anchor `%s'" % local_fragment))
 413             for info in page.getlinkinfos():
 414                 # getlinkinfos() now returns the fragment as well,
 415                 # and we store that fragment here in the "todo" dictionary.
 416                 link, rawlink, fragment = info
 417                 # However, we don't want the fragment as the origin, since
 418                 # the origin is logically a page.
 419                 origin = url, rawlink
 420                 self.newlink((link, fragment), origin)
 421         else:
 422             # If no page has been created yet, we want to
 423             # record that fact.
 424             self.name_table[url_pair[0]] = None
 425         self.markdone(url_pair)
 426
 427     def newlink(self, url, origin):
 428         if self.done.has_key(url):
 429             self.newdonelink(url, origin)
 430         else:
 431             self.newtodolink(url, origin)
 432
 433     def newdonelink(self, url, origin):
 434         if origin not in self.done[url]:
 435             self.done[url].append(origin)
 436
 437         # Call self.format_url(), since the URL here
 438         # is now a (URL, fragment) pair.
 439         self.note(3, "  Done link %s", self.format_url(url))
 440
 441         # Make sure that if it's bad, that the origin gets added.
 442         if self.bad.has_key(url):
 443             source, rawlink = origin
 444             triple = url, rawlink, self.bad[url]
 445             self.seterror(source, triple)
 446
 447     def newtodolink(self, url, origin):
 448         # Call self.format_url(), since the URL here
 449         # is now a (URL, fragment) pair.
 450         if self.todo.has_key(url):
 451             if origin not in self.todo[url]:
 452                 self.todo[url].append(origin)
 453             self.note(3, "  Seen todo link %s", self.format_url(url))
 454         else:
 455             self.todo[url] = [origin]
 456             self.note(3, "  New todo link %s", self.format_url(url))
 457
 458     def format_url(self, url):
 459         link, fragment = url
 460         if fragment: return link + "#" + fragment
 461         else: return link
 462
 463     def markdone(self, url):
 464         self.done[url] = self.todo[url]
 465         del self.todo[url]
 466         self.changed = 1
 467
 468     def inroots(self, url):
 469         for root in self.roots:
 470             if url[:len(root)] == root:
 471                 return self.isallowed(root, url)
 472         return 0
 473
 474     def isallowed(self, root, url):
 475         root = urlparse.urljoin(root, "/")
 476         return self.robots[root].can_fetch(AGENTNAME, url)
 477
 478     def getpage(self, url_pair):
 479         # Incoming argument name is a (URL, fragment) pair.
 480         # The page may have been cached in the name_table variable.
 481         url, fragment = url_pair
 482         if self.name_table.has_key(url):
 483             return self.name_table[url]
 484
 485         scheme, path = urllib.splittype(url)
 486         if scheme in ('mailto', 'news', 'javascript', 'telnet'):
 487             self.note(1, " Not checking %s URL" % scheme)
 488             return None
 489         isint = self.inroots(url)
 490
 491         # Ensure that openpage gets the URL pair to
 492         # print out its error message and record the error pair
 493         # correctly.
 494         if not isint:
 495             if not self.checkext:
 496                 self.note(1, " Not checking ext link")
 497                 return None
 498             f = self.openpage(url_pair)
 499             if f:
 500                 self.safeclose(f)
 501             return None
 502         text, nurl = self.readhtml(url_pair)
 503
 504         if nurl != url:
 505             self.note(1, " Redirected to %s", nurl)
 506             url = nurl
 507         if text:
 508             return Page(text, url, maxpage=self.maxpage, checker=self)
 509
 510     # These next three functions take (URL, fragment) pairs as
 511     # arguments, so that openpage() receives the appropriate tuple to
 512     # record error messages.
 513     def readhtml(self, url_pair):
 514         url, fragment = url_pair
 515         text = None
 516         f, url = self.openhtml(url_pair)
 517         if f:
 518             text = f.read()
 519             f.close()
 520         return text, url
 521
 522     def openhtml(self, url_pair):
 523         url, fragment = url_pair
 524         f = self.openpage(url_pair)
 525         if f:
 526             url = f.geturl()
 527             info = f.info()
 528             if not self.checkforhtml(info, url):
 529                 self.safeclose(f)
 530                 f = None
 531         return f, url
 532
 533     def openpage(self, url_pair):
 534         url, fragment = url_pair
 535         try:
 536             return self.urlopener.open(url)
 537         except (OSError, IOError), msg:
 538             msg = self.sanitize(msg)
 539             self.note(0, "Error %s", msg)
 540             if self.verbose > 0:
 541                 self.show(" HREF ", url, "  from", self.todo[url_pair])
 542             self.setbad(url_pair, msg)
 543             return None
 544
 545     def checkforhtml(self, info, url):
 546         if info.has_key('content-type'):
 547             ctype = string.lower(cgi.parse_header(info['content-type'])[0])
 548         else:
 549             if url[-1:] == "/":
 550                 return 1
 551             ctype, encoding = mimetypes.guess_type(url)
 552         if ctype == 'text/html':
 553             return 1
 554         else:
 555             self.note(1, " Not HTML, mime type %s", ctype)
 556             return 0
 557
 558     def setgood(self, url):
 559         if self.bad.has_key(url):
 560             del self.bad[url]
 561             self.changed = 1
 562             self.note(0, "(Clear previously seen error)")
 563
 564     def setbad(self, url, msg):
 565         if self.bad.has_key(url) and self.bad[url] == msg:
 566             self.note(0, "(Seen this error before)")
 567             return
 568         self.bad[url] = msg
 569         self.changed = 1
 570         self.markerror(url)
 571
 572     def markerror(self, url):
 573         try:
 574             origins = self.todo[url]
 575         except KeyError:
 576             origins = self.done[url]
 577         for source, rawlink in origins:
 578             triple = url, rawlink, self.bad[url]
 579             self.seterror(source, triple)
 580
 581     def seterror(self, url, triple):
 582         try:
 583             # Because of the way the URLs are now processed, I need to
 584             # check to make sure the URL hasn't been entered in the
 585             # error list.  The first element of the triple here is a
 586             # (URL, fragment) pair, but the URL key is not, since it's
 587             # from the list of origins.
 588             if triple not in self.errors[url]:
 589                 self.errors[url].append(triple)
 590         except KeyError:
 591             self.errors[url] = [triple]
 592
 593     # The following used to be toplevel functions; they have been
 594     # changed into methods so they can be overridden in subclasses.
 595
 596     def show(self, p1, link, p2, origins):
 597         self.message("%s %s", p1, link)
 598         i = 0
 599         for source, rawlink in origins:
 600             i = i+1
 601             if i == 2:
 602                 p2 = ' '*len(p2)
 603             if rawlink != link: s = " (%s)" % rawlink
 604             else: s = ""
 605             self.message("%s %s%s", p2, source, s)
 606
 607     def sanitize(self, msg):
 608         if isinstance(IOError, ClassType) and isinstance(msg, IOError):
 609             # Do the other branch recursively
 610             msg.args = self.sanitize(msg.args)
 611         elif isinstance(msg, TupleType):
 612             if len(msg) >= 4 and msg[0] == 'http error' and \
 613                isinstance(msg[3], InstanceType):
 614                 # Remove the Message instance -- it may contain
 615                 # a file object which prevents pickling.
 616                 msg = msg[:3] + msg[4:]
 617         return msg
 618
 619     def safeclose(self, f):
 620         try:
 621             url = f.geturl()
 622         except AttributeError:
 623             pass
 624         else:
 625             if url[:4] == 'ftp:' or url[:7] == 'file://':
 626                 # Apparently ftp connections don't like to be closed
 627                 # prematurely...
 628                 text = f.read()
 629         f.close()
 630
 631     def save_pickle(self, dumpfile=DUMPFILE):
 632         if not self.changed:
 633             self.note(0, "\nNo need to save checkpoint")
 634         elif not dumpfile:
 635             self.note(0, "No dumpfile, won't save checkpoint")
 636         else:
 637             self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
 638             newfile = dumpfile + ".new"
 639             f = open(newfile, "wb")
 640             pickle.dump(self, f)
 641             f.close()
 642             try:
 643                 os.unlink(dumpfile)
 644             except os.error:
 645                 pass
 646             os.rename(newfile, dumpfile)
 647             self.note(0, "Done.")
 648             return 1
 649
 650
 651 class Page:
 652
 653     def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
 654         self.text = text
 655         self.url = url
 656         self.verbose = verbose
 657         self.maxpage = maxpage
 658         self.checker = checker
 659
 660         # The parsing of the page is done in the __init__() routine in
 661         # order to initialize the list of names the file
 662         # contains. Stored the parser in an instance variable. Passed
 663         # the URL to MyHTMLParser().
 664         size = len(self.text)
 665         if size > self.maxpage:
 666             self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
 667             self.parser = None
 668             return
 669         self.checker.note(2, "  Parsing %s (%d bytes)", self.url, size)
 670         self.parser = MyHTMLParser(url, verbose=self.verbose,
 671                                    checker=self.checker)
 672         self.parser.feed(self.text)
 673         self.parser.close()
 674
 675     def note(self, level, msg, *args):
 676         if self.checker:
 677             apply(self.checker.note, (level, msg) + args)
 678         else:
 679             if self.verbose >= level:
 680                 if args:
 681                     msg = msg%args
 682                 print msg
 683
 684     # Method to retrieve names.
 685     def getnames(self):
 686         if self.parser:
 687             return self.parser.names
 688         else:
 689             return []
 690
 691     def getlinkinfos(self):
 692         # File reading is done in __init__() routine.  Store parser in
 693         # local variable to indicate success of parsing.
 694
 695         # If no parser was stored, fail.
 696         if not self.parser: return []
 697
 698         rawlinks = self.parser.getlinks()
 699         base = urlparse.urljoin(self.url, self.parser.getbase() or "")
 700         infos = []
 701         for rawlink in rawlinks:
 702             t = urlparse.urlparse(rawlink)
 703             # DON'T DISCARD THE FRAGMENT! Instead, include
 704             # it in the tuples which are returned. See Checker.dopage().
 705             fragment = t[-1]
 706             t = t[:-1] + ('',)
 707             rawlink = urlparse.urlunparse(t)
 708             link = urlparse.urljoin(base, rawlink)
 709             infos.append((link, rawlink, fragment))
 710
 711         return infos
 712
 713
 714 class MyStringIO(StringIO.StringIO):
 715
 716     def __init__(self, url, info):
 717         self.__url = url
 718         self.__info = info
 719         StringIO.StringIO.__init__(self)
 720
 721     def info(self):
 722         return self.__info
 723
 724     def geturl(self):
 725         return self.__url
 726
 727
 728 class MyURLopener(urllib.FancyURLopener):
 729
 730     http_error_default = urllib.URLopener.http_error_default
 731
 732     def __init__(*args):
 733         self = args[0]
 734         apply(urllib.FancyURLopener.__init__, args)
 735         self.addheaders = [
 736             ('User-agent', 'Python-webchecker/%s' % __version__),
 737             ]
 738
 739     def http_error_401(self, url, fp, errcode, errmsg, headers):
 740         return None
 741
 742     def open_file(self, url):
 743         path = urllib.url2pathname(urllib.unquote(url))
 744         if os.path.isdir(path):
 745             if path[-1] != os.sep:
 746                 url = url + '/'
 747             indexpath = os.path.join(path, "index.html")
 748             if os.path.exists(indexpath):
 749                 return self.open_file(url + "index.html")
 750             try:
 751                 names = os.listdir(path)
 752             except os.error, msg:
 753                 raise IOError, msg, sys.exc_traceback
 754             names.sort()
 755             s = MyStringIO("file:"+url, {'content-type': 'text/html'})
 756             s.write('<BASE HREF="file:%s">\n' %
 757                     urllib.quote(os.path.join(path, "")))
 758             for name in names:
 759                 q = urllib.quote(name)
 760                 s.write('<A HREF="%s">%s</A>\n' % (q, q))
 761             s.seek(0)
 762             return s
 763         return urllib.FancyURLopener.open_file(self, url)
 764
 765
 766 class MyHTMLParser(sgmllib.SGMLParser):
 767
 768     def __init__(self, url, verbose=VERBOSE, checker=None):
 769         self.myverbose = verbose # now unused
 770         self.checker = checker
 771         self.base = None
 772         self.links = {}
 773         self.names = []
 774         self.url = url
 775         sgmllib.SGMLParser.__init__(self)
 776
 777     def start_a(self, attributes):
 778         self.link_attr(attributes, 'href')
 779
 780         # We must rescue the NAME
 781         # attributes from the anchor, in order to
 782         # cache the internal anchors which are made
 783         # available in the page.
 784         for name, value in attributes:
 785             if name == "name":
 786                 if value in self.names:
 787                     self.checker.message("WARNING: duplicate name %s in %s",
 788                                          value, self.url)
 789                 else: self.names.append(value)
 790                 break
 791
 792     def end_a(self): pass
 793
 794     def do_area(self, attributes):
 795         self.link_attr(attributes, 'href')
 796
 797     def do_body(self, attributes):
 798         self.link_attr(attributes, 'background', 'bgsound')
 799
 800     def do_img(self, attributes):
 801         self.link_attr(attributes, 'src', 'lowsrc')
 802
 803     def do_frame(self, attributes):
 804         self.link_attr(attributes, 'src', 'longdesc')
 805
 806     def do_iframe(self, attributes):
 807         self.link_attr(attributes, 'src', 'longdesc')
 808
 809     def do_link(self, attributes):
 810         for name, value in attributes:
 811             if name == "rel":
 812                 parts = string.split(string.lower(value))
 813                 if (  parts == ["stylesheet"]
 814                       or parts == ["alternate", "stylesheet"]):
 815                     self.link_attr(attributes, "href")
 816                     break
 817
 818     def do_object(self, attributes):
 819         self.link_attr(attributes, 'data', 'usemap')
 820
 821     def do_script(self, attributes):
 822         self.link_attr(attributes, 'src')
 823
 824     def do_table(self, attributes):
 825         self.link_attr(attributes, 'background')
 826
 827     def do_td(self, attributes):
 828         self.link_attr(attributes, 'background')
 829
 830     def do_th(self, attributes):
 831         self.link_attr(attributes, 'background')
 832
 833     def do_tr(self, attributes):
 834         self.link_attr(attributes, 'background')
 835
 836     def link_attr(self, attributes, *args):
 837         for name, value in attributes:
 838             if name in args:
 839                 if value: value = string.strip(value)
 840                 if value: self.links[value] = None
 841
 842     def do_base(self, attributes):
 843         for name, value in attributes:
 844             if name == 'href':
 845                 if value: value = string.strip(value)
 846                 if value:
 847                     if self.checker:
 848                         self.checker.note(1, "  Base %s", value)
 849                     self.base = value
 850
 851     def getlinks(self):
 852         return self.links.keys()
 853
 854     def getbase(self):
 855         return self.base
 856
 857
 858 if __name__ == '__main__':
 859     main()