Improved some error messages for command line processing.
[python/dscho.git] / Tools / webchecker / websucker.py
blob59da51b0a49634eb29c38c5042d9d517c3281a90
1 #! /usr/bin/env python
3 """A variant on webchecker that creates a mirror copy of a remote site."""
5 __version__ = "$Revision$"
7 import os
8 import sys
9 import string
10 import urllib
11 import getopt
13 import webchecker
# Extract the real version number if necessary.  While __version__ still
# starts with '$' it is a version-control keyword string; when it splits
# into exactly three whitespace-separated tokens (for example
# "$Revision: 1.5 $") the middle token is kept as the version.  A string
# with any other token count (such as the unexpanded "$Revision$") is left
# unchanged.
if __version__.startswith('$'):
    _v = __version__.split()
    if len(_v) == 3:
        __version__ = _v[1]
def main():
    """Command-line entry point: mirror every root URL given as an argument.

    Options:
        -q  set the verbosity to 0
        -v  raise the verbosity by one (may be given more than once)

    Returns 2 after printing a usage message when the options cannot be
    parsed; otherwise falls off the end (returns None) once the run is done.
    """
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        # Single-argument print(...) behaves the same as the print
        # statement on Python 2 and as the print function on Python 3.
        print(msg)
        print("usage: %s [-qv] ... [rooturl] ..." % sys.argv[0])
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    # Identify this tool (and its version) to the servers it contacts.
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
    ]
    for arg in args:
        print("Adding root %s" % arg)
        c.addroot(arg)
    print("Run...")
    c.run()
class Sucker(webchecker.Checker):
    """Checker variant that keeps a local copy of every page it fetches.

    Files are stored under a relative path of the form <host>/<url path>
    (see savefilename()).  A page whose file can already be opened locally
    is read from that file instead of being fetched again.
    """

    # NOTE(review): presumably consulted by webchecker.Checker -- confirm
    # its meaning there; nothing in this class reads it.
    checkext = 0

    def readhtml(self, url):
        """Return a (text, url) pair for url, saving the page when fetched.

        If the local file for url can be opened, its contents are returned
        provided self.checkforhtml({}, url) accepts the URL.  Otherwise the
        page is fetched with self.openpage(), written out with
        self.savefile(), and returned if self.checkforhtml(info, url)
        accepts it.  text is None when the page could not be opened or was
        rejected.  The returned url is the one reported by geturl() when a
        fetch took place (presumably differing after a redirect), else the
        argument unchanged.
        """
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            # No readable local copy: fetch the page.
            f = self.openpage(url)
            if f:
                # try/finally so the connection is released even when
                # reading the response fails part-way through.
                try:
                    info = f.info()
                    nurl = f.geturl()
                    if nurl != url:
                        url = nurl
                        path = self.savefilename(url)
                    text = f.read()
                finally:
                    f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            # Local copy exists; no headers are available, hence the {}.
            try:
                if self.checkforhtml({}, url):
                    text = f.read()
            finally:
                f.close()
        return text, url

    def savefile(self, text, path):
        """Write text to path in binary mode, creating parent directories.

        Reports the saved path through self.message() afterwards.
        """
        parent, _base = os.path.split(path)
        makedirs(parent)
        f = open(path, "wb")
        try:
            f.write(text)
        finally:
            # Do not leak the handle if the write fails (e.g. disk full).
            f.close()
        self.message("saved %s", path)

    def savefilename(self, url):
        """Map url to the relative path of its local file.

        The result is os.path.join(host, path) where host is lower-cased
        with any user and port parts removed.  "index.html" is used as the
        file name when the URL path is empty or names a directory (ends in
        a separator, "." or "..").  Empty and "." segments are dropped and
        ".." segments are resolved without climbing above the top of the
        URL path, so a URL cannot make the result point outside the host
        directory by way of ".." components.

        NOTE(review): platform-specific constructs inside a single segment
        (such as a Windows drive letter) are not sanitized here.
        """
        _scheme, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        _user, host = urllib.splituser(host)
        host, _port = urllib.splitnport(host)
        host = host.lower()
        if os.sep != "/":
            # Treat the native separator like "/" so that it cannot be
            # used to smuggle ".." past the segment check below.
            path = path.replace(os.sep, "/")
        parts = path.split("/")
        names_directory = parts[-1] in ("", ".", "..")
        segments = []
        for part in parts:
            if part == "..":
                # Clamp at the top instead of escaping the host directory.
                if segments:
                    segments.pop()
            elif part and part != ".":
                segments.append(part)
        if names_directory:
            segments.append("index.html")
        return os.path.join(host, os.sep.join(segments))
def makedirs(dir):
    """Create directory dir, first creating any missing parent directories.

    Does nothing when dir is empty or already exists.  A trailing path
    separator ("a/b/") is tolerated.  When dir has no final component to
    create (os.path.split() yields an empty tail and a head equal to dir,
    as for a root that does not exist), a message is printed and nothing
    is created.  Directories are created with mode 0o777.
    """
    if not dir or os.path.exists(dir):
        return
    head, tail = os.path.split(dir)
    if not tail:
        if head and head != dir:
            # Trailing separator: "a/b/" splits into ("a/b", ""), so the
            # directory to create is the head itself.
            makedirs(head)
            return
        print("Huh? Don't know how to make dir %s" % dir)
        return
    makedirs(head)
    os.mkdir(dir, 0o777)
if __name__ == '__main__':
    # main() yields None on a normal run; report that as exit status 0.
    status = main()
    sys.exit(status or 0)