#! /usr/bin/env python

"""A variant on webchecker that creates a mirror copy of a remote site."""

__version__ = "$Revision$"
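
# Usage (matching the option parsing in main() below):
#
#   websucker.py [-qv] rooturl ...
#
# -q runs quietly; each -v increases verbosity.  Every root URL is
# mirrored into a local directory tree named after its host.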

import os
import sys
import string
import urllib
import getopt

import webchecker

# Extract the real version number from the CVS $Revision$ keyword,
# if it has been expanded.
if __version__[0] == '$':
    _v = string.split(__version__)
    if len(_v) == 3:
        __version__ = _v[1]

def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
        ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    print "Run..."
    c.run()
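
# Sucker subclasses webchecker.Checker but disables link checking:
# checkext = 0 skips validation of off-site links, and nonames = 1
# turns off "#fragment" name-anchor checking (see the note below).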

class Sucker(webchecker.Checker):

    checkext = 0
    nonames = 1

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.
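
    # readhtml() is where the mirroring happens: if the page was already
    # saved locally, parse the local copy; otherwise fetch it, save it
    # under savefilename(url), and parse the downloaded text.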

    def readhtml(self, url_pair):
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url

    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))
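
    # A sketch of the URL-to-path mapping done by savefilename() below
    # (the URL is a made-up example):
    #
    #   http://www.example.org/doc/  ->  www.example.org/doc/index.html
    #
    # with "/" replaced by os.sep on non-Unix platforms.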

    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        if os.name == "mac":
            path = os.sep + path
        path = os.path.join(host, path)
        return path
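
# makedirs() below behaves like os.makedirs(), with one twist: if a
# plain file sits where a directory is needed (a URL that first looked
# like a leaf and later like a directory), the file is moved aside and
# reinstated as index.html inside the new directory.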

def makedirs(dir):
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh?  Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)
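
# A minimal sketch of driving the mirror programmatically rather than
# via the command line, using only the calls main() itself makes
# (the root URL is hypothetical):
#
#   c = Sucker()
#   c.setflags(verbose=1)
#   c.addroot("http://www.example.org/")
#   c.run()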

if __name__ == '__main__':
    sys.exit(main() or 0)