3 """A variant on webchecker that creates a mirror copy of a remote site."""
5 __version__
= "$Revision$"
15 # Extract real version number if necessary
16 if __version__
[0] == '$':
17 _v
= string
.split(__version__
)
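
# Command-line driver: parse the -q/-v options, then mirror every root URL
# given on the command line using a Sucker instance.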
def main():
    verbose = webchecker.VERBOSE
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error, msg:
        print msg
        print "usage:", sys.argv[0], "[-qv] ... [rooturl] ..."
        return 2
    for o, a in opts:
        if o == "-q":
            verbose = 0
        if o == "-v":
            verbose = verbose + 1
    c = Sucker()
    c.setflags(verbose=verbose)
    c.urlopener.addheaders = [
        ('User-agent', 'websucker/%s' % __version__),
        ]
    for arg in args:
        print "Adding root", arg
        c.addroot(arg)
    c.run()
class Sucker(webchecker.Checker):

    nonames = 1                     # suppress name anchor checking

    # SAM 11/13/99: in general, URLs are now URL pairs.
    # Since we've suppressed name anchor checking,
    # we can ignore the second dimension.
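
    # readhtml() is overridden so that a page already saved on disk is reused,
    # while a freshly fetched page is written out before being parsed for links.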
    def readhtml(self, url_pair):
        url = url_pair[0]
        text = None
        path = self.savefilename(url)
        try:
            f = open(path, "rb")
        except IOError:
            f = self.openpage(url_pair)
            if f:
                info = f.info()
                nurl = f.geturl()
                if nurl != url:
                    # Followed a redirect; save under the final URL's name
                    url = nurl
                    path = self.savefilename(url)
                text = f.read()
                f.close()
                self.savefile(text, path)
                if not self.checkforhtml(info, url):
                    text = None
        else:
            if self.checkforhtml({}, url):
                text = f.read()
            f.close()
        return text, url
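
    # savefile() writes the page text to 'path', creating any directories
    # that the path needs along the way.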
    def savefile(self, text, path):
        dir, base = os.path.split(path)
        makedirs(dir)
        try:
            f = open(path, "wb")
            f.write(text)
            f.close()
            self.message("saved %s", path)
        except IOError, msg:
            self.message("didn't save %s: %s", path, str(msg))
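
    # savefilename() maps a URL to a relative local path of the form
    # <host>/<url-path>; a path ending in "/" becomes .../index.html.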
    def savefilename(self, url):
        type, rest = urllib.splittype(url)
        host, path = urllib.splithost(rest)
        while path[:1] == "/": path = path[1:]
        user, host = urllib.splituser(host)
        host, port = urllib.splitnport(host)
        host = string.lower(host)
        if not path or path[-1] == "/":
            path = path + "index.html"
        if os.sep != "/":
            path = string.join(string.split(path, "/"), os.sep)
        path = os.path.join(host, path)
        return path
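
# makedirs() creates 'dir' and any missing parent directories.  If a plain
# file is in the way (a page saved earlier whose URL later turns out to name
# a directory), it is moved aside and re-saved as index.html inside the new
# directory.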
def makedirs(dir):
    if not dir:
        return
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            try:
                os.rename(dir, dir + ".bak")
                os.mkdir(dir)
                os.rename(dir + ".bak", os.path.join(dir, "index.html"))
            except os.error:
                pass
        return
    head, tail = os.path.split(dir)
    if not tail:
        print "Huh? Don't know how to make dir", dir
        return
    makedirs(head)
    os.mkdir(dir, 0777)
if __name__ == '__main__':
    sys.exit(main() or 0)