1 # Open an arbitrary URL
3 # See the following document for a tentative description of URLs:
4 # Uniform Resource Locators Tim Berners-Lee
6 # IETF URL Working Group 14 July 1993
7 # draft-ietf-uri-url-01.txt
9 # The object returned by URLopener().open(file) will differ per
10 # protocol. All you know is that it has methods read(), readline(),
11 # readlines(), fileno(), close() and info(). The read*(), fileno()
12 # and close() methods work like those of open files.
13 # The info() method returns an rfc822.Message object which can be
14 # used to query various info about the object, if available.
15 # (rfc822.Message objects are queried with the getheader() method.)
21 # This really consists of two pieces:
22 # (1) a class which handles opening of all sorts of URLs
23 # (plus assorted utilities etc.)
24 # (2) a set of functions for parsing URLs
25 # XXX Should these be separated out into different modules?
28 # Shortcut for basic usage
33 _urlopener
= URLopener()
34 return _urlopener
.open(url
)
38 _urlopener
= URLopener()
39 return _urlopener
.retrieve(url
)
46 # This is a class rather than just a subroutine because we may need
47 # more than one set of global protocol-specific options.
55 self
.ftpcache
= ftpcache
56 # Undocumented feature: you can use a different
57 # ftp cache by assigning to the .ftpcache member;
58 # in case you want logically independent URL openers
68 for url
in self
.tempcache
.keys():
70 os
.unlink(self
.tempcache
[url
][0])
73 del self
.tempcache
[url
]
75 # Add a header to be used by the HTTP interface only
76 # e.g. u.addheader('Accept', 'sound/basic')
def addheader(self, *args):
    # Register one extra request header.  The positional arguments are
    # kept together as a single tuple and appended to self.addheaders;
    # open_http() later passes each stored tuple to putheader().
    header = args
    self.addheaders.append(header)
81 # Use URLopener().open(file) instead of open(file, 'r')
83 type, url
= splittype(unwrap(url
))
84 if not type: type = 'file'
88 name
= regsub
.gsub('-', '_', name
)
89 if not hasattr(self
, name
):
90 raise IOError, ('url error', 'unknown url type', type)
92 return getattr(self
, name
)(url
)
93 except socket
.error
, msg
:
94 raise IOError, ('socket error', msg
)
97 # retrieve(url) returns (filename, None) for a local object
98 # or (tempfilename, headers) for a remote object
99 def retrieve(self
, url
):
100 if self
.tempcache
.has_key(url
):
101 return self
.tempcache
[url
]
103 if self
.tempcache
.has_key(url1
):
104 self
.tempcache
[url
] = self
.tempcache
[url1
]
105 return self
.tempcache
[url1
]
106 type, url1
= splittype(url1
)
107 if not type or type == 'file':
109 fp
= self
.open_local_file(url1
)
111 return splithost(url1
)[1], None
117 tfn
= tempfile
.mktemp()
118 self
.tempcache
[url
] = result
= tfn
, headers
129 # Each method named open_<type> knows how to open that type of URL
132 def open_http(self
, url
):
134 host
, selector
= splithost(url
)
135 h
= httplib
.HTTP(host
)
136 h
.putrequest('GET', selector
)
137 for args
in self
.addheaders
: apply(h
.putheader
, args
)
138 errcode
, errmsg
, headers
= h
.getreply()
139 if errcode
== 200: return addinfo(h
.getfile(), headers
)
140 else: raise IOError, ('http error', errcode
, errmsg
, headers
)
142 # Use Gopher protocol
def open_gopher(self, url):
    # Fetch 'url' through gopherlib and return the result as an addinfo
    # object with an empty header set (noheaders()).
    # The selector is split into gopher type, selector proper and an
    # optional query; a non-empty query selects send_query() instead of
    # send_selector().
    # NOTE(review): gopherlib is not imported in the visible body of
    # this method -- confirm it is in scope.
    host, selector = splithost(url)
    # The type character is split off but not otherwise used here
    gophertype, selector = splitgophertype(selector)
    selector, query = splitquery(selector)
    if query:
        return addinfo(gopherlib.send_query(selector, query, host),
                       noheaders())
    return addinfo(gopherlib.send_selector(selector, host), noheaders())
152 # Use local file or FTP depending on form of URL
153 def open_file(self
, url
):
155 return self
.open_local_file(url
)
157 return self
.open_ftp(url
)
160 def open_local_file(self
, url
):
161 host
, file = splithost(url
)
162 if not host
: return addinfo(open(file, 'r'), noheaders())
163 host
, port
= splitport(host
)
164 if not port
and socket
.gethostbyname(host
) in (
165 localhost(), thishost()):
166 return addinfo(open(file, 'r'), noheaders())
167 raise IOError, ('local file error', 'not on local host')
170 def open_ftp(self
, url
):
171 host
, file = splithost(url
)
172 if not host
: raise IOError, ('ftp error', 'no host given')
173 host
, port
= splitport(host
)
174 host
= socket
.gethostbyname(host
)
177 port
= ftplib
.FTP_PORT
180 if not self
.ftpcache
.has_key(key
):
181 self
.ftpcache
[key
] = ftpwrapper(host
, port
)
182 return addinfo(self
.ftpcache
[key
].retrfile(file),
184 except ftperrors(), msg
:
185 raise IOError, ('ftp error', msg
)
190 # Return the IP address of the magic hostname 'localhost'
195 _localhost
= socket
.gethostbyname('localhost')
198 # Return the IP address of the current host
203 _thishost
= socket
.gethostbyname(socket
.gethostname())
206 # Return the set of errors raised by the FTP class
212 _ftperrors
= (ftplib
.error_reply
,
218 # Return an empty rfc822.Message object
224 _noheaders
= rfc822
.Message(open('/dev/null', 'r'))
225 _noheaders
.fp
.close() # Recycle file descriptor
231 # Class used by open_ftp() for cache of open FTP connections
233 def __init__(self
, host
, port
):
239 self
.ftp
= ftplib
.FTP()
240 self
.ftp
.connect(self
.host
, self
.port
)
242 def retrfile(self
, file):
245 self
.ftp
.voidcmd('TYPE I')
246 except ftplib
.all_errors
:
248 self
.ftp
.voidcmd('TYPE I')
253 conn
= self
.ftp
.transfercmd(cmd
)
254 except ftplib
.error_perm
, reason
:
255 if reason
[:3] != '550':
256 raise IOError, ('ftp error', reason
)
258 # Try a directory listing
259 if file: cmd
= 'LIST ' + file
261 conn
= self
.ftp
.transfercmd(cmd
)
262 return addclosehook(conn
.makefile('r'), self
.ftp
.voidresp
)
264 # Base class for addinfo and addclosehook
266 def __init__(self
, fp
):
268 self
.read
= self
.fp
.read
269 self
.readline
= self
.fp
.readline
270 self
.readlines
= self
.fp
.readlines
271 self
.fileno
= self
.fp
.fileno
273 return '<%s at %s whose fp = %s>' % (
274 self
.__class
__.__name
__, `
id(self
)`
, `self
.fp`
)
280 self
.readlines
= None
284 # Class to add a close hook to an open file
285 class addclosehook(addbase
):
286 def __init__(self
, fp
, closehook
, *hookargs
):
287 addbase
.__init
__(self
, fp
)
288 self
.closehook
= closehook
289 self
.hookargs
= hookargs
292 apply(self
.closehook
, self
.hookargs
)
293 self
.closehook
= None
297 # class to add an info() method to an open file
298 class addinfo(addbase
):
299 def __init__(self
, fp
, headers
):
300 addbase
.__init
__(self
, fp
)
301 self
.headers
= headers
306 # Utility to combine a URL with a base URL to form a new URL
308 def basejoin(base
, url
):
309 type, path
= splittype(url
)
311 host
, path
= splithost(path
)
312 basetype
, basepath
= splittype(base
)
313 basehost
, basepath
= splithost(basepath
)
314 basepath
, basetag
= splittag(basepath
)
315 basepath
, basequery
= splitquery(basepath
)
316 type = basetype
or 'file'
319 i
= string
.rfind(basepath
, '/')
320 if i
< 0: basepath
= '/'
321 else: basepath
= basepath
[:i
+1]
322 path
= basepath
+ path
323 if not host
: host
= basehost
324 if host
: return type + '://' + host
+ path
325 else: return type + ':' + path
328 # Utilities to parse URLs:
329 # unwrap('<URL:type://host/path>') --> 'type://host/path'
330 # splittype('type:opaquestring') --> 'type', 'opaquestring'
331 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
332 # splitport('host:port') --> 'host', 'port'
333 # splitquery('/path?query') --> '/path', 'query'
334 # splittag('/path#tag') --> '/path', 'tag'
335 # splitgophertype('/Xselector') --> 'X', 'selector'
339 url
= string
.strip(url
)
340 if url
[:1] == '<' and url
[-1:] == '>':
341 url
= string
.strip(url
[1:-1])
342 if url
[:4] == 'URL:': url
= string
.strip(url
[4:])
345 _typeprog
= regex
.compile('^\([^/:]+\):\(.*\)$')
347 if _typeprog
.match(url
) >= 0: return _typeprog
.group(1, 2)
350 _hostprog
= regex
.compile('^//\([^/]+\)\(.*\)$')
352 if _hostprog
.match(url
) >= 0: return _hostprog
.group(1, 2)
355 _portprog
= regex
.compile('^\(.*\):\([0-9]+\)$')
357 if _portprog
.match(host
) >= 0: return _portprog
.group(1, 2)
360 _queryprog
= regex
.compile('^\(.*\)\?\([^?]*\)$')
362 if _queryprog
.match(url
) >= 0: return _queryprog
.group(1, 2)
365 _tagprog
= regex
.compile('^\(.*\)#\([^#]*\)$')
367 if _tagprog
.match(url
) >= 0: return _tagprog
.group(1, 2)
def splitgophertype(selector):
    # Split '/Xselector' into the one-character gopher type 'X' and the
    # remaining 'selector'.  If the string does not start with '/' or
    # has nothing after the '/', return (None, selector) unchanged.
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    gophertype = selector[1]
    remainder = selector[2:]
    return gophertype, remainder
385 'file://localhost/etc/passwd',
386 'ftp://ftp.cwi.nl/etc/passwd',
387 'gopher://gopher.cwi.nl/11/',
388 'http://www.cwi.nl/index.html',
392 print '-'*10, url
, '-'*10
393 fn
, h
= urlretrieve(url
)
397 for k
in h
.keys(): print k
+ ':', h
[k
]
402 print regsub
.gsub('\r', '', data
)
408 # Run test program when run as a script
409 if __name__
== '__main__':