1 """Open an arbitrary URL.
3 See the following document for more info on URLs:
4 "Names and Addresses, URIs, URLs, URNs, URCs", at
5 http://www.w3.org/pub/WWW/Addressing/Overview.html
7 See also the HTTP spec (from which the error codes are derived):
8 "HTTP - Hypertext Transfer Protocol", at
9 http://www.w3.org/pub/WWW/Protocols/
11 Related standards and specs:
12 - RFC1808: the "relative URL" spec. (authoritative status)
13 - RFC1738 - the "URL standard". (authoritative status)
14 - RFC1630 - the "URI spec". (informational status)
All code but that related to URL parsing has been removed (since it is not
compatible with Google App Engine) from this fork of the original file,
19 http://svn.python.org/view/*checkout*/python/tags/r252/Lib/urllib.py?content-type=text%2Fplain&rev=60915
24 from urlparse
import urljoin
as basejoin
26 __all__
= ["quote", "quote_plus", "unquote", "unquote_plus",
27 "urlencode", "splittag",
29 "splittype", "splithost", "splituser", "splitpasswd", "splitport",
30 "splitnport", "splitquery", "splitattr", "splitvalue",
33 __version__
= '1.17' # XXX This version is not always updated :-(
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') --> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') -->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') --> 'abc def'
# quote('abc def') --> 'abc%20def'
59 return isinstance(x
, unicode)
62 """toBytes(u"URL") --> 'URL'."""
63 # Most URL schemes require ASCII. If that changes, the conversion
67 url
= url
.encode("ASCII")
69 raise UnicodeError("URL " + repr(url
) +
70 " contains non-ASCII characters")
74 """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
76 if url
[:1] == '<' and url
[-1:] == '>':
77 url
= url
[1:-1].strip()
78 if url
[:4] == 'URL:': url
= url
[4:].strip()
83 """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
87 _typeprog
= re
.compile('^([^/:]+):')
89 match
= _typeprog
.match(url
)
91 scheme
= match
.group(1)
92 return scheme
.lower(), url
[len(scheme
) + 1:]
97 """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
101 _hostprog
= re
.compile('^//([^/?]*)(.*)$')
103 match
= _hostprog
.match(url
)
104 if match
: return match
.group(1, 2)
109 """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
111 if _userprog
is None:
113 _userprog
= re
.compile('^(.*)@(.*)$')
115 match
= _userprog
.match(host
)
116 if match
: return map(unquote
, match
.group(1, 2))
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'.

    Split *user* at the first ':'.

    Returns a 2-tuple ``(user, passwd)``.  When *user* contains no ':'
    the password slot is None, so callers can always unpack the result.
    An empty password ('user:') is returned as '' rather than None.

    The split is done with str.partition instead of a lazily compiled
    module-level regex: it needs no global state and, unlike a
    '.'-based pattern, keeps passwords that contain newline characters.
    """
    name, colon, passwd = user.partition(':')
    if not colon:
        # No separator at all: the whole string is the user name.
        return user, None
    return name, passwd
# splitport('host:port') --> 'host', 'port'
134 """splitport('host:port') --> 'host', 'port'."""
136 if _portprog
is None:
138 _portprog
= re
.compile('^(.*):([0-9]+)$')
140 match
= _portprog
.match(host
)
141 if match
: return match
.group(1, 2)
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.

    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number.

    The result is always a 2-tuple ``(host, port)``; only the port slot
    varies as described above.
    """
    # Split on the LAST ':' so everything before it stays in the host part.
    name, colon, port = host.rpartition(':')
    if not colon:
        return host, defport
    try:
        nport = int(port)
    except ValueError:
        # Covers both an empty port ('host:') and non-numeric text.
        nport = None
    return name, nport
168 """splitquery('/path?query') --> '/path', 'query'."""
170 if _queryprog
is None:
172 _queryprog
= re
.compile('^(.*)\?([^?]*)$')
174 match
= _queryprog
.match(url
)
175 if match
: return match
.group(1, 2)
180 """splittag('/path#tag') --> '/path', 'tag'."""
184 _tagprog
= re
.compile('^(.*)#([^#]*)$')
186 match
= _tagprog
.match(url
)
187 if match
: return match
.group(1, 2)
191 """splitattr('/path;attr1=value1;attr2=value2;...') ->
192 '/path', ['attr1=value1', 'attr2=value2', ...]."""
193 words
= url
.split(';')
194 return words
[0], words
[1:]
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'.

    Split *attr* at the first '='.

    Returns a 2-tuple ``(attr, value)``.  When there is no '=' the value
    slot is None, so callers can always unpack the result.  An empty
    value ('attr=') is returned as '' rather than None.

    str.partition is used instead of a lazily compiled module-level
    regex: it needs no global state and keeps values that contain
    newline characters.
    """
    name, equals, value = attr.partition('=')
    if not equals:
        # No separator: the whole string is the attribute name.
        return attr, None
    return name, value
def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'.

    When *selector* starts with '/' and has at least one character after
    it, that character is returned as the type and the remainder as the
    selector.  Otherwise the result is ``(None, selector)`` with the
    input passed through unchanged.
    """
    # Guard: no leading slash, or nothing after it to serve as the type.
    if selector[:1] != '/' or not selector[1:2]:
        return None, selector
    return selector[1], selector[2:]
214 _hextochr
= dict(('%02x' % i
, chr(i
)) for i
in range(256))
215 _hextochr
.update(('%02X' % i
, chr(i
)) for i
in range(256))
218 """unquote('abc%20def') -> 'abc def'."""
220 for i
in xrange(1, len(res
)):
223 res
[i
] = _hextochr
[item
[:2]] + item
[2:]
226 except UnicodeDecodeError:
227 res
[i
] = unichr(int(item
[:2], 16)) + item
[2:]
231 """unquote('%7e/abc+def') -> '~/abc def'"""
232 s
= s
.replace('+', ' ')
235 always_safe
= ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
236 'abcdefghijklmnopqrstuvwxyz'
240 def quote(s
, safe
= '/'):
241 """quote('abc def') -> 'abc%20def'
243 Each part of a URL, e.g. the path info, the query, etc., has a
244 different set of reserved characters that must be quoted.
246 RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
247 the following reserved characters.
249 reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
252 Each of these characters is reserved in some component of a URL,
253 but not necessarily in all of them.
255 By default, the quote function is intended for quoting the path
256 section of a URL. Thus, it will not encode '/'. This character
257 is reserved, but in typical usage the quote function is being
258 called on a path where the existing slash characters are used as
261 cachekey
= (safe
, always_safe
)
263 safe_map
= _safemaps
[cachekey
]
269 safe_map
[c
] = (c
in safe
) and c
or ('%%%02X' % i
)
270 _safemaps
[cachekey
] = safe_map
271 res
= map(safe_map
.__getitem
__, s
)
def quote_plus(s, safe=''):
    """Quote the query fragment of a URL; replacing ' ' with '+'

    *s* is the text to quote and *safe* lists extra characters that are
    passed to quote() as safe.  Returns the quoted string.
    """
    if ' ' in s:
        # Mark ' ' as safe so quote() leaves it alone, then turn each
        # space into '+'.
        s = quote(s, safe + ' ')
        return s.replace(' ', '+')
    # No spaces present: plain quoting is enough.
    return quote(s, safe)
281 def urlencode(query
,doseq
=0):
282 """Encode a sequence of two-element tuples or dictionary into a URL query string.
284 If any values in the query arg are sequences and doseq is true, each
285 sequence element is converted to a separate parameter.
287 If the query arg is a sequence of two-element tuples, the order of the
288 parameters in the output will match the order of parameters in the
292 if hasattr(query
,"items"):
294 query
= query
.items()
296 # it's a bother at times that strings and string-like objects are
299 # non-sequence items should not work with len()
300 # non-empty strings will fail this
301 if len(query
) and not isinstance(query
[0], tuple):
303 # zero-length sequences of all types will get here and succeed,
304 # but that's a minor nit - since the original implementation
305 # allowed empty dicts that type of behavior probably should be
306 # preserved for consistency
308 ty
,va
,tb
= sys
.exc_info()
309 raise TypeError, "not a valid non-string sequence or mapping object", tb
313 # preserve old behavior
315 k
= quote_plus(str(k
))
316 v
= quote_plus(str(v
))
317 l
.append(k
+ '=' + v
)
320 k
= quote_plus(str(k
))
321 if isinstance(v
, str):
323 l
.append(k
+ '=' + v
)
325 # is there a reasonable way to convert to ASCII?
326 # encode generates a string, but "replace" or "ignore"
327 # lose information and "strict" can raise UnicodeError
328 v
= quote_plus(v
.encode("ASCII","replace"))
329 l
.append(k
+ '=' + v
)
332 # is this a sufficient test for sequence-ness?
336 v
= quote_plus(str(v
))
337 l
.append(k
+ '=' + v
)
339 # loop over the sequence
341 l
.append(k
+ '=' + quote_plus(str(elt
)))