"""Parse (absolute and relative) URLs.

urlparse module is based upon the following RFC specifications.

RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
and L. Masinter, January 2005.

RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
and L.Masinter, December 1999.

RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
Berners-Lee, R. Fielding, and L. Masinter, August 1998.

RFC 2368: "The mailto URL scheme", by P.Hoffman, L. Masinter, J. Zwinski, July 1998.

RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
1995.

RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
McCahill, December 1994.

RFC 3986 is considered the current standard and any future changes to the
urlparse module should conform with it.  The urlparse module is
currently not entirely compliant with this RFC due to de facto
scenarios for parsing, and for backward compatibility purposes, some
parsing quirks from older RFCs are retained.  The test cases in
test_urlparse.py provide a good indicator of parsing behavior.
"""
# Public API of this module.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names (RFC 3986 section 3.1:
# ALPHA / DIGIT / "+" / "-" / ".")
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
MAX_CACHE_SIZE = 20     # cap on _parse_cache before it is flushed
_parse_cache = {}       # memoizes urlsplit() results keyed on its arguments

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}
class ResultMixin(object):
    """Shared methods for the parsed result objects."""

    @property
    def username(self):
        # Userinfo is everything before the last "@"; the username is the
        # part of it before the first ":".
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None

    @property
    def password(self):
        # The password is whatever follows the first ":" in the userinfo,
        # or None when there is no userinfo or no ":" in it.
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None

    @property
    def hostname(self):
        # Host names are case-insensitive, hence the lower() calls.
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            # Bracketed IPv6 literal: strip the brackets.
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()

    @property
    def port(self):
        # Drop userinfo and any bracketed IPv6 literal before looking
        # for the ":port" suffix.
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ':' in netloc:
            port = netloc.split(':')[1]
            if port:
                port = int(port, 10)
                # verify legal port
                if (0 <= port <= 65535):
                    return port
        return None
110 from collections
import namedtuple
class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'),
                  ResultMixin):
    """5-tuple result of urlsplit() with named fields and helper properties."""

    __slots__ = ()

    def geturl(self):
        # Reassemble the original (equivalent) URL from the components.
        return urlunsplit(self)
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'),
                  ResultMixin):
    """6-tuple result of urlparse() with named fields and helper properties."""

    __slots__ = ()

    def geturl(self):
        # Reassemble the original (equivalent) URL from the components.
        return urlunparse(self)
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Delegate the real work to urlsplit(), then peel the ;params part
    # off the path for schemes that use it.
    split = urlsplit(url, scheme, allow_fragments)   # renamed: was 'tuple', shadowing the builtin
    scheme, netloc, url, query, fragment = split
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
def _splitparams(url):
    """Split ';params' off *url*, returning (url, params).

    Only a ';' in the last path segment starts the params; returns
    ('', url-unchanged)-style ('url', '') when there are none."""
    if '/' in url:
        # Only look for ';' after the final '/', i.e. in the last segment.
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]
def _splitnetloc(url, start=0):
    """Split *url* at the end of the netloc, returning (domain, rest).

    The netloc runs from *start* up to the first '/', '?' or '#'
    (or to the end of the string if none of them occurs)."""
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # The cache key includes the argument types so str/unicode inputs
    # do not collide with each other.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # An IPv6 literal must have both brackets or neither.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # The candidate scheme contains only valid characters.
            scheme, url = url[:i].lower(), url[i+1:]
    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Re-attach the params to the last path segment.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A netloc (even an empty one for known schemes) requires the
        # '//' prefix and a path beginning with '/'.
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Trivial cases: an empty base or an empty url means "the other one".
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        # Different (or non-relative) scheme: url stands on its own.
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it as-is.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: inherit path (and possibly params/query) from base.
        path = bpath
        if not params:
            params = bparams
        else:
            path = path[:-1]
            return urlunparse((scheme, netloc, path,
                                params, query, fragment))
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Relative path: resolve '.' and '..' against the base's directory.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
# unquote method for parse_qs and parse_qsl
# Cannot use directly from urllib as it would create a circular reference
# because urllib uses urlparse methods (urljoin).  If you update this function,
# update it also in urllib.  This code duplication does not exist in Python3.

_hexdig = '0123456789ABCDEFabcdef'
# Map every 2-char hex digram (both cases) to its character.
_hextochr = dict((a+b, chr(int(a+b, 16)))
                 for a in _hexdig for b in _hexdig)

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # fastpath: no '%' at all means nothing to decode
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += _hextochr[item[:2]] + item[2:]
        except KeyError:
            # Not a valid escape: leave the '%' literal in place.
            s += '%' + item
        except UnicodeDecodeError:
            # unicode input whose bytes can't be appended directly.
            s += unichr(int(item[:2], 16)) + item[2:]
    return s
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.
    """
    # Repeated names collect all their values, in order of appearance.
    result = {}   # renamed: was 'dict', shadowing the builtin
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in result:
            result[name].append(value)
        else:
            result[name] = [value]
    return result
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors. If
        false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list, as G-d intended.
    """
    # Both '&' and ';' separate name=value pairs.
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # Parenthesized raise form: valid in Python 2 and forward
                # compatible with Python 3 (was 'raise ValueError, msg').
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space in query strings; decode before unquoting.
            name = unquote(nv[0].replace('+', ' '))
            value = unquote(nv[1].replace('+', ' '))
            r.append((name, value))
    return r
# Test vectors: RFC 1808 (and a few extra) relative-reference cases.
# The first non-blank URL becomes the base; lines of the form
# '<url> = <expected>' are verified by test() below.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
445 from cStringIO
import StringIO
447 from StringIO
import StringIO
448 fp
= StringIO(test_input
)
454 parts
= urlparse(url
)
455 print '%-10s : %s' % (url
, parts
)
456 abs = urljoin(base
, url
)
459 wrapped
= '<URL:%s>' % abs
460 print '%-10s = %s' % (url
, wrapped
)
461 if len(words
) == 3 and words
[1] == '=':
462 if wrapped
!= words
[2]:
463 print 'EXPECTED', words
[2], '!!!!!!!!!!'
if __name__ == '__main__':
    test()