"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
# Names exported by ``from <module> import *``.
__all__ = ["urlparse", "urlunparse", "urljoin"]
# A classification of schemes ('' means apply by default)

# Schemes for which urljoin() resolves a relative URL against its base.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
# Schemes whose URLs may carry a '//<netloc>' part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
# Not consulted by the functions in this module; kept for callers that
# inspect it.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
# Schemes for which urlparse() splits ';<params>' off the path.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
# Schemes for which urlsplit() splits off '?<query>'.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
# Schemes for which urlsplit() splits off '#<fragment>'.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
# Upper bound on the number of memoized urlsplit() results; once reached,
# the whole cache is dropped before the next entry is stored.
# NOTE(review): the limit value 20 is restored from the stdlib module - confirm.
MAX_CACHE_SIZE = 20
# Maps (url, scheme, allow_fragments) -> 5-tuple produced by urlsplit().
_parse_cache = {}


def clear_cache():
    """Clear the parse cache."""
    _parse_cache.clear()
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is the default scheme handed to urlsplit(); *allow_fragments*
    is likewise forwarded unchanged.
    """
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    # Parameters are only split off for schemes that are declared to use
    # them; otherwise any ';' stays part of the path.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    return scheme, netloc, url, params, query, fragment
60 def _splitparams(url
):
62 i
= url
.find(';', url
.rfind('/'))
67 return url
[:i
], url
[i
+1:]
def urlsplit(url, scheme='', allow_fragments=1):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    *scheme* is used when the URL itself carries no valid scheme prefix.
    When *allow_fragments* is false, '#' is left in place and the returned
    fragment is ''.  Results are memoized in the module-level parse cache,
    keyed on all three arguments.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        _parse_cache.clear()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                # netloc runs up to the next '/', else up to a '#',
                # else to the end of the string.
                i = url.find('/', 2)
                if i < 0:
                    i = url.find('#')
                    if i < 0:
                        i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = scheme, netloc, url, query, fragment
            _parse_cache[key] = result
            return result
        # Accept the text before ':' as a scheme only if every character
        # is a legal scheme character; otherwise keep the default scheme
        # and leave the URL untouched.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = scheme, netloc, url, query, fragment
    _parse_cache[key] = result
    return result
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    *data* is a 6-item sequence
    (scheme, netloc, path, params, query, fragment).
    """
    # Unpack in the body: tuple parameters in the signature are not valid
    # syntax on Python 3.
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
def urlunsplit(data):
    """Combine the parts of a split URL back into a single string.

    *data* is a 5-item sequence (scheme, netloc, path, query, fragment).
    Empty components are omitted together with their delimiter.
    """
    # Unpack in the body: tuple parameters in the signature are not valid
    # syntax on Python 3.
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A non-empty path following a netloc must be absolute.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty *base* returns *url* unchanged and an empty *url* returns
    *base* unchanged.  *url* is also returned as-is when its scheme differs
    from the base scheme or does not support relative resolution.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference carries its own netloc: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: keep the base path, and inherit params/query only
        # when the reference supplies none of its own.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: replace the last segment of the base path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first '<segment>/..' pair until none is left.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i + 1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    # No '#': hand the URL back untouched rather than re-assembling it.
    return url, ''
# Input for test(): the first non-blank line is the base URL; every other
# line reads "<relative url> = <expected absolute url>".
test_input = """
      http://a/b/c/d

      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run the urljoin() self-test and print the results.

    Reads test lines from the file named by the first command-line
    argument ('-' means standard input), or from ``test_input`` when no
    argument is given.  The first URL read becomes the base for all
    following ones.  For each line the parsed components and the joined
    URL are printed; a line of the form "<url> = <expected>" additionally
    prints an EXPECTED notice when the joined result differs.
    """
    import sys
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO
    base = ''
    opened = None  # file we opened ourselves and therefore must close
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        fp = StringIO(test_input)
    try:
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()