"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
7 __all__
= ["urlparse", "urlunparse", "urljoin"]
# A classification of schemes ('' means apply by default)
10 uses_relative
= ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
12 'prospero', 'rtsp', 'rtspu', '']
13 uses_netloc
= ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
15 'https', 'shttp', 'snews',
16 'prospero', 'rtsp', 'rtspu', '']
17 non_hierarchical
= ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
20 uses_params
= ['ftp', 'hdl', 'prospero', 'http',
21 'https', 'shttp', 'rtsp', 'rtspu', 'sip',
23 uses_query
= ['http', 'wais',
25 'gopher', 'rtsp', 'rtspu', 'sip',
27 uses_fragment
= ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
28 'https', 'shttp', 'snews',
29 'file', 'prospero', '']
# Characters valid in scheme names
32 scheme_chars
= ('abcdefghijklmnopqrstuvwxyz'
33 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
41 """Clear the parse cache."""
46 def urlparse(url
, scheme
= '', allow_fragments
= 1):
47 """Parse a URL into 6 components:
48 <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
49 Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
50 Note that we don't break the components up in smaller bits
51 (e.g. netloc is a single string) and we don't expand % escapes."""
52 key
= url
, scheme
, allow_fragments
53 cached
= _parse_cache
.get(key
, None)
56 if len(_parse_cache
) >= MAX_CACHE_SIZE
: # avoid runaway growth
58 netloc
= path
= params
= query
= fragment
= ''
61 if url
[:i
] == 'http': # optimize the common case
62 scheme
= url
[:i
].lower()
83 tuple = scheme
, netloc
, url
, params
, query
, fragment
84 _parse_cache
[key
] = tuple
87 if c
not in scheme_chars
:
90 scheme
, url
= url
[:i
].lower(), url
[i
+1:]
91 if scheme
in uses_netloc
:
96 netloc
, url
= url
[2:i
], url
[i
:]
97 if allow_fragments
and scheme
in uses_fragment
:
100 url
, fragment
= url
[:i
], url
[i
+1:]
101 if scheme
in uses_query
:
104 url
, query
= url
[:i
], url
[i
+1:]
105 if scheme
in uses_params
:
108 url
, params
= url
[:i
], url
[i
+1:]
109 tuple = scheme
, netloc
, url
, params
, query
, fragment
110 _parse_cache
[key
] = tuple
def urlunparse(components):
    """Put a parsed URL back together again.

    components is a 6-item sequence (scheme, netloc, url, params, query,
    fragment), where url holds the path component.  Returns the
    reassembled URL as a single string.

    This may result in a slightly different, but equivalent URL, if the
    URL that was parsed originally had redundant delimiters, e.g. a ?
    with an empty query (the draft states that these are equivalent).
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # are a syntax error on Python 3, and callers pass one sequence
    # either way.
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A non-empty path must be rooted before it can follow a netloc.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    # Each delimiter is written only when its component is non-empty,
    # which is why redundant delimiters do not survive a round trip.
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
131 def urljoin(base
, url
, allow_fragments
= 1):
132 """Join a base URL and a possibly relative URL to form an absolute
133 interpretation of the latter."""
138 bscheme
, bnetloc
, bpath
, bparams
, bquery
, bfragment
= \
139 urlparse(base
, '', allow_fragments
)
140 scheme
, netloc
, path
, params
, query
, fragment
= \
141 urlparse(url
, bscheme
, allow_fragments
)
142 if scheme
!= bscheme
or scheme
not in uses_relative
:
144 if scheme
in uses_netloc
:
146 return urlunparse((scheme
, netloc
, path
,
147 params
, query
, fragment
))
150 return urlunparse((scheme
, netloc
, path
,
151 params
, query
, fragment
))
157 return urlunparse((scheme
, netloc
, bpath
,
158 params
, query
, fragment
))
159 segments
= bpath
.split('/')[:-1] + path
.split('/')
160 # XXX The stuff below is bogus in various ways...
161 if segments
[-1] == '.':
163 while '.' in segments
:
167 n
= len(segments
) - 1
169 if (segments
[i
] == '..'
170 and segments
[i
-1] not in ('', '..')):
171 del segments
[i
-1:i
+1]
176 if segments
== ['', '..']:
178 elif len(segments
) >= 2 and segments
[-1] == '..':
180 return urlunparse((scheme
, netloc
, '/'.join(segments
),
181 params
, query
, fragment
))
184 """Removes any existing fragment from URL.
186 Returns a tuple of the defragmented URL and the fragment. If
187 the URL contained no fragments, the second element is the
190 s
, n
, p
, a
, q
, frag
= urlparse(url
)
191 defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
199 http:g = <URL:http://a/b/c/g>
200 http: = <URL:http://a/b/c/d>
201 g = <URL:http://a/b/c/g>
202 ./g = <URL:http://a/b/c/g>
203 g/ = <URL:http://a/b/c/g/>
204 /g = <URL:http://a/g>
206 ?y = <URL:http://a/b/c/d?y>
207 g?y = <URL:http://a/b/c/g?y>
208 g?y/./x = <URL:http://a/b/c/g?y/./x>
209 . = <URL:http://a/b/c/>
210 ./ = <URL:http://a/b/c/>
211 .. = <URL:http://a/b/>
212 ../ = <URL:http://a/b/>
213 ../g = <URL:http://a/b/g>
214 ../.. = <URL:http://a/>
215 ../../g = <URL:http://a/g>
216 ../../../g = <URL:http://a/../g>
217 ./../g = <URL:http://a/b/g>
218 ./g/. = <URL:http://a/b/c/g/>
219 /./g = <URL:http://a/./g>
220 g/./h = <URL:http://a/b/c/g/h>
221 g/../h = <URL:http://a/b/c/h>
222 http:g = <URL:http://a/b/c/g>
223 http: = <URL:http://a/b/c/d>
224 http:?y = <URL:http://a/b/c/d?y>
225 http:g?y = <URL:http://a/b/c/g?y>
226 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
228 # XXX The result for //g is actually http://g/; is this a problem?
241 fp
= StringIO
.StringIO(test_input
)
249 parts
= urlparse(url
)
250 print '%-10s : %s' % (url
, parts
)
251 abs = urljoin(base
, url
)
254 wrapped
= '<URL:%s>' % abs
255 print '%-10s = %s' % (url
, wrapped
)
256 if len(words
) == 3 and words
[1] == '=':
257 if wrapped
!= words
[2]:
258 print 'EXPECTED', words
[2], '!!!!!!!!!!'
260 if __name__
== '__main__':