"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
7 # Standard/builtin Python modules
9 from string
import join
, split
, rfind
# A classification of schemes ('' means apply by default).
#
# Each list names the schemes for which the corresponding URL feature is
# recognised; the empty string stands for "no scheme given".
#
# NOTE(review): this excerpt had visibly lost several continuation lines of
# these list literals (three of them were left unterminated).  The entries
# marked "restored" are a reconstruction -- confirm them before relying on
# the exact membership.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',  # restored
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',  # restored
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',  # restored
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']  # restored
uses_query = ['http', 'wais',
              'https', 'shttp',  # restored
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']  # restored
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  Spelled out literally instead of being
# built from string.letters, which is locale-dependent on Python 2 and does
# not exist on Python 3; scheme names are plain ASCII.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
# Upper bound on the number of memoised parse results.
# NOTE(review): the defining line is not present in this excerpt; the value
# is a reconstruction -- confirm.
MAX_CACHE_SIZE = 20

# Maps (url, scheme, allow_fragments) -> 6-tuple returned by urlparse().
_parse_cache = {}


def clear_cache():
    """Clear the parse cache."""
    # Emptied in place so that any existing reference to the dict stays valid.
    _parse_cache.clear()


def _split_rest(url, want_netloc, want_fragment, want_query, want_params):
    """Split the scheme-less remainder of a URL into its components.

    Each ``want_*`` flag enables the search for the matching delimiter; a
    component whose flag is false stays '' and its text is left in the path.

    Returns the 5-tuple (netloc, path, params, query, fragment).
    """
    netloc = params = query = fragment = ''
    if want_netloc and url[:2] == '//':
        # The network location runs up to the next '/', or to the end.
        end = url.find('/', 2)
        if end < 0:
            end = len(url)
        netloc, url = url[2:end], url[end:]
    if want_fragment:
        # The fragment is taken from the *last* '#', as in the original code.
        pos = url.rfind('#')
        if pos >= 0:
            url, fragment = url[:pos], url[pos + 1:]
    if want_query:
        pos = url.find('?')
        if pos >= 0:
            url, query = url[:pos], url[pos + 1:]
    if want_params:
        pos = url.find(';')
        if pos >= 0:
            url, params = url[:pos], url[pos + 1:]
    return netloc, url, params, query, fragment


def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    ``scheme`` is the default used when ``url`` carries none.  When
    ``allow_fragments`` is false, a '#' is left as part of the preceding
    component instead of starting a fragment.  Results are memoised in a
    small module-level cache keyed on all three arguments.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()

    colon = url.find(':')
    if colon > 0 and url[:colon] == 'http':  # optimize the common case
        # 'http' supports every component, so skip the table lookups.
        scheme = 'http'
        parts = _split_rest(url[colon + 1:], 1, allow_fragments, 1, 1)
    else:
        if colon > 0:
            # Only treat the prefix as a scheme if every character is legal
            # in a scheme name; otherwise the ':' belongs to the path.
            for c in url[:colon]:
                if c not in scheme_chars:
                    break
            else:
                scheme, url = url[:colon].lower(), url[colon + 1:]
        parts = _split_rest(url,
                            scheme in uses_netloc,
                            allow_fragments and scheme in uses_fragment,
                            scheme in uses_query,
                            scheme in uses_params)

    result = (scheme,) + parts
    _parse_cache[key] = result
    return result
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    ``components`` is a 6-item sequence
    (scheme, netloc, url, params, query, fragment), passed as a single
    positional argument.  Returns the reassembled URL string.
    """
    # Unpacked in the body: tuple parameters in the signature are a syntax
    # error on Python 3, while this form works on every version.
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A network location must be followed by an absolute path.
        if url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    # Empty components are omitted together with their delimiter.
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    ``url`` is returned unchanged when ``base`` is empty.  It is only
    re-serialised, not resolved, when its scheme differs from the base's or
    does not support relative forms.  ``allow_fragments`` is passed through
    to urlparse() for both arguments.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own host: nothing to inherit.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: only scheme and netloc come from the base.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty path: keep the base path, and its query unless overridden.
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Replace the last segment of the base path by the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            # Collapse "<segment>/.." pairs.  A '..' must not cancel an
            # empty segment or a preceding '..': surplus '..' segments are
            # kept, as in the RFC 1808 example "../../../g" -> "/../g".
            if segments[i] == '..' and segments[i - 1] not in ('', '..'):
                del segments[i - 1:i + 1]
                break
            i = i + 1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing "<segment>/.." leaves the parent directory.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
178 """Removes any existing fragment from URL.
180 Returns a tuple of the defragmented URL and the fragment. If
181 the URL contained no fragments, the second element is the
184 s
, n
, p
, a
, q
, frag
= urlparse(url
)
185 defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
193 http:g = <URL:http://a/b/c/g>
194 http: = <URL:http://a/b/c/d>
195 g = <URL:http://a/b/c/g>
196 ./g = <URL:http://a/b/c/g>
197 g/ = <URL:http://a/b/c/g/>
198 /g = <URL:http://a/g>
200 ?y = <URL:http://a/b/c/d?y>
201 g?y = <URL:http://a/b/c/g?y>
202 g?y/./x = <URL:http://a/b/c/g?y/./x>
203 . = <URL:http://a/b/c/>
204 ./ = <URL:http://a/b/c/>
205 .. = <URL:http://a/b/>
206 ../ = <URL:http://a/b/>
207 ../g = <URL:http://a/b/g>
208 ../.. = <URL:http://a/>
209 ../../g = <URL:http://a/g>
210 ../../../g = <URL:http://a/../g>
211 ./../g = <URL:http://a/b/g>
212 ./g/. = <URL:http://a/b/c/g/>
213 /./g = <URL:http://a/./g>
214 g/./h = <URL:http://a/b/c/g/h>
215 g/../h = <URL:http://a/b/c/h>
216 http:g = <URL:http://a/b/c/g>
217 http: = <URL:http://a/b/c/d>
218 http:?y = <URL:http://a/b/c/d?y>
219 http:g?y = <URL:http://a/b/c/g?y>
220 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
222 # XXX The result for //g is actually http://g/; is this a problem?
235 fp
= StringIO
.StringIO(test_input
)
239 words
= string
.split(line
)
243 parts
= urlparse(url
)
244 print '%-10s : %s' % (url
, parts
)
245 abs = urljoin(base
, url
)
248 wrapped
= '<URL:%s>' % abs
249 print '%-10s = %s' % (url
, wrapped
)
250 if len(words
) == 3 and words
[1] == '=':
251 if wrapped
!= words
[2]:
252 print 'EXPECTED', words
[2], '!!!!!!!!!!'
254 if __name__
== '__main__':