1 """Parse (absolute and relative) URLs.
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
# Standard/builtin Python modules
import string
from string import joinfields, splitfields, rfind
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
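
# Illustrative note (not from the original module): these tables drive how
# much of a URL urlparse() below is willing to split.  For example, 'mailto'
# is not listed in uses_query, so a '?' stays in the path, while 'http' is
# listed, so the query is split out:
#
#   >>> urlparse('mailto:joe@example.com?subject=hi')
#   ('mailto', '', 'joe@example.com?subject=hi', '', '', '')
#   >>> urlparse('http://example.com/index.html?subject=hi')
#   ('http', 'example.com', '/index.html', '', 'subject=hi', '')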

# Characters valid in scheme names
scheme_chars = string.letters + string.digits + '+-.'

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}

def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    find = string.find
    netloc = path = params = query = fragment = ''
    i = find(url, ':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = string.lower(url[:i])
            url = url[i+1:]
            if url[:2] == '//':
                # The network location extends up to the next slash
                i = find(url, '/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = string.rfind(url, '#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = find(url, '?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = find(url, ';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        # Only accept the prefix as a scheme if it consists entirely
        # of valid scheme characters
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = string.lower(url[:i]), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = find(url, '/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = string.rfind(url, '#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = find(url, '?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = find(url, ';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple

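# Illustrative example (not part of the original module; the URL is made up):
# a fully loaded URL is split into all six components.
#
#   >>> urlparse('http://www.cwi.nl:80/%7Eguido/Python.html;type=a?x=1#frag')
#   ('http', 'www.cwi.nl:80', '/%7Eguido/Python.html', 'type=a', 'x=1', 'frag')
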
def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

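# Illustrative example (not part of the original module): round-tripping a
# URL through urlparse()/urlunparse() reproduces it exactly unless it had a
# redundant delimiter, which is dropped (an equivalent URL, per the draft).
#
#   >>> urlunparse(urlparse('http://a/b/c/d;p?q#f'))
#   'http://a/b/c/d;p?q#f'
#   >>> urlunparse(urlparse('http://a/b/c/d?'))
#   'http://a/b/c/d'
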
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Resolve the relative path against the base path's directory
    i = rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = splitfields(path, '/')
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse each '..' together with the segment it cancels out
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1]:
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, joinfields(segments, '/'),
                       params, query, fragment))

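# Illustrative examples (not part of the original module); the expected
# values match the RFC 1808 style test table in test_input further down.
#
#   >>> urljoin('http://a/b/c/d', 'g?y')
#   'http://a/b/c/g?y'
#   >>> urljoin('http://a/b/c/d', '../../g')
#   'http://a/g'
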
180 """Removes any existing fragment from URL.
182 Returns a tuple of the defragmented URL and the fragment. If
183 the URL contained no fragments, the second element is the
186 s
, n
, p
, a
, q
, frag
= urlparse(url
)
187 defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
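# Illustrative example (not part of the original module):
#
#   >>> urldefrag('http://a/b/c/d#frag')
#   ('http://a/b/c/d', 'frag')
#   >>> urldefrag('http://a/b/c/d')
#   ('http://a/b/c/d', '')
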
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# XXX The result for //g is actually http://g/; is this a problem?

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        # Read test cases from a file named on the command line
        # ('-' means standard input)
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = string.split(line)
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()