1 # Parse (absolute and relative) URLs. See RFC 1808: "Relative Uniform
2 # Resource Locators", by R. Fielding, UC Irvine, June 1995.
4 # Standard/builtin Python modules
6 from string
import joinfields
, splitfields
, find
, rfind
8 # A classification of schemes ('' means apply by default)
9 uses_relative
= ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
12 uses_netloc
= ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
13 'https', 'shttp', 'snews',
15 non_hierarchical
= ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
18 uses_params
= ['ftp', 'hdl', 'prospero', 'http',
21 uses_query
= ['http', 'wais',
# Schemes for which a trailing '#fragment' is split off the URL when
# parsing ('' is the entry consulted when the URL carries no scheme).
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews',
    'file', 'prospero', '',
]
# Characters valid in scheme names.  Spelled out literally instead of
# being built from string.letters: that attribute changes with the
# active locale (and is gone from newer Python versions), whereas URL
# scheme names are restricted to ASCII letters, digits, '+', '-' and '.'.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
36 """Clear the parse cache."""
41 # Parse a URL into 6 components:
42 # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
43 # Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
44 # Note that we don't break the components up in smaller bits
45 # (e.g. netloc is a single string) and we don't expand % escapes.
46 def urlparse(url
, scheme
= '', allow_fragments
= 1):
47 key
= url
, scheme
, allow_fragments
48 cached
= _parse_cache
.get(key
, None)
51 if len(_parse_cache
) >= MAX_CACHE_SIZE
: # avoid runaway growth
54 netloc
= path
= params
= query
= fragment
= ''
57 if url
[:i
] == 'http': # optimize the common case
58 scheme
= string
.lower(url
[:i
])
67 i
= string
.rfind(url
, '#')
79 tuple = scheme
, netloc
, url
, params
, query
, fragment
80 _parse_cache
[key
] = tuple
83 if c
not in scheme_chars
:
86 scheme
, url
= string
.lower(url
[:i
]), url
[i
+1:]
87 if scheme
in uses_netloc
:
92 netloc
, url
= url
[2:i
], url
[i
:]
93 if allow_fragments
and scheme
in uses_fragment
:
94 i
= string
.rfind(url
, '#')
96 url
, fragment
= url
[:i
], url
[i
+1:]
97 if scheme
in uses_query
:
100 url
, query
= url
[:i
], url
[i
+1:]
101 if scheme
in uses_params
:
104 url
, params
= url
[:i
], url
[i
+1:]
105 tuple = scheme
, netloc
, url
, params
, query
, fragment
106 _parse_cache
[key
] = tuple
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(components):
    """Reassemble a parsed URL into a single string.

    components -- a 6-item sequence
                  (scheme, netloc, url, params, query, fragment),
                  where 'url' holds the path part.

    Returns the combined URL.  A component that is empty contributes
    neither its text nor its delimiter, so 'a ? with an empty query'
    is dropped rather than reproduced.

    The argument is taken as one positional value and unpacked here
    (instead of using a tuple in the parameter list) so the definition
    is accepted by every Python version; callers still pass one tuple.
    """
    # NOTE(review): the guard conditions and the final return were not
    # present in the text this block was rebuilt from.  The params/query/
    # fragment guards follow from the comment above and from the expected
    # results in the module's self-test data; the netloc and scheme guards
    # are presumed by analogy -- confirm all of them against upstream.
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        # A network location must be followed by an absolute path.
        if url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
127 # Join a base URL and a possibly relative URL to form an absolute
128 # interpretation of the latter.
129 def urljoin(base
, url
, allow_fragments
= 1):
132 bscheme
, bnetloc
, bpath
, bparams
, bquery
, bfragment
= \
133 urlparse(base
, '', allow_fragments
)
134 scheme
, netloc
, path
, params
, query
, fragment
= \
135 urlparse(url
, bscheme
, allow_fragments
)
136 # XXX Unofficial hack: default netloc to bnetloc even if
138 if scheme
!= bscheme
and not netloc
and \
139 scheme
in uses_relative
and bscheme
in uses_relative
and \
140 scheme
in uses_netloc
and bscheme
in uses_netloc
:
142 # Strip the port number
143 i
= find(netloc
, '@')
145 i
= find(netloc
, ':', i
)
148 if scheme
!= bscheme
or scheme
not in uses_relative
:
149 return urlunparse((scheme
, netloc
, path
,
150 params
, query
, fragment
))
151 if scheme
in uses_netloc
:
153 return urlunparse((scheme
, netloc
, path
,
154 params
, query
, fragment
))
157 return urlunparse((scheme
, netloc
, path
,
158 params
, query
, fragment
))
160 return urlunparse((scheme
, netloc
, bpath
,
161 params
, query
or bquery
, fragment
))
162 i
= rfind(bpath
, '/')
164 path
= bpath
[:i
] + '/' + path
165 segments
= splitfields(path
, '/')
166 if segments
[-1] == '.':
168 while '.' in segments
:
172 n
= len(segments
) - 1
174 if segments
[i
] == '..' and segments
[i
-1]:
175 del segments
[i
-1:i
+1]
180 if len(segments
) == 2 and segments
[1] == '..' and segments
[0] == '':
182 elif len(segments
) >= 2 and segments
[-1] == '..':
184 return urlunparse((scheme
, netloc
, joinfields(segments
, '/'),
185 params
, query
, fragment
))
188 """Removes any existing fragment from URL.
190 Returns a tuple of the defragmented URL and the fragment. If
191 the URL contained no fragments, the second element is the
194 s
, n
, p
, a
, q
, frag
= urlparse(url
)
195 defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
203 http:g = <URL:http://a/b/c/g>
204 http: = <URL:http://a/b/c/d>
205 g = <URL:http://a/b/c/g>
206 ./g = <URL:http://a/b/c/g>
207 g/ = <URL:http://a/b/c/g/>
208 /g = <URL:http://a/g>
210 ?y = <URL:http://a/b/c/d?y>
211 g?y = <URL:http://a/b/c/g?y>
212 g?y/./x = <URL:http://a/b/c/g?y/./x>
213 . = <URL:http://a/b/c/>
214 ./ = <URL:http://a/b/c/>
215 .. = <URL:http://a/b/>
216 ../ = <URL:http://a/b/>
217 ../g = <URL:http://a/b/g>
218 ../.. = <URL:http://a/>
219 ../../g = <URL:http://a/g>
220 ../../../g = <URL:http://a/../g>
221 ./../g = <URL:http://a/b/g>
222 ./g/. = <URL:http://a/b/c/g/>
223 /./g = <URL:http://a/./g>
224 g/./h = <URL:http://a/b/c/g/h>
225 g/../h = <URL:http://a/b/c/h>
226 http:g = <URL:http://a/b/c/g>
227 http: = <URL:http://a/b/c/d>
228 http:?y = <URL:http://a/b/c/d?y>
229 http:g?y = <URL:http://a/b/c/g?y>
230 http:g?y/./x = <URL:http://a/b/c/g?y/./x>
232 # XXX The result for //g is actually http://g/; is this a problem?
245 fp
= StringIO
.StringIO(test_input
)
249 words
= string
.split(line
)
253 parts
= urlparse(url
)
254 print '%-10s : %s' % (url
, parts
)
255 abs = urljoin(base
, url
)
258 wrapped
= '<URL:%s>' % abs
259 print '%-10s = %s' % (url
, wrapped
)
260 if len(words
) == 3 and words
[1] == '=':
261 if wrapped
!= words
[2]:
262 print 'EXPECTED', words
[2], '!!!!!!!!!!'
264 if __name__
== '__main__':