# Merged release21-maint changes.
# [python/dscho.git] / Lib / urlparse.py
# blob 1df83d68d31b13b895c069657e149271cbf2fcac
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Public API; everything else in the module is an implementation detail.
__all__ = ["urlparse", "urlunparse", "urljoin"]
9 # A classification of schemes ('' means apply by default)
10 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
11 'https', 'shttp',
12 'prospero', 'rtsp', 'rtspu', '']
13 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
14 'file',
15 'https', 'shttp', 'snews',
16 'prospero', 'rtsp', 'rtspu', '']
17 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
18 'snews', 'sip',
20 uses_params = ['ftp', 'hdl', 'prospero', 'http',
21 'https', 'shttp', 'rtsp', 'rtspu', 'sip',
22 '']
23 uses_query = ['http', 'wais',
24 'https', 'shttp',
25 'gopher', 'rtsp', 'rtspu', 'sip',
26 '']
27 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
28 'https', 'shttp', 'snews',
29 'file', 'prospero', '']
31 # Characters valid in scheme names
32 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
33 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
34 '0123456789'
35 '+-.')
37 MAX_CACHE_SIZE = 20
38 _parse_cache = {}
def clear_cache():
    """Discard all memoized urlparse() results.

    Rebinds the module-level cache to a fresh dict rather than mutating
    it, so any outstanding references to the old dict are unaffected.
    """
    global _parse_cache
    _parse_cache = {}
def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    `scheme` is the default used when the URL carries none; fragments
    are only split off when `allow_fragments` is true.  Results are
    memoized in _parse_cache, keyed on all three arguments.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached is not None:      # a hit is always a 6-tuple; None means miss
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            # renamed from 'tuple': don't shadow the builtin
            components = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = components
            return components
        # Only treat the prefix as a scheme if it is entirely made of
        # characters legal in scheme names.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # Generic path: strip each component only for schemes that use it.
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    components = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = components
    return components
def urlunparse(components):
    """Put a parsed URL back together again. This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    `components` is a (scheme, netloc, url, params, query, fragment)
    6-tuple as returned by urlparse().  The Python-2-only tuple
    parameter in the signature was replaced by explicit unpacking
    (PEP 3113); callers still pass a single tuple, unchanged.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A netloc forces the '//' form; the path must then be rooted.
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # An empty side contributes nothing: return the other unchanged.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # url is parsed with base's scheme as its default.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that doesn't support relative URLs,
    # means url stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # An explicit netloc in url makes it absolute; otherwise
        # inherit the base's netloc.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path at all: keep the base path; params/query fall back to
        # the base's values only when url supplied none.
        if not params:
            params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: graft it onto the base path's directory, then
    # resolve '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' keeps the trailing slash ('g/.' -> 'g/').
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first 'dir/..' pair found (skipping
    # leading '' and unresolvable '..'), rescanning after each removal.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Tidy leftover trailing '..' so the result keeps a trailing slash.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    # (The closing quotes above restore a docstring terminator lost in
    # transcription; without it the module did not compile.)
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
# RFC 1808 relative-resolution examples: each "url = <URL:expected>"
# line is checked by test() below.  The first bare URL becomes the base.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# The closing quotes above restore a terminator lost in transcription.
# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run the urljoin examples.

    Reads "url [= <URL:expected>]" lines from the file named in
    sys.argv[1] ('-' means stdin) or, with no argument, from the
    built-in test_input string.  The first URL seen becomes the base
    for all subsequent joins; mismatches print an EXPECTED line.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO     # Python 2 module; this file targets 2.x
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        # print() with a single argument behaves identically under
        # Python 2's print statement and Python 3's print function.
        print('%-10s : %s' % (url, parts))
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])

if __name__ == '__main__':
    test()