Fix an amazing number of typos & malformed sentences reported by Detlef
[python/dscho.git] / Lib / urlparse.py
blob148633e954e8e37891a65caf667c9071e6348674
1 # Parse (absolute and relative) URLs. See RFC 1808: "Relative Uniform
2 # Resource Locators", by R. Fielding, UC Irvine, June 1995.
4 # Standard/builtin Python modules
5 import string
6 from string import joinfields, splitfields, find, rfind
8 # A classification of schemes ('' means apply by default)
9 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
10 'https', 'shttp',
11 'prospero', '']
12 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
13 'https', 'shttp', 'snews',
14 'prospero', '']
15 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
16 'snews',
18 uses_params = ['ftp', 'hdl', 'prospero', 'http',
19 'https', 'shttp',
20 '']
21 uses_query = ['http', 'wais',
22 'https', 'shttp',
23 'gopher',
24 '']
25 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
26 'https', 'shttp', 'snews',
27 'file', 'prospero', '']
29 # Characters valid in scheme names
30 scheme_chars = string.letters + string.digits + '+-.'
32 MAX_CACHE_SIZE = 20
33 _parse_cache = {}
def clear_cache():
    """Discard every memoized urlparse() result."""
    # Rebind (rather than mutate) so any concurrent reader of the old
    # dict keeps a consistent snapshot.
    global _parse_cache
    _parse_cache = {}
41 # Parse a URL into 6 components:
42 # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
43 # Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
44 # Note that we don't break the components up in smaller bits
45 # (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Returns a 6-tuple: (scheme, netloc, path, params, query, fragment).
    The components are not broken up into smaller bits (e.g. netloc is
    a single string) and % escapes are not expanded.
    """
    # Results are memoized on the full argument triple.
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    find = string.find
    netloc = path = params = query = fragment = ''
    i = find(url, ':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = string.lower(url[:i])
            url = url[i+1:]
            if url[:2] == '//':
                # Network location runs up to the next '/' (or the end).
                i = find(url, '/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                # Fragment is everything after the LAST '#'.
                i = string.rfind(url, '#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = find(url, '?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = find(url, ';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        # General case: accept the prefix before ':' as a scheme only if
        # every character in it is valid in a scheme name; otherwise the
        # caller-supplied default scheme stays in effect.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = string.lower(url[:i]), url[i+1:]
    # Split off the remaining components only for schemes known to use
    # them (see the uses_* tables above).
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = find(url, '/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = string.rfind(url, '#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = find(url, '?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = find(url, ';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple
109 # Put a parsed URL back together again. This may result in a slightly
110 # different, but equivalent URL, if the URL that was parsed originally
111 # had redundant delimiters, e.g. a ? with an empty query (the draft
112 # states that these are equivalent).
def urlunparse(parts):
    """Put a parsed URL back together again.

    'parts' is a (scheme, netloc, url, params, query, fragment)
    6-tuple as returned by urlparse().  This may result in a slightly
    different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).
    """
    # Unpack in the body instead of using a tuple parameter: tuple
    # parameters were removed in Python 3 (PEP 3113), and callers still
    # pass a single 6-tuple exactly as before.
    scheme, netloc, url, params, query, fragment = parts
    if netloc:
        # Ensure a '/' separates the netloc from the path.
        if url[:1] != '/': url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
127 # Join a base URL and a possibly relative URL to form an absolute
128 # interpretation of the latter.
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter (RFC 1808 resolution)."""
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # Parse the new URL, defaulting its scheme to the base's scheme.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (the base's port need not apply to a
        # different scheme).  Search for ':' only after any '@' so a
        # ':' inside the userinfo part is not mistaken for the port
        # delimiter.
        i = find(netloc, '@')
        if i < 0: i = 0
        i = find(netloc, ':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        # Different scheme, or one that never uses relative references:
        # the new URL stands on its own.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: used as given.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path at all: inherit the base's path (and its query when
        # none was given).
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Relative path: splice it onto the base path's directory, then
    # resolve '.' and '..' segments.
    i = rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = splitfields(path, '/')
    if segments[-1] == '.':
        # Trailing '.' means "this directory"; keep the trailing slash.
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel the leftmost 'name/..' pair.  A '..' is only
    # cancelled against a nonempty preceding segment, so '..' that
    # climbs above the root survives into the result.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1]:
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            # No pair was cancelled on this pass; we are done.
            break
    # Leftover '..' at the end collapses to a trailing slash.
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, joinfields(segments, '/'),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    scheme, netloc, path, params, query, fragment = urlparse(url)
    defragged = urlunparse((scheme, netloc, path, params, query, ''))
    return defragged, fragment
# Self-test table: each line is "<url> = <expected join against the base>".
# The first URL becomes the base.  The closing triple-quote was lost in
# transmission and is restored here; the data lines are unchanged.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

# XXX The result for //g is actually http://g/; is this a problem?
234 def test():
235 import sys
236 base = ''
237 if sys.argv[1:]:
238 fn = sys.argv[1]
239 if fn == '-':
240 fp = sys.stdin
241 else:
242 fp = open(fn)
243 else:
244 import StringIO
245 fp = StringIO.StringIO(test_input)
246 while 1:
247 line = fp.readline()
248 if not line: break
249 words = string.split(line)
250 if not words:
251 continue
252 url = words[0]
253 parts = urlparse(url)
254 print '%-10s : %s' % (url, parts)
255 abs = urljoin(base, url)
256 if not base:
257 base = abs
258 wrapped = '<URL:%s>' % abs
259 print '%-10s = %s' % (url, wrapped)
260 if len(words) == 3 and words[1] == '=':
261 if wrapped != words[2]:
262 print 'EXPECTED', words[2], '!!!!!!!!!!'
# Run the self-test when executed as a script (not on import).
if __name__ == '__main__':
    test()