Oops -- Lib/Test should be Lib/test, of course!
[python/dscho.git] / Lib / urlparse.py
blob185eb7fd6efa4faee194ac3d20d450fabcb65cef
# Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
# Resource Locators", by R. Fielding, UC Irvine, June 1995.
4 # Standard/builtin Python modules
5 import string
6 from string import joinfields, splitfields, find, rfind
# A classification of schemes ('' means apply by default).
# Each list names the URL schemes for which the corresponding piece of
# syntax is recognised; the empty string stands for "no scheme given".

# Schemes that allow relative references.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']

# Schemes that may carry a //<netloc> part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'https', 'shttp', 'snews',
               'prospero', '']

# NOTE(review): the line that closed this list was missing from the
# extracted text; it is closed here without adding any entry -- confirm
# against the upstream file.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]

# Schemes whose path may be followed by ;<params>.
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']

# Schemes whose path may be followed by ?<query>.
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher',
              '']

# Schemes whose URL may end in #<fragment>.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
# Characters valid in scheme names.  Spelled out explicitly (rather than
# taken from string.letters) so the set is the ASCII alphabet regardless
# of the current locale.
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# urlparse() memoises its results in _parse_cache, keyed on its arguments;
# once the cache holds MAX_CACHE_SIZE entries it is emptied wholesale.
MAX_CACHE_SIZE = 20
_parse_cache = {}
def clear_cache():
    """Clear the parse cache.

    Rebinds the module-level _parse_cache to a fresh empty dictionary.
    """
    global _parse_cache
    _parse_cache = {}
# Parse a URL into 6 components:
# <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
# Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
# Note that we don't break the components up into smaller bits
# (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme = '', allow_framents = 1):
    """Split a URL into (scheme, netloc, path, params, query, fragment).

    url            -- the URL string to split.
    scheme         -- scheme to assume when url does not start with one.
    allow_framents -- when false, a '#' is left in place instead of
                      starting a fragment.  (The misspelled name is part
                      of the existing interface and is kept as is.)

    Components are returned as plain strings: netloc is not broken up
    further and % escapes are not expanded.  Results are memoised in
    _parse_cache, keyed on the three arguments.
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = params = query = fragment = ''
    # Everything before the first ':' is a scheme only if it consists
    # solely of scheme characters; otherwise the default scheme stands.
    i = string.find(url, ':')
    if i > 0:
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = string.lower(url[:i]), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # netloc runs up to the next '/', or to the end of the URL.
            i = string.find(url, '/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    # Peel the trailing components off from right to left.
    if allow_framents and scheme in uses_fragment:
        i = string.rfind(url, '#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = string.find(url, '?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = string.find(url, ';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    # What is left of url is the path.
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
# Put a parsed URL back together again.  This may result in a slightly
# different, but equivalent URL, if the URL that was parsed originally
# had redundant delimiters, e.g. a ? with an empty query (the draft
# states that these are equivalent).
def urlunparse(components):
    """Reassemble a 6-tuple as produced by urlparse() into a URL string.

    components -- a sequence (scheme, netloc, path, params, query,
                  fragment) of strings.

    Empty components are left out together with their delimiter, so the
    result may differ from the originally parsed URL where that URL had
    redundant delimiters (e.g. a '?' followed by an empty query).  When a
    netloc is present the path is made to start with '/'.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        if url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
# Join a base URL and a possibly relative URL to form an absolute
# interpretation of the latter.
def urljoin(base, url, allow_framents = 1):
    """Join a base URL and a possibly relative URL into an absolute URL.

    base           -- the base URL; if empty, url is returned unchanged.
    url            -- the (possibly relative) URL to resolve; if empty,
                      base is returned unchanged.
    allow_framents -- passed through to urlparse() for both URLs.  (The
                      misspelled name is part of the existing interface.)

    Returns the resolved URL as a string, reassembled by urlunparse().
    """
    if not base:
        return url
    if not url:
        # RFC 1808 step 2b: an empty reference inherits the entire base.
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if scheme != bscheme and not netloc and \
       scheme in uses_relative and bscheme in uses_relative and \
       scheme in uses_netloc and bscheme in uses_netloc:
        netloc = bnetloc
        # Strip the port number (looking only after any user-info '@')
        i = find(netloc, '@')
        if i < 0: i = 0
        i = find(netloc, ':', i)
        if i >= 0:
            netloc = netloc[:i]
    # A different scheme, or one without relative forms: url is absolute.
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808 step 5: an empty path inherits the base path; the base
        # params are inherited only if url has none, and the base query
        # only if url has neither params nor query.
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Replace the last segment of the base path with the relative path.
    i = rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = splitfields(path, '/')
    # A trailing '.' stands for the directory itself: keep the final '/'.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first "<segment>/.." pair found.  The last
    # segment is handled separately below.  A preceding '' or '..' is not
    # a real segment and must not be consumed, so surplus '..' survive.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1] not in ('', '..'):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, joinfields(segments, '/'),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.

    The defragmented URL is rebuilt with urlunparse(), so redundant
    delimiters present in the input may not be reproduced.
    """
    scheme, netloc, path, params, query, frag = urlparse(url)
    defrag = urlunparse((scheme, netloc, path, params, query, ''))
    return defrag, frag
# Default input for test(): the first non-blank line is the base URL;
# every following line reads "<relative> = <URL:expected-join>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""
def test():
    """Run urlparse() and urljoin() over a list of URLs and print results.

    Input comes from the file named by the first command-line argument
    ('-' means standard input), or from test_input when no argument is
    given.  The first URL read becomes the base for all later joins.  A
    line of the form "<url> = <expected>" also has its joined result
    compared against <expected>; mismatches are reported with an
    'EXPECTED' line.
    """
    import sys
    base = ''
    opened = 0  # set when this function opened fp and so must close it
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
            opened = 1
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line: break
            words = string.split(line)
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Single-argument parenthesised print: same output whether
            # print is a statement or a function.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        if opened:
            fp.close()
# Run the self-test when the module is executed as a script.
if __name__ == '__main__':
    test()