1 # Parse (absolute and relative) URLs. See RFC 1808: "Relative Uniform
2 # Resource Locators", by R. Fielding, UC Irvine, June 1995.
4 # Standard/builtin Python modules
import string
try:
    from string import joinfields, splitfields, find, rfind
except ImportError:
    # Python 3 dropped these function forms; the equivalent str methods
    # (join, split, find, rfind) are available on every string object.
    pass
# A classification of schemes ('' means apply by default)
#
# Each list answers one question about a scheme: may it be used in a
# relative reference, does it carry a //netloc part, and which of the
# ;params, ?query and #fragment delimiters are meaningful for it.
#
# NOTE(review): several of these literals were cut off after their first
# row; the trailing entries were restored from the historical
# standard-library module -- confirm against the canonical file.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'https', 'shttp', 'snews',
               'prospero', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names.  ascii_letters is used instead of the
# locale-dependent string.letters: RFC 1808 restricts scheme names to
# ASCII letters, digits, "+", "-" and ".".
scheme_chars = string.ascii_letters + string.digits + '+-.'
# Memo table used by urlparse(), keyed by its argument triple, and the
# entry count at which the table is emptied.
# NOTE(review): these two definitions and the function header were not
# visible here; the limit of 20 is an assumption -- confirm.
MAX_CACHE_SIZE = 20
_parse_cache = {}


def clear_cache():
    """Clear the parse cache."""
    # Empty the dict in place so every reference to it sees the reset.
    _parse_cache.clear()
41 # Parse a URL into 6 components:
42 # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
43 # Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
44 # Note that we don't break the components up in smaller bits
45 # (e.g. netloc is a single string) and we don't expand % escapes.
def urlparse(url, scheme='', allow_framents=1):
    """Split a URL into (scheme, netloc, path, params, query, fragment).

    url            -- the URL string to split.
    scheme         -- scheme to assume when url does not start with one.
    allow_framents -- if false, '#' is not treated as a fragment
                      delimiter.  (The misspelling is part of the public
                      signature and is kept for compatibility.)

    Components are returned as plain strings; missing ones are ''.  The
    netloc is not broken up further and % escapes are not expanded.
    Results are memoised in the module-level _parse_cache.
    """
    key = url, scheme, allow_framents
    try:
        return _parse_cache[key]
    except KeyError:
        pass
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        _parse_cache.clear()
    netloc = params = query = fragment = ''
    # A leading "name:" is a scheme only if every character of name is a
    # legal scheme character; otherwise the colon belongs to the path.
    i = url.find(':')
    if i > 0:
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            # The netloc runs up to the next '/', or to the end of the
            # string when there is no path at all.
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    # Peel the trailing components off right to left: fragment first,
    # then query, then params; whatever remains is the path.
    if allow_framents and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    result = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = result
    return result
84 # Put a parsed URL back together again. This may result in a slightly
85 # different, but equivalent URL, if the URL that was parsed originally
86 # had redundant delimiters, e.g. a ? with an empty query (the draft
87 # states that these are equivalent).
def urlunparse(components):
    """Reassemble a URL string from its six components.

    components -- a 6-item sequence
                  (scheme, netloc, path, params, query, fragment),
                  passed as a single argument.

    Empty components are omitted together with their delimiter, so the
    result may differ textually from the URL that was parsed (for
    example a '?' followed by an empty query disappears).  When a netloc
    is present the path is forced to start with '/'.
    """
    # Unpack in the body: tuple parameters in the def line are not valid
    # syntax in Python 3.  Callers still pass one 6-item sequence.
    scheme, netloc, url, params, query, fragment = components
    if netloc:
        if url[:1] != '/':
            url = '/' + url
        url = '//' + netloc + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
102 # Join a base URL and a possibly relative URL to form an absolute
103 # interpretation of the latter.
def urljoin(base, url, allow_framents=1):
    """Resolve url against base and return the resulting URL string.

    base           -- the URL that url is interpreted relative to; when
                      it is empty, url is returned unchanged.
    url            -- an absolute or relative URL.
    allow_framents -- passed through to urlparse() for both arguments
                      (the misspelling is part of the public signature).

    Follows the RFC 1808 resolution steps: a url with a different or
    non-relative scheme is returned as is (after normalisation); a url
    with its own netloc or an absolute path replaces those parts of
    base; otherwise the path is merged with the base path and '.' and
    '..' segments are collapsed.
    """
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_framents)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_framents)
    # XXX Unofficial hack: default netloc to bnetloc even if
    # schemes differ
    if (scheme != bscheme and not netloc
            and scheme in uses_relative and bscheme in uses_relative
            and scheme in uses_netloc and bscheme in uses_netloc):
        netloc = bnetloc
        # Strip the port number (searching after any user-info '@' so a
        # ':' inside "user:password@" is not mistaken for it)
        i = netloc.find('@')
        if i < 0:
            i = 0
        i = netloc.find(':', i)
        if i >= 0:
            netloc = netloc[:i]
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # Empty relative path: keep the base path, and the base query
        # too unless the reference supplies its own.
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Replace the last segment of the base path with the relative path.
    i = bpath.rfind('/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = path.split('/')
    # A trailing '.' names the directory itself: keep the final slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Collapse every "<segment>/.." pair, rescanning from the start after
    # each removal.  The first and last segments are never consumed, and
    # a '..' must not cancel a preceding '..' (RFC 1808 section 5.2:
    # "../../../../g" resolves to "http://a/../../g").
    i = 1
    while i < len(segments) - 1:
        if segments[i] == '..' and segments[i - 1] not in ('', '..'):
            del segments[i - 1:i + 1]
            i = 1
        else:
            i = i + 1
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        # A trailing "<segment>/.." collapses to the parent directory.
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
# NOTE(review): the def line of this function was not visible; the name
# urldefrag is taken from the historical module -- confirm.
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    s, n, p, a, q, frag = urlparse(url)
    # Rebuild the URL with an empty fragment slot.
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
# Test vectors consumed by the self-test below.  The first non-blank
# line is the base URL; every other line reads
# "<relative URL> = <URL:expected result of joining it to the base>".
# NOTE(review): the assignment header and the base line were not
# visible; the base "http://a/b/c/d" is the one the expected results
# below imply (e.g. "http:" resolves to it) -- confirm.
test_input = """
      http://a/b/c/d

      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""
# NOTE(review): the def line, the argument handling and the read loop
# were not visible; they were restored from the historical module (name
# "test", optional file argument, "-" for standard input) -- confirm.
def test():
    """Run the join self-test and print the results.

    Input lines come from the file named by the first command-line
    argument ('-' means standard input), or from test_input when no
    argument is given.  The first URL read becomes the base for all
    later ones.  For every line the parsed components and the joined
    URL are printed; when the line carries an "= <URL:...>" expectation
    that differs from the result, an EXPECTED line is printed as well.
    """
    import sys
    opened = None
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            lines = sys.stdin
        else:
            opened = lines = open(fn)
    else:
        # Iterating the split string avoids the StringIO module, whose
        # import path differs between Python 2 and Python 3.
        lines = test_input.splitlines()
    try:
        base = ''
        for line in lines:
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            absolute = urljoin(base, url)
            if not base:
                base = absolute
            wrapped = '<URL:%s>' % absolute
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Close only a file opened here, never sys.stdin.
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()