Bump version to 0.9.1.
[python/dscho.git] / Lib / urlparse.py
blob929315e9630018183fb4ee3cadf6eff76af6b1c2
1 """Parse (absolute and relative) URLs.
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4 UC Irvine, June 1995.
5 """
7 # Standard/builtin Python modules
8 import string
9 from string import join, split, rfind
# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', 'rtsp', 'rtspu', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', 'rtsp', 'rtspu', '']
# Schemes whose remainder is opaque (no hierarchical path structure).
# NOTE(review): the extracted source truncated this literal after 'sip';
# the closing '' default entry and bracket are restored here so the
# module is syntactically valid again.
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews', 'sip',
                    '']
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher', 'rtsp', 'rtspu', 'sip',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
# Characters valid in scheme names
# (RFC-style scheme = letter, then letters/digits/'+'/'-'/'.';
# this string is used as a membership set, not a pattern).
scheme_chars = string.letters + string.digits + '+-.'

# urlparse() memoizes its results keyed on (url, scheme, allow_fragments);
# once the cache holds MAX_CACHE_SIZE entries it is wholesale cleared to
# avoid runaway growth (see clear_cache()).
MAX_CACHE_SIZE = 20
_parse_cache = {}
def clear_cache():
    """Discard all memoized urlparse() results.

    Rebinds the module-level cache to a fresh empty dict (rather than
    mutating the old one) so previously returned tuples stay valid.
    """
    global _parse_cache
    _parse_cache = {}
def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Memoize on the full argument triple; a cache hit short-circuits
    # all parsing below.
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    # Crude size bound: once full, throw the whole cache away.
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    find = string.find
    netloc = path = params = query = fragment = ''
    i = find(url, ':')
    if i > 0:
        # Fast path for 'http:' URLs: duplicates the generic logic below
        # but skips the per-character scheme validation and the
        # uses_netloc/uses_fragment/... membership tests.
        if url[:i] == 'http': # optimize the common case
            scheme = string.lower(url[:i])
            url = url[i+1:]
            if url[:2] == '//':
                # netloc runs from after '//' up to the next '/'
                # (or to end of string if there is no path).
                i = find(url, '/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            # Strip components from the right: fragment, query, params.
            if allow_fragments:
                i = string.rfind(url, '#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = find(url, '?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = find(url, ';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            # NOTE(review): 'tuple' shadows the builtin name here.
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        # Generic case: only accept the prefix as a scheme if every
        # character before ':' is a valid scheme character.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = string.lower(url[:i]), url[i+1:]
    # Each component is only split off for schemes known to use it.
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = find(url, '/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = string.rfind(url, '#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = find(url, '?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = find(url, ';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple
def urlunparse(components):
    """Put a parsed URL back together again. This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    'components' is the 6-tuple (scheme, netloc, url, params, query,
    fragment) as returned by urlparse().
    """
    # The original signature used tuple parameter unpacking
    # (def urlunparse((scheme, ...))), which PEP 3113 removed from the
    # language.  Unpacking explicitly is call-compatible: the function
    # still takes exactly one 6-tuple argument.
    (scheme, netloc, url, params, query, fragment) = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A netloc always gets the '//' prefix and a '/'-rooted path.
        if url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # Parse 'url' with the base's scheme as the default scheme.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # Different scheme, or one that has no relative-URL semantics:
    # 'url' stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        # An explicit netloc in 'url' makes it absolute; otherwise
        # inherit the base's netloc.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    # A '/'-rooted path replaces the base path wholesale.
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Empty path: keep the base path; 'url' may still override the query.
    if not path:
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    # Merge: base path minus its last segment, then the relative path.
    segments = split(bpath, '/')[:-1] + split(path, '/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' means "this directory": keep the trailing slash.
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the leftmost 'name/..' pair (the inner loop
    # restarts from the left after each deletion; the for/else-style
    # while/else exits when no pair was found).
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1]:
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Leftover '..' at the end maps to a trailing slash.
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, join(segments, '/'),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    # NOTE(review): the closing triple-quote of this docstring was lost
    # in the extracted text (leaving a syntax error); restored above.
    # Parse, drop the fragment, and reassemble the rest.
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
# Self-test data for test() below: the first URL is the base; each
# following line is "relative = <URL:expected-absolute>".  The closing
# triple-quote of this literal was lost in the extracted text and is
# restored here.  (Lines are whitespace-split by test(), so the exact
# column alignment inside the string is cosmetic.)
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run urlparse()/urljoin() over test_input (or a file of URLs).

    With a command-line argument, reads URLs from that file ('-' for
    stdin); otherwise uses the built-in test_input.  The first URL
    seen becomes the join base.  For lines of the form
    'url = <URL:expected>' a mismatch is reported with 'EXPECTED'.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = string.split(line)
        if not words:
            # Skip blank lines.
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        # Join against the current base; the very first URL seeds it.
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        # Compare against the expected value, if the line carries one.
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'
# Run the self-test when executed as a script (not on import).
if __name__ == '__main__':
    test()