Lib/urlparse.py
1 """Parse (absolute and relative) URLs.
3 See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
4 UC Irvine, June 1995.
5 """
7 # Standard/builtin Python modules
8 import string
9 from string import joinfields, splitfields, rfind

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
                 'https', 'shttp',
                 'prospero', '']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
               'file',
               'https', 'shttp', 'snews',
               'prospero', '']
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
                    'snews',
                    ]
uses_params = ['ftp', 'hdl', 'prospero', 'http',
               'https', 'shttp',
               '']
uses_query = ['http', 'wais',
              'https', 'shttp',
              'gopher',
              '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
                 'https', 'shttp', 'snews',
                 'file', 'prospero', '']
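
# Illustrative note (not from the original comments): urlparse() below
# consults these lists to decide which delimiters matter for a scheme.
# For example, 'mailto' appears in none of uses_query, uses_params or
# uses_fragment, so a '?' stays part of the path (hypothetical address):
#
#   >>> urlparse('mailto:user@example.com?subject=hi')
#   ('mailto', '', 'user@example.com?subject=hi', '', '', '')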

# Characters valid in scheme names
scheme_chars = string.letters + string.digits + '+-.'

MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache."""
    global _parse_cache
    _parse_cache = {}
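
# Sketch of the cache behaviour (a reading of the code above, shown
# doctest-style): identical (url, scheme, allow_fragments) keys are served
# from _parse_cache until it holds MAX_CACHE_SIZE entries, at which point
# the next parse empties it and starts over.
#
#   >>> urlparse('http://a/b')
#   ('http', 'a', '/b', '', '', '')
#   >>> urlparse('http://a/b')      # same key, returned from the cache
#   ('http', 'a', '/b', '', '', '')
#   >>> clear_cache()               # explicit reset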

def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up into smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    find = string.find
    netloc = path = params = query = fragment = ''
    i = find(url, ':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = string.lower(url[:i])
            url = url[i+1:]
            if url[:2] == '//':
                i = find(url, '/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = string.rfind(url, '#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = find(url, '?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = find(url, ';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            tuple = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = tuple
            return tuple
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # only treat url[:i] as a scheme if every character qualified
            scheme, url = string.lower(url[:i]), url[i+1:]
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = find(url, '/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = string.rfind(url, '#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = find(url, '?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = find(url, ';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    tuple = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = tuple
    return tuple
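
# Usage sketch (hypothetical URL, doctest-style): each delimited piece of
# the general form in the docstring lands in its own slot of the 6-tuple.
#
#   >>> urlparse('http://host/path;params?query#frag')
#   ('http', 'host', '/path', 'params', 'query', 'frag')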

def urlunparse((scheme, netloc, url, params, query, fragment)):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        if url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
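
# Round-trip sketch, reusing the hypothetical tuple from the urlparse
# example above:
#
#   >>> urlunparse(('http', 'host', '/path', 'params', 'query', 'frag'))
#   'http://host/path;params?query#frag'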

def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        return urlunparse((scheme, netloc, bpath,
                           params, query or bquery, fragment))
    i = rfind(bpath, '/')
    if i >= 0:
        path = bpath[:i] + '/' + path
    segments = splitfields(path, '/')
    # collapse '.' segments; a trailing '.' keeps the trailing slash
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # repeatedly collapse the leftmost non-empty segment followed by '..'
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if segments[i] == '..' and segments[i-1]:
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, joinfields(segments, '/'),
                       params, query, fragment))
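
# Resolution sketch, mirroring two of the RFC 1808 cases in test_input
# below:
#
#   >>> urljoin('http://a/b/c/d', 'g')
#   'http://a/b/c/g'
#   >>> urljoin('http://a/b/c/d', '../g')
#   'http://a/b/g'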

def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
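
# Sketch on a hypothetical URL:
#
#   >>> urldefrag('http://host/path#section')
#   ('http://host/path', 'section')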

test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""

# XXX The result for //g is actually http://g/; is this a problem?

def test():
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = string.split(line)
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print '%-10s : %s' % (url, parts)
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print '%-10s = %s' % (url, wrapped)
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print 'EXPECTED', words[2], '!!!!!!!!!!'

if __name__ == '__main__':
    test()
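
# To run the self-test above (this requires a print-statement era Python,
# since the module predates Python 3):
#
#   python urlparse.py            # check the RFC 1808 cases in test_input
#   python urlparse.py urls.txt   # or lines from a file ('-' for stdin)
#
# 'urls.txt' here is just a hypothetical input file in the same
# "url = <URL:expected>" format.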