# Merged release21-maint changes.
# [python/dscho.git] / Lib / urlparse.py
# blob 1df83d68d31b13b895c069657e149271cbf2fcac
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Public API; everything else in the module is an implementation detail.
__all__ = ["urlparse", "urlunparse", "urljoin"]
9 # A classification of schemes ('' means apply by default)
10 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
11 'https', 'shttp',
12 'prospero', 'rtsp', 'rtspu', '']
13 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
14 'file',
15 'https', 'shttp', 'snews',
16 'prospero', 'rtsp', 'rtspu', '']
17 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
18 'snews', 'sip',
20 uses_params = ['ftp', 'hdl', 'prospero', 'http',
21 'https', 'shttp', 'rtsp', 'rtspu', 'sip',
22 '']
23 uses_query = ['http', 'wais',
24 'https', 'shttp',
25 'gopher', 'rtsp', 'rtspu', 'sip',
26 '']
27 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
28 'https', 'shttp', 'snews',
29 'file', 'prospero', '']
31 # Characters valid in scheme names
32 scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
33 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
34 '0123456789'
35 '+-.')
37 MAX_CACHE_SIZE = 20
38 _parse_cache = {}
def clear_cache():
    """Discard all memoized urlparse() results.

    Rebinds the module-level cache to a fresh dict rather than mutating
    it, so any outstanding references to the old dict are unaffected.
    """
    global _parse_cache
    _parse_cache = {}
def urlparse(url, scheme = '', allow_fragments = 1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    `scheme` is the default used when the URL carries none; fragments
    are only split off when `allow_fragments` is true.  Results are
    memoized in _parse_cache, keyed on all three arguments.
    """
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key, None)
    if cached is not None:      # a hit is always a 6-tuple; None means miss
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = path = params = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                i = url.find('/', 2)
                if i < 0:
                    i = len(url)
                netloc = url[2:i]
                url = url[i:]
            if allow_fragments:
                i = url.rfind('#')
                if i >= 0:
                    fragment = url[i+1:]
                    url = url[:i]
            i = url.find('?')
            if i >= 0:
                query = url[i+1:]
                url = url[:i]
            i = url.find(';')
            if i >= 0:
                params = url[i+1:]
                url = url[:i]
            # renamed from 'tuple': don't shadow the builtin
            components = scheme, netloc, url, params, query, fragment
            _parse_cache[key] = components
            return components
        # Only treat the prefix as a scheme if it is entirely made of
        # characters legal in scheme names.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i+1:]
    # Generic path: strip each component only for schemes that use it.
    if scheme in uses_netloc:
        if url[:2] == '//':
            i = url.find('/', 2)
            if i < 0:
                i = len(url)
            netloc, url = url[2:i], url[i:]
    if allow_fragments and scheme in uses_fragment:
        i = url.rfind('#')
        if i >= 0:
            url, fragment = url[:i], url[i+1:]
    if scheme in uses_query:
        i = url.find('?')
        if i >= 0:
            url, query = url[:i], url[i+1:]
    if scheme in uses_params:
        i = url.find(';')
        if i >= 0:
            url, params = url[:i], url[i+1:]
    components = scheme, netloc, url, params, query, fragment
    _parse_cache[key] = components
    return components
def urlunparse(components):
    """Put a parsed URL back together again. This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    `components` is a (scheme, netloc, url, params, query, fragment)
    6-tuple as returned by urlparse().  The Python-2-only tuple
    parameter in the signature was replaced by explicit unpacking
    (PEP 3113); callers still pass a single tuple, unchanged.
    """
    scheme, netloc, url, params, query, fragment = components
    if netloc or (scheme in uses_netloc and url[:2] == '//'):
        # A netloc forces the '//' form; the path must then be rooted.
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if params:
        url = url + ';' + params
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # An empty side contributes nothing: return the other unchanged.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    # url is parsed with base's scheme as its default.
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    # A different scheme, or one that doesn't support relative URLs,
    # means url stands on its own.
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        # An explicit netloc in url makes it absolute; otherwise
        # inherit the base's netloc.
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: replaces the base path entirely.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # No path at all: keep the base path; params/query fall back to
        # the base's values only when url supplied none.
        if not params:
            params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           params, query, fragment))
    # Relative path: graft it onto the base path's directory, then
    # resolve '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    # A trailing '.' keeps the trailing slash ('g/.' -> 'g/').
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse the first 'dir/..' pair found (skipping
    # leading '' and unresolvable '..'), rescanning after each removal.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    # Tidy leftover trailing '..' so the result keeps a trailing slash.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    # (The closing quotes above restore a docstring terminator lost in
    # transcription; without it the module did not compile.)
    s, n, p, a, q, frag = urlparse(url)
    defrag = urlunparse((s, n, p, a, q, ''))
    return defrag, frag
# RFC 1808 relative-resolution examples: each "url = <URL:expected>"
# line is checked by test() below.  The first bare URL becomes the base.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y    = <URL:http://a/b/c/d?y>
      http:g?y   = <URL:http://a/b/c/g?y>
      http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
# The closing quotes above restore a terminator lost in transcription.
# XXX The result for //g is actually http://g/; is this a problem?
def test():
    """Run the urljoin examples.

    Reads "url [= <URL:expected>]" lines from the file named in
    sys.argv[1] ('-' means stdin) or, with no argument, from the
    built-in test_input string.  The first URL seen becomes the base
    for all subsequent joins; mismatches print an EXPECTED line.
    """
    import sys
    base = ''
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
    else:
        import StringIO     # Python 2 module; this file targets 2.x
        fp = StringIO.StringIO(test_input)
    while 1:
        line = fp.readline()
        if not line: break
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        # print() with a single argument behaves identically under
        # Python 2's print statement and Python 3's print function.
        print('%-10s : %s' % (url, parts))
        abs = urljoin(base, url)
        if not base:
            base = abs
        wrapped = '<URL:%s>' % abs
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])

if __name__ == '__main__':
    test()