Lib/urlparse.py

   1 # Parse (absolute and relative) URLs.  See RFC 1808: "Relative Uniform
   2 # Resource Locators", by R. Fielding, UC Irvine, June 1995.
   3
   4 # Standard/builtin Python modules
   5 import string
   6 from string import joinfields, splitfields, find, rfind
   7
   8 # A classification of schemes ('' means apply by default)
   9 uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'wais', 'file',
  10                  'https', 'shttp',
  11                  'prospero', '']
  12 uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet', 'wais',
  13                'https', 'shttp', 'snews',
  14                'prospero', '']
  15 non_hierarchical = ['gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais',
  16                     'snews',
  17                     ]
  18 uses_params = ['ftp', 'hdl', 'prospero', 'http',
  19                'https', 'shttp',
  20                '']
  21 uses_query = ['http', 'wais',
  22               'https', 'shttp',
  23               'gopher',
  24               '']
  25 uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
  26                  'https', 'shttp', 'snews',
  27                  'file', 'prospero', '']
  28
  29 # Characters valid in scheme names
  30 scheme_chars = string.letters + string.digits + '+-.'
  31
  32 MAX_CACHE_SIZE = 20
  33 _parse_cache = {}
  34
  35 def clear_cache():
  36     """Clear the parse cache."""
  37     global _parse_cache
  38     _parse_cache = {}
  39
  40
  41 # Parse a URL into 6 components:
  42 # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
  43 # Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
  44 # Note that we don't break the components up in smaller bits
  45 # (e.g. netloc is a single string) and we don't expand % escapes.
  46 def urlparse(url, scheme = '', allow_framents = 1):
  47         key = url, scheme, allow_framents
  48         try:
  49             return _parse_cache[key]
  50         except KeyError:
  51             pass
  52         if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
  53             clear_cache()
  54         netloc = path = params = query = fragment = ''
  55         i = string.find(url, ':')
  56         if i > 0:
  57                 for c in url[:i]:
  58                         if c not in scheme_chars:
  59                                 break
  60                 else:
  61                         scheme, url = string.lower(url[:i]), url[i+1:]
  62         if scheme in uses_netloc:
  63                 if url[:2] == '//':
  64                         i = string.find(url, '/', 2)
  65                         if i < 0:
  66                                 i = len(url)
  67                         netloc, url = url[2:i], url[i:]
  68         if allow_framents and scheme in uses_fragment:
  69                 i = string.rfind(url, '#')
  70                 if i >= 0:
  71                         url, fragment = url[:i], url[i+1:]
  72         if scheme in uses_query:
  73                 i = string.find(url, '?')
  74                 if i >= 0:
  75                         url, query = url[:i], url[i+1:]
  76         if scheme in uses_params:
  77                 i = string.find(url, ';')
  78                 if i >= 0:
  79                         url, params = url[:i], url[i+1:]
  80         tuple = scheme, netloc, url, params, query, fragment
  81         _parse_cache[key] = tuple
  82         return tuple
  83
  84 # Put a parsed URL back together again.  This may result in a slightly
  85 # different, but equivalent URL, if the URL that was parsed originally
  86 # had redundant delimiters, e.g. a ? with an empty query (the draft
  87 # states that these are equivalent).
  88 def urlunparse((scheme, netloc, url, params, query, fragment)):
  89         if netloc:
  90                 if url[:1] != '/': url = '/' + url
  91                 url = '//' + netloc + url
  92         if scheme:
  93                 url = scheme + ':' + url
  94         if params:
  95                 url = url + ';' + params
  96         if query:
  97                 url = url + '?' + query
  98         if fragment:
  99                 url = url + '#' + fragment
 100         return url
 101
 102 # Join a base URL and a possibly relative URL to form an absolute
 103 # interpretation of the latter.
 104 def urljoin(base, url, allow_framents = 1):
 105         if not base:
 106                 return url
 107         bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
 108                 urlparse(base, '', allow_framents)
 109         scheme, netloc, path, params, query, fragment = \
 110                 urlparse(url, bscheme, allow_framents)
 111         # XXX Unofficial hack: default netloc to bnetloc even if
 112         # schemes differ
 113         if scheme != bscheme and not netloc and \
 114            scheme in uses_relative and bscheme in uses_relative and \
 115            scheme in uses_netloc and bscheme in uses_netloc:
 116            netloc = bnetloc
 117            # Strip the port number
 118            i = find(netloc, '@')
 119            if i < 0: i = 0
 120            i = find(netloc, ':', i)
 121            if i >= 0:
 122                    netloc = netloc[:i]
 123         if scheme != bscheme or scheme not in uses_relative:
 124                 return urlunparse((scheme, netloc, path,
 125                                    params, query, fragment))
 126         if scheme in uses_netloc:
 127                 if netloc:
 128                         return urlunparse((scheme, netloc, path,
 129                                            params, query, fragment))
 130                 netloc = bnetloc
 131         if path[:1] == '/':
 132                 return urlunparse((scheme, netloc, path,
 133                                    params, query, fragment))
 134         if not path:
 135                 return urlunparse((scheme, netloc, bpath,
 136                                    params, query or bquery, fragment))
 137         i = rfind(bpath, '/')
 138         if i >= 0:
 139                 path = bpath[:i] + '/' + path
 140         segments = splitfields(path, '/')
 141         if segments[-1] == '.':
 142                 segments[-1] = ''
 143         while '.' in segments:
 144                 segments.remove('.')
 145         while 1:
 146                 i = 1
 147                 n = len(segments) - 1
 148                 while i < n:
 149                         if segments[i] == '..' and segments[i-1]:
 150                                 del segments[i-1:i+1]
 151                                 break
 152                         i = i+1
 153                 else:
 154                         break
 155         if len(segments) == 2 and segments[1] == '..' and segments[0] == '':
 156                 segments[-1] = ''
 157         elif len(segments) >= 2 and segments[-1] == '..':
 158                 segments[-2:] = ['']
 159         return urlunparse((scheme, netloc, joinfields(segments, '/'),
 160                            params, query, fragment))
 161
 162 def urldefrag(url):
 163     """Removes any existing fragment from URL.
 164
 165     Returns a tuple of the defragmented URL and the fragment.  If
 166     the URL contained no fragments, the second element is the
 167     empty string.
 168     """
 169     s, n, p, a, q, frag = urlparse(url)
 170     defrag = urlunparse((s, n, p, a, q, ''))
 171     return defrag, frag
 172
 173
# Built-in cases for test().  Lines are split on whitespace and blank
# lines are skipped.  The first word of the first non-blank line becomes
# the base URL; on every following line the first word is joined against
# that base, and a line of the form "<ref> = <URL:...>" also states the
# expected result.  The last two cases repeat earlier ones.
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
"""
 204
 205 def test():
 206         import sys
 207         base = ''
 208         if sys.argv[1:]:
 209                 fn = sys.argv[1]
 210                 if fn == '-':
 211                         fp = sys.stdin
 212                 else:
 213                         fp = open(fn)
 214         else:
 215                 import StringIO
 216                 fp = StringIO.StringIO(test_input)
 217         while 1:
 218                 line = fp.readline()
 219                 if not line: break
 220                 words = string.split(line)
 221                 if not words:
 222                         continue
 223                 url = words[0]
 224                 parts = urlparse(url)
 225                 print '%-10s : %s' % (url, parts)
 226                 abs = urljoin(base, url)
 227                 if not base:
 228                         base = abs
 229                 wrapped = '<URL:%s>' % abs
 230                 print '%-10s = %s' % (url, wrapped)
 231                 if len(words) == 3 and words[1] == '=':
 232                         if wrapped != words[2]:
 233                                 print 'EXPECTED', words[2], '!!!!!!!!!!'
 234
 235 if __name__ == '__main__':
 236         test()