app/python25src/urllib.py

   1 """Open an arbitrary URL.
   2
   3 See the following document for more info on URLs:
   4 "Names and Addresses, URIs, URLs, URNs, URCs", at
   5 http://www.w3.org/pub/WWW/Addressing/Overview.html
   6
   7 See also the HTTP spec (from which the error codes are derived):
   8 "HTTP - Hypertext Transfer Protocol", at
   9 http://www.w3.org/pub/WWW/Protocols/
  10
  11 Related standards and specs:
  12 - RFC1808: the "relative URL" spec. (authoritative status)
  13 - RFC1738 - the "URL standard". (authoritative status)
  14 - RFC1630 - the "URI spec". (informational status)
  15
  16 All code but that related to URL parsing has been removed (since it is not
  17 compatible with Google App Engine) from this fork of the original file,
  18 obtained from:
  19 http://svn.python.org/view/*checkout*/python/tags/r252/Lib/urllib.py?content-type=text%2Fplain&rev=60915
  20 """
  21
  22 import string
  23 import sys
  24 from urlparse import urljoin as basejoin
  25
# Names exported by "from urllib import *".  Only the URL quoting and
# splitting helpers are listed; toBytes() and always_safe are defined in
# this module but are not exported, and "basejoin" is urlparse.urljoin
# imported under that alias.
__all__ = ["quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "splittag",
           "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype",]

# Module version string.
__version__ = '1.17'    # XXX This version is not always updated :-(
  34
  35
  36 # Utilities to parse URLs (most of these return None for missing parts):
  37 # unwrap('<URL:type://host/path>') --> 'type://host/path'
  38 # splittype('type:opaquestring') --> 'type', 'opaquestring'
  39 # splithost('//host[:port]/path') --> 'host[:port]', '/path'
  40 # splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
  41 # splitpasswd('user:passwd') -> 'user', 'passwd'
  42 # splitport('host:port') --> 'host', 'port'
  43 # splitquery('/path?query') --> '/path', 'query'
  44 # splittag('/path#tag') --> '/path', 'tag'
  45 # splitattr('/path;attr1=value1;attr2=value2;...') ->
  46 #   '/path', ['attr1=value1', 'attr2=value2', ...]
  47 # splitvalue('attr=value') --> 'attr', 'value'
  48 # splitgophertype('/Xselector') --> 'X', 'selector'
  49 # unquote('abc%20def') -> 'abc def'
  50 # quote('abc def') -> 'abc%20def'
  51
  52 try:
  53     unicode
  54 except NameError:
  55     def _is_unicode(x):
  56         return 0
  57 else:
  58     def _is_unicode(x):
  59         return isinstance(x, unicode)
  60
  61 def toBytes(url):
  62     """toBytes(u"URL") --> 'URL'."""
  63     # Most URL schemes require ASCII. If that changes, the conversion
  64     # can be relaxed
  65     if _is_unicode(url):
  66         try:
  67             url = url.encode("ASCII")
  68         except UnicodeError:
  69             raise UnicodeError("URL " + repr(url) +
  70                                " contains non-ASCII characters")
  71     return url
  72
  73 def unwrap(url):
  74     """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
  75     url = url.strip()
  76     if url[:1] == '<' and url[-1:] == '>':
  77         url = url[1:-1].strip()
  78     if url[:4] == 'URL:': url = url[4:].strip()
  79     return url
  80
  81 _typeprog = None
  82 def splittype(url):
  83     """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
  84     global _typeprog
  85     if _typeprog is None:
  86         import re
  87         _typeprog = re.compile('^([^/:]+):')
  88
  89     match = _typeprog.match(url)
  90     if match:
  91         scheme = match.group(1)
  92         return scheme.lower(), url[len(scheme) + 1:]
  93     return None, url
  94
  95 _hostprog = None
  96 def splithost(url):
  97     """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
  98     global _hostprog
  99     if _hostprog is None:
 100         import re
 101         _hostprog = re.compile('^//([^/?]*)(.*)$')
 102
 103     match = _hostprog.match(url)
 104     if match: return match.group(1, 2)
 105     return None, url
 106
 107 _userprog = None
 108 def splituser(host):
 109     """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
 110     global _userprog
 111     if _userprog is None:
 112         import re
 113         _userprog = re.compile('^(.*)@(.*)$')
 114
 115     match = _userprog.match(host)
 116     if match: return map(unquote, match.group(1, 2))
 117     return None, host
 118
 119 _passwdprog = None
 120 def splitpasswd(user):
 121     """splitpasswd('user:passwd') -> 'user', 'passwd'."""
 122     global _passwdprog
 123     if _passwdprog is None:
 124         import re
 125         _passwdprog = re.compile('^([^:]*):(.*)$')
 126
 127     match = _passwdprog.match(user)
 128     if match: return match.group(1, 2)
 129     return user, None
 130
 131 # splitport('host:port') --> 'host', 'port'
 132 _portprog = None
 133 def splitport(host):
 134     """splitport('host:port') --> 'host', 'port'."""
 135     global _portprog
 136     if _portprog is None:
 137         import re
 138         _portprog = re.compile('^(.*):([0-9]+)$')
 139
 140     match = _portprog.match(host)
 141     if match: return match.group(1, 2)
 142     return host, None
 143
 144 _nportprog = None
 145 def splitnport(host, defport=-1):
 146     """Split host and port, returning numeric port.
 147     Return given default port if no ':' found; defaults to -1.
 148     Return numerical port if a valid number are found after ':'.
 149     Return None if ':' but not a valid number."""
 150     global _nportprog
 151     if _nportprog is None:
 152         import re
 153         _nportprog = re.compile('^(.*):(.*)$')
 154
 155     match = _nportprog.match(host)
 156     if match:
 157         host, port = match.group(1, 2)
 158         try:
 159             if not port: raise ValueError, "no digits"
 160             nport = int(port)
 161         except ValueError:
 162             nport = None
 163         return host, nport
 164     return host, defport
 165
 166 _queryprog = None
 167 def splitquery(url):
 168     """splitquery('/path?query') --> '/path', 'query'."""
 169     global _queryprog
 170     if _queryprog is None:
 171         import re
 172         _queryprog = re.compile('^(.*)\?([^?]*)$')
 173
 174     match = _queryprog.match(url)
 175     if match: return match.group(1, 2)
 176     return url, None
 177
 178 _tagprog = None
 179 def splittag(url):
 180     """splittag('/path#tag') --> '/path', 'tag'."""
 181     global _tagprog
 182     if _tagprog is None:
 183         import re
 184         _tagprog = re.compile('^(.*)#([^#]*)$')
 185
 186     match = _tagprog.match(url)
 187     if match: return match.group(1, 2)
 188     return url, None
 189
 190 def splitattr(url):
 191     """splitattr('/path;attr1=value1;attr2=value2;...') ->
 192         '/path', ['attr1=value1', 'attr2=value2', ...]."""
 193     words = url.split(';')
 194     return words[0], words[1:]
 195
 196 _valueprog = None
 197 def splitvalue(attr):
 198     """splitvalue('attr=value') --> 'attr', 'value'."""
 199     global _valueprog
 200     if _valueprog is None:
 201         import re
 202         _valueprog = re.compile('^([^=]*)=(.*)$')
 203
 204     match = _valueprog.match(attr)
 205     if match: return match.group(1, 2)
 206     return attr, None
 207
 208 def splitgophertype(selector):
 209     """splitgophertype('/Xselector') --> 'X', 'selector'."""
 210     if selector[:1] == '/' and selector[1:2]:
 211         return selector[1], selector[2:]
 212     return None, selector
 213
 214 _hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
 215 _hextochr.update(('%02X' % i, chr(i)) for i in range(256))
 216
 217 def unquote(s):
 218     """unquote('abc%20def') -> 'abc def'."""
 219     res = s.split('%')
 220     for i in xrange(1, len(res)):
 221         item = res[i]
 222         try:
 223             res[i] = _hextochr[item[:2]] + item[2:]
 224         except KeyError:
 225             res[i] = '%' + item
 226         except UnicodeDecodeError:
 227             res[i] = unichr(int(item[:2], 16)) + item[2:]
 228     return "".join(res)
 229
 230 def unquote_plus(s):
 231     """unquote('%7e/abc+def') -> '~/abc def'"""
 232     s = s.replace('+', ' ')
 233     return unquote(s)
 234
 235 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
 236                'abcdefghijklmnopqrstuvwxyz'
 237                '0123456789' '_.-')
 238 _safemaps = {}
 239
 240 def quote(s, safe = '/'):
 241     """quote('abc def') -> 'abc%20def'
 242
 243     Each part of a URL, e.g. the path info, the query, etc., has a
 244     different set of reserved characters that must be quoted.
 245
 246     RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
 247     the following reserved characters.
 248
 249     reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
 250                   "$" | ","
 251
 252     Each of these characters is reserved in some component of a URL,
 253     but not necessarily in all of them.
 254
 255     By default, the quote function is intended for quoting the path
 256     section of a URL.  Thus, it will not encode '/'.  This character
 257     is reserved, but in typical usage the quote function is being
 258     called on a path where the existing slash characters are used as
 259     reserved characters.
 260     """
 261     cachekey = (safe, always_safe)
 262     try:
 263         safe_map = _safemaps[cachekey]
 264     except KeyError:
 265         safe += always_safe
 266         safe_map = {}
 267         for i in range(256):
 268             c = chr(i)
 269             safe_map[c] = (c in safe) and c or ('%%%02X' % i)
 270         _safemaps[cachekey] = safe_map
 271     res = map(safe_map.__getitem__, s)
 272     return ''.join(res)
 273
 274 def quote_plus(s, safe = ''):
 275     """Quote the query fragment of a URL; replacing ' ' with '+'"""
 276     if ' ' in s:
 277         s = quote(s, safe + ' ')
 278         return s.replace(' ', '+')
 279     return quote(s, safe)
 280
 281 def urlencode(query,doseq=0):
 282     """Encode a sequence of two-element tuples or dictionary into a URL query string.
 283
 284     If any values in the query arg are sequences and doseq is true, each
 285     sequence element is converted to a separate parameter.
 286
 287     If the query arg is a sequence of two-element tuples, the order of the
 288     parameters in the output will match the order of parameters in the
 289     input.
 290     """
 291
 292     if hasattr(query,"items"):
 293         # mapping objects
 294         query = query.items()
 295     else:
 296         # it's a bother at times that strings and string-like objects are
 297         # sequences...
 298         try:
 299             # non-sequence items should not work with len()
 300             # non-empty strings will fail this
 301             if len(query) and not isinstance(query[0], tuple):
 302                 raise TypeError
 303             # zero-length sequences of all types will get here and succeed,
 304             # but that's a minor nit - since the original implementation
 305             # allowed empty dicts that type of behavior probably should be
 306             # preserved for consistency
 307         except TypeError:
 308             ty,va,tb = sys.exc_info()
 309             raise TypeError, "not a valid non-string sequence or mapping object", tb
 310
 311     l = []
 312     if not doseq:
 313         # preserve old behavior
 314         for k, v in query:
 315             k = quote_plus(str(k))
 316             v = quote_plus(str(v))
 317             l.append(k + '=' + v)
 318     else:
 319         for k, v in query:
 320             k = quote_plus(str(k))
 321             if isinstance(v, str):
 322                 v = quote_plus(v)
 323                 l.append(k + '=' + v)
 324             elif _is_unicode(v):
 325                 # is there a reasonable way to convert to ASCII?
 326                 # encode generates a string, but "replace" or "ignore"
 327                 # lose information and "strict" can raise UnicodeError
 328                 v = quote_plus(v.encode("ASCII","replace"))
 329                 l.append(k + '=' + v)
 330             else:
 331                 try:
 332                     # is this a sufficient test for sequence-ness?
 333                     x = len(v)
 334                 except TypeError:
 335                     # not a sequence
 336                     v = quote_plus(str(v))
 337                     l.append(k + '=' + v)
 338                 else:
 339                     # loop over the sequence
 340                     for elt in v:
 341                         l.append(k + '=' + quote_plus(str(elt)))
 342     return '&'.join(l)