mechanize/_headersutil.py

   1 """Utility functions for HTTP header value parsing and construction.
   2
   3 Copyright 1997-1998, Gisle Aas
   4 Copyright 2002-2006, John J. Lee
   5
   6 This code is free software; you can redistribute it and/or modify it
   7 under the terms of the BSD or ZPL 2.1 licenses (see the file
   8 COPYING.txt included with the distribution).
   9
  10 """
  11
  12 import os, re
  13 from types import StringType
  14 from types import UnicodeType
  15 STRING_TYPES = StringType, UnicodeType
  16
  17 from _util import http2time
  18 import _rfc3986
  19
  20 def is_html(ct_headers, url, allow_xhtml=False):
  21     """
  22     ct_headers: Sequence of Content-Type headers
  23     url: Response URL
  24
  25     """
  26     if not ct_headers:
  27         # guess
  28         ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
  29         html_exts = [".htm", ".html"]
  30         if allow_xhtml:
  31             html_exts += [".xhtml"]
  32         return ext in html_exts
  33     # use first header
  34     ct = split_header_words(ct_headers)[0][0][0]
  35     html_types = ["text/html"]
  36     if allow_xhtml:
  37         html_types += [
  38             "text/xhtml", "text/xml",
  39             "application/xml", "application/xhtml+xml",
  40             ]
  41     return ct in html_types
  42
  43 def unmatched(match):
  44     """Return unmatched part of re.Match object."""
  45     start, end = match.span(0)
  46     return match.string[:start]+match.string[end:]
  47
  48 token_re =        re.compile(r"^\s*([^=\s;,]+)")
  49 quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
  50 value_re =        re.compile(r"^\s*=\s*([^\s;,]*)")
  51 escape_re = re.compile(r"\\(.)")
  52 def split_header_words(header_values):
  53     r"""Parse header values into a list of lists containing key,value pairs.
  54
  55     The function knows how to deal with ",", ";" and "=" as well as quoted
  56     values after "=".  A list of space separated tokens are parsed as if they
  57     were separated by ";".
  58
  59     If the header_values passed as argument contains multiple values, then they
  60     are treated as if they were a single value separated by comma ",".
  61
  62     This means that this function is useful for parsing header fields that
  63     follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
  64     the requirement for tokens).
  65
  66       headers           = #header
  67       header            = (token | parameter) *( [";"] (token | parameter))
  68
  69       token             = 1*<any CHAR except CTLs or separators>
  70       separators        = "(" | ")" | "<" | ">" | "@"
  71                         | "," | ";" | ":" | "\" | <">
  72                         | "/" | "[" | "]" | "?" | "="
  73                         | "{" | "}" | SP | HT
  74
  75       quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
  76       qdtext            = <any TEXT except <">>
  77       quoted-pair       = "\" CHAR
  78
  79       parameter         = attribute "=" value
  80       attribute         = token
  81       value             = token | quoted-string
  82
  83     Each header is represented by a list of key/value pairs.  The value for a
  84     simple token (not part of a parameter) is None.  Syntactically incorrect
  85     headers will not necessarily be parsed as you would want.
  86
  87     This is easier to describe with some examples:
  88
  89     >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
  90     [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
  91     >>> split_header_words(['text/html; charset="iso-8859-1"'])
  92     [[('text/html', None), ('charset', 'iso-8859-1')]]
  93     >>> split_header_words([r'Basic realm="\"foo\bar\""'])
  94     [[('Basic', None), ('realm', '"foobar"')]]
  95
  96     """
  97     assert type(header_values) not in STRING_TYPES
  98     result = []
  99     for text in header_values:
 100         orig_text = text
 101         pairs = []
 102         while text:
 103             m = token_re.search(text)
 104             if m:
 105                 text = unmatched(m)
 106                 name = m.group(1)
 107                 m = quoted_value_re.search(text)
 108                 if m:  # quoted value
 109                     text = unmatched(m)
 110                     value = m.group(1)
 111                     value = escape_re.sub(r"\1", value)
 112                 else:
 113                     m = value_re.search(text)
 114                     if m:  # unquoted value
 115                         text = unmatched(m)
 116                         value = m.group(1)
 117                         value = value.rstrip()
 118                     else:
 119                         # no value, a lone token
 120                         value = None
 121                 pairs.append((name, value))
 122             elif text.lstrip().startswith(","):
 123                 # concatenated headers, as per RFC 2616 section 4.2
 124                 text = text.lstrip()[1:]
 125                 if pairs: result.append(pairs)
 126                 pairs = []
 127             else:
 128                 # skip junk
 129                 non_junk, nr_junk_chars = re.subn("^[=\s;]*", "", text)
 130                 assert nr_junk_chars > 0, (
 131                     "split_header_words bug: '%s', '%s', %s" %
 132                     (orig_text, text, pairs))
 133                 text = non_junk
 134         if pairs: result.append(pairs)
 135     return result
 136
 137 join_escape_re = re.compile(r"([\"\\])")
 138 def join_header_words(lists):
 139     """Do the inverse of the conversion done by split_header_words.
 140
 141     Takes a list of lists of (key, value) pairs and produces a single header
 142     value.  Attribute values are quoted if needed.
 143
 144     >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
 145     'text/plain; charset="iso-8859/1"'
 146     >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
 147     'text/plain, charset="iso-8859/1"'
 148
 149     """
 150     headers = []
 151     for pairs in lists:
 152         attr = []
 153         for k, v in pairs:
 154             if v is not None:
 155                 if not re.search(r"^\w+$", v):
 156                     v = join_escape_re.sub(r"\\\1", v)  # escape " and \
 157                     v = '"%s"' % v
 158                 if k is None:  # Netscape cookies may have no name
 159                     k = v
 160                 else:
 161                     k = "%s=%s" % (k, v)
 162             attr.append(k)
 163         if attr: headers.append("; ".join(attr))
 164     return ", ".join(headers)
 165
 166 def parse_ns_headers(ns_headers):
 167     """Ad-hoc parser for Netscape protocol cookie-attributes.
 168
 169     The old Netscape cookie format for Set-Cookie can for instance contain
 170     an unquoted "," in the expires field, so we have to use this ad-hoc
 171     parser instead of split_header_words.
 172
 173     XXX This may not make the best possible effort to parse all the crap
 174     that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
 175     parser is probably better, so could do worse than following that if
 176     this ever gives any trouble.
 177
 178     Currently, this is also used for parsing RFC 2109 cookies.
 179
 180     """
 181     known_attrs = ("expires", "domain", "path", "secure",
 182                    # RFC 2109 attrs (may turn up in Netscape cookies, too)
 183                    "port", "max-age")
 184
 185     result = []
 186     for ns_header in ns_headers:
 187         pairs = []
 188         version_set = False
 189         params = re.split(r";\s*", ns_header)
 190         for ii in range(len(params)):
 191             param = params[ii]
 192             param = param.rstrip()
 193             if param == "": continue
 194             if "=" not in param:
 195                 k, v = param, None
 196             else:
 197                 k, v = re.split(r"\s*=\s*", param, 1)
 198                 k = k.lstrip()
 199             if ii != 0:
 200                 lc = k.lower()
 201                 if lc in known_attrs:
 202                     k = lc
 203                 if k == "version":
 204                     # This is an RFC 2109 cookie.
 205                     version_set = True
 206                 if k == "expires":
 207                     # convert expires date to seconds since epoch
 208                     if v.startswith('"'): v = v[1:]
 209                     if v.endswith('"'): v = v[:-1]
 210                     v = http2time(v)  # None if invalid
 211             pairs.append((k, v))
 212
 213         if pairs:
 214             if not version_set:
 215                 pairs.append(("version", "0"))
 216             result.append(pairs)
 217
 218     return result
 219
 220
 221 def _test():
 222    import doctest, _headersutil
 223    return doctest.testmod(_headersutil)
 224
 225 if __name__ == "__main__":
 226    _test()