"""HTTP header value parsing utility functions.

  from ClientCookie._HeadersUtil import split_header_words
  values = split_header_words(h.headers["Content-Type"])

This module provides a few functions that help with parsing and constructing
valid HTTP header values.


Copyright 1997-1998, Gisle Aas
Copyright 2002-2003, John J. Lee

This code is free software; you can redistribute it and/or modify it under
the terms of the BSD License (see the file COPYING included with the
distribution).

"""
import re
import string
from types import StringType
try:
    from types import UnicodeType
    STRING_TYPES = StringType, UnicodeType
except ImportError:
    STRING_TYPES = StringType,

from _Util import startswith, endswith, http2time
def unmatched(match):
    """Return the part of the searched string that *match* did not cover.

    match -- a match object, as returned by a pattern's search()/match().

    The text preceding the match and the text following it are concatenated
    and returned; the matched span itself is dropped.
    """
    start, end = match.span(0)
    return match.string[:start] + match.string[end:]
# Patterns used by split_header_words().  Every pattern is anchored with "^",
# so a successful match always starts at offset 0 of the remaining text.
# XXX An earlier, LWP-derived token pattern also accepted leading "="
# characters (r"^\s*(=*[^\s=;,]+)"); the reason is unknown and it is
# deliberately not used.
token_re = re.compile(r"^\s*([^=\s;,]+)")
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
escape_re = re.compile(r"\\(.)")
_junk_re = re.compile(r"^[=\s;]*")

# String types that must not be passed where a *sequence* of header values is
# expected (iterating over a string would yield single characters).
try:
    _STRING_LIKE = (basestring,)  # noqa: F821 -- Python 2: str and unicode
except NameError:
    _STRING_LIKE = (str, bytes)


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    header_values must be a sequence of strings, not a single string; passing
    a string trips an assertion.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, _STRING_LIKE)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            m = token_re.search(text)
            if m:
                # The patterns are anchored, so dropping everything up to
                # m.end() removes exactly the matched prefix.
                text = text[m.end():]
                name = m.group(1)
                m = quoted_value_re.search(text)
                if m:
                    # quoted value: strip the quotes and undo \-escapes
                    text = text[m.end():]
                    value = escape_re.sub(r"\1", m.group(1))
                else:
                    m = value_re.search(text)
                    if m:
                        # unquoted value
                        text = text[m.end():]
                        value = m.group(1).rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk
                non_junk = _junk_re.sub("", text)
                # Every pass through the loop must consume input, otherwise
                # we would spin forever.
                assert len(non_junk) < len(text), (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
# Characters that must be backslash-escaped inside a quoted-string.
join_escape_re = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    Pairs within one inner list are joined with "; "; the inner lists are
    joined with ", ".  A value of None yields the bare key.  A key of None
    (with a non-None value) yields the bare, possibly quoted, value.  Empty
    inner lists contribute nothing.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                if not re.search(r"^\w+$", v):
                    # not a plain word: escape " and \ and wrap in quotes
                    v = join_escape_re.sub(r"\\\1", v)
                    v = '"%s"' % v
                if k is None:  # Netscape cookies may have no name
                    k = v
                else:
                    k = "%s=%s" % (k, v)
            attr.append(k)
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers is a sequence of header value strings.  The result is a list
    with one entry per non-empty header; each entry is a list of (key, value)
    pairs.  Known attribute names are lower-cased.  A parameter without "="
    becomes (name, None) if it is a known attribute, otherwise (None, text)
    (a cookie with a missing name).  The "expires" value is passed through
    http2time.  If no "version" attribute was seen, ("version", "0") is
    appended to the entry.
    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for param in re.split(r";\s*", ns_header):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                if param.lower() in known_attrs:
                    k, v = param, None
                else:
                    # cookie with missing name
                    k, v = None, param
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            if k is not None:
                # Attribute names are case-insensitive; normalise the ones
                # we know about.
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.  Will be treated as RFC 2965
                    # cookie in rest of code.
                    # Probably it should be parsed with split_header_words, but
                    # that's too much hassle.
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch; a bare
                    # "expires" with no value is left as None
                    if v.startswith('"'):
                        v = v[1:]
                    if v.endswith('"'):
                        v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
def _test():
    """Run this module's doctests and return doctest.testmod()'s result."""
    import doctest
    import sys
    # Test the module object that is already loaded rather than importing it
    # again by name: that avoids a second copy when run as a script and works
    # regardless of the package the module lives in.
    return doctest.testmod(sys.modules[__name__])


if __name__ == "__main__":
    _test()