"""Utility functions for HTTP header value parsing and construction.

Copyright 1997-1998, Gisle Aas
Copyright 2002-2006, John J. Lee

This code is free software; you can redistribute it and/or modify it
under the terms of the BSD or ZPL 2.1 licenses (see the file
COPYING.txt included with the distribution).

"""
import os
import re
from types import StringType
from types import UnicodeType

from _util import http2time
# NOTE(review): is_html() also calls _rfc3986.urlsplit, but no import of
# _rfc3986 is visible in this listing -- confirm it is imported here.

STRING_TYPES = StringType, UnicodeType
def is_html(ct_headers, url, allow_xhtml=False):
    """Return True if a response appears to be HTML.

    ct_headers: Sequence of Content-Type headers
    url: Response URL; only consulted when ct_headers is empty
    allow_xhtml: If true, XHTML/XML extensions and media types also count

    When no Content-Type header is available the answer is a guess based on
    the file extension found in the URL.  Otherwise only the first header is
    examined, and its first token is compared against the known media types.

    """
    if not ct_headers:
        # guess: take the extension of element 2 of the split URL
        # (presumably the path component -- see _rfc3986.urlsplit)
        ext = os.path.splitext(_rfc3986.urlsplit(url)[2])[1]
        html_exts = [".htm", ".html"]
        if allow_xhtml:
            html_exts += [".xhtml"]
        return ext in html_exts
    # use first header: first header -> first (key, value) pair -> key
    ct = split_header_words(ct_headers)[0][0][0]
    html_types = ["text/html"]
    if allow_xhtml:
        html_types += [
            "text/xhtml", "text/xml",
            "application/xml", "application/xhtml+xml",
            ]
    return ct in html_types
def unmatched(match):
    """Return unmatched part of re.Match object.

    The result is match.string with the span of the whole match (group 0)
    cut out: the text before the match joined to the text after it.

    """
    start, end = match.span(0)
    return match.string[:start] + match.string[end:]
# A name: optional leading whitespace, then a run of characters that are
# not "=", whitespace, ";" or ",".  Group 1 is the name.
token_re = re.compile(r"^\s*([^=\s;,]+)")
# "=" followed by a double-quoted string in which a backslash escapes the
# next character.  Group 1 is the still-escaped text between the quotes.
quoted_value_re = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
# "=" followed by an unquoted value that ends at whitespace, ";" or ",".
# Group 1 is the value (possibly empty).
value_re = re.compile(r"^\s*=\s*([^\s;,]*)")
# A backslash followed by any one character.  Group 1 is that character.
escape_re = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    If the header_values passed as argument contains multiple values, then they
    are treated as if they were a single value separated by comma ",".

    This means that this function is useful for parsing header fields that
    follow this syntax (BNF as from the HTTP/1.1 specification, but we relax
    the requirement for tokens).

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header is represented by a list of key/value pairs.  The value for a
    simple token (not part of a parameter) is None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    This is easier to describe with some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # A bare string would be iterated character by character; reject it.
    assert type(header_values) not in STRING_TYPES
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        # Consume `text` from the left until nothing remains.
        while text:
            m = token_re.search(text)
            if m:
                text = unmatched(m)
                name = m.group(1)
                m = quoted_value_re.search(text)
                if m:  # quoted value
                    text = unmatched(m)
                    value = m.group(1)
                    # drop the backslash from each escaped character
                    value = escape_re.sub(r"\1", value)
                else:
                    m = value_re.search(text)
                    if m:  # unquoted value
                        text = unmatched(m)
                        value = m.group(1)
                        value = value.rstrip()
                    else:
                        # no value, a lone token
                        value = None
                pairs.append((name, value))
            elif text.lstrip().startswith(","):
                # concatenated headers, as per RFC 2616 section 4.2
                text = text.lstrip()[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
            else:
                # skip junk: leading "=", whitespace and ";" characters
                non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
                assert nr_junk_chars > 0, (
                    "split_header_words bug: '%s', '%s', %s" %
                    (orig_text, text, pairs))
                text = non_junk
        if pairs:
            result.append(pairs)
    return result
# Matches each double quote or backslash; group 1 is the matched character.
join_escape_re = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Attribute values are quoted if needed.

    Pairs within one inner list are joined with "; " and the inner lists are
    joined with ", ".  A pair whose value is None contributes only its key;
    inner lists that are empty contribute nothing.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    headers = []
    for pairs in lists:
        attr = []
        for k, v in pairs:
            if v is not None:
                # Anything other than a plain run of word characters is
                # emitted as a quoted string.
                if not re.search(r"^\w+$", v):
                    v = join_escape_re.sub(r"\\\1", v)  # escape " and \
                    v = '"%s"' % v
                if k is None:  # Netscape cookies may have no name
                    k = v
                else:
                    k = "%s=%s" % (k, v)
            attr.append(k)
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers: Sequence of header value strings

    Returns a list with one entry per header that yielded any parameters;
    each entry is a list of (key, value) pairs.  A parameter without "="
    gets the value None.  Known attribute names (every parameter except the
    first) are lower-cased, an "expires" value is passed through http2time,
    and ("version", "0") is appended when the header set no version.

    """
    # NOTE(review): the tail of this tuple was missing from the damaged
    # listing and has been restored from memory of the upstream code --
    # confirm the exact membership before relying on it.
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, maxsplit=1)
                k = k.lstrip()
            # The first parameter is the cookie's own name=value pair, so
            # attribute handling only applies from the second one on.
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                # A bare "expires" with no "=" has v None; leave it alone
                # rather than calling string methods on None.
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    if v.startswith('"'):
                        v = v[1:]
                    if v.endswith('"'):
                        v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
def _test():
    """Run this module's doctests and return doctest.testmod()'s result."""
    # Imported here so that merely importing the module has no test-related
    # side effects; _headersutil is this module itself.
    import doctest
    import _headersutil
    return doctest.testmod(_headersutil)


if __name__ == "__main__":
    _test()