1 # Copyright (C) 2001,2002 Python Software Foundation
2 # Author: barry@zope.com (Barry Warsaw)
4 """A parser of RFC 2822 and MIME email messages.
8 from cStringIO
import StringIO
9 from types
import ListType
11 from email
import Errors
12 from email
import Message
23 NLCRE
= re
.compile('\r\n|\r|\n')
28 def __init__(self
, _class
=Message
.Message
, strict
=False):
29 """Parser of RFC 2822 and MIME email messages.
31 Creates an in-memory object tree representing the email message, which
32 can then be manipulated and turned over to a Generator to return the
33 textual representation of the message.
35 The string must be formatted as a block of RFC 2822 headers and header
36 continuation lines, optionally preceded by a `Unix-from' header. The
37 header block is terminated either by the end of the string or by a
40 _class is the class to instantiate for new message objects when they
41 must be created. This class must have a constructor that can take
42 zero arguments. Default is Message.Message.
44 Optional strict tells the parser to be strictly RFC compliant or to be
45 more forgiving in parsing of ill-formatted MIME documents. When
46 non-strict mode is used, the parser will try to make up for missing or
47 erroneous boundaries and other peculiarities seen in the wild.
48 Default is non-strict parsing.
53 def parse(self
, fp
, headersonly
=False):
54 """Create a message structure from the data in a file.
56 Reads all the data from the file and returns the root of the message
57 structure. Optional headersonly is a flag specifying whether to stop
58 parsing after reading the headers or not. The default is False,
59 meaning it parses the entire contents of the file.
62 firstbodyline
= self
._parseheaders
(root
, fp
)
64 self
._parsebody
(root
, fp
, firstbodyline
)
67 def parsestr(self
, text
, headersonly
=False):
68 """Create a message structure from a string.
70 Returns the root of the message structure. Optional headersonly is a
71 flag specifying whether to stop parsing after reading the headers or
72 not. The default is False, meaning it parses the entire contents of
75 return self
.parse(StringIO(text
), headersonly
=headersonly
)
77 def _parseheaders(self
, container
, fp
):
78 # Parse the headers, returning a list of header/value pairs. None as
79 # the header means the Unix-From header.
85 # Don't strip the line before we test for the end condition,
86 # because whitespace-only header lines are RFC compliant
91 line
= line
.splitlines()[0]
94 # Ignore the trailing newline
96 # Check for initial Unix From_ line
97 if line
.startswith('From '):
99 container
.set_unixfrom(line
)
102 raise Errors
.HeaderParseError(
103 'Unix-from in headers after first rfc822 header')
105 # ignore the weirdly placed From_ line
106 # XXX: maybe set unixfrom anyway? or only if not already?
108 # Header continuation line
111 raise Errors
.HeaderParseError(
112 'Continuation line seen before first header')
113 lastvalue
.append(line
)
115 # Normal, non-continuation header. BAW: this should check to make
116 # sure it's a legal header, e.g. doesn't contain spaces. Also, we
117 # should expose the header matching algorithm in the API, and
118 # allow for a non-strict parsing mode (that ignores the line
119 # instead of raising the exception).
123 raise Errors
.HeaderParseError(
124 "Not a header, not a continuation: ``%s''" % line
)
125 elif lineno
== 1 and line
.startswith('--'):
126 # allow through duplicate boundary tags.
129 # There was no separating blank line as mandated by RFC
130 # 2822, but we're in non-strict mode. So just offer up
131 # this current line as the first body line.
135 container
[lastheader
] = NL
.join(lastvalue
)
136 lastheader
= line
[:i
]
137 lastvalue
= [line
[i
+1:].lstrip()]
138 # Make sure we retain the last header
140 container
[lastheader
] = NL
.join(lastvalue
)
143 def _parsebody(self
, container
, fp
, firstbodyline
=None):
144 # Parse the body, but first split the payload on the content-type
145 # boundary if present.
146 boundary
= container
.get_boundary()
147 isdigest
= (container
.get_content_type() == 'multipart/digest')
148 # If there's a boundary, split the payload text into its constituent
149 # parts and parse each separately. Otherwise, just parse the rest of
150 # the body as a single message. Note: any exceptions raised in the
151 # recursive parse need to have their line numbers coerced.
153 preamble
= epilogue
= None
154 # Split into subparts. The first boundary we're looking for won't
155 # always have a leading newline since we're at the start of the
156 # body text, and there's not always a preamble before the first
158 separator
= '--' + boundary
160 if firstbodyline
is not None:
161 payload
= firstbodyline
+ '\n' + payload
162 # We use an RE here because boundaries can have trailing
165 r
'(?P<sep>' + re
.escape(separator
) + r
')(?P<ws>[ \t]*)',
169 raise Errors
.BoundaryError(
170 "Couldn't find starting boundary: %s" % boundary
)
171 container
.set_payload(payload
)
175 # there's some pre-MIME boundary preamble
176 preamble
= payload
[0:start
]
177 # Find out what kind of line endings we're using
178 start
+= len(mo
.group('sep')) + len(mo
.group('ws'))
179 mo
= NLCRE
.search(payload
, start
)
181 start
+= len(mo
.group(0))
182 # We create a compiled regexp first because we need to be able to
183 # specify the start position, and the module function doesn't
184 # support this signature. :(
185 cre
= re
.compile('(?P<sep>\r\n|\r|\n)' +
186 re
.escape(separator
) + '--')
187 mo
= cre
.search(payload
, start
)
189 terminator
= mo
.start()
190 linesep
= mo
.group('sep')
191 if mo
.end() < len(payload
):
192 # There's some post-MIME boundary epilogue
193 epilogue
= payload
[mo
.end():]
195 raise Errors
.BoundaryError(
196 "Couldn't find terminating boundary: %s" % boundary
)
198 # Handle the case of no trailing boundary. Check that it ends
199 # in a blank line. Some cases (spamspamspam) don't even have
201 mo
= re
.search('(?P<sep>\r\n|\r|\n){2}$', payload
)
203 mo
= re
.search('(?P<sep>\r\n|\r|\n)$', payload
)
205 raise Errors
.BoundaryError(
206 'No terminating boundary and no trailing empty line')
207 linesep
= mo
.group('sep')
208 terminator
= len(payload
)
209 # We split the textual payload on the boundary separator, which
210 # includes the trailing newline. If the container is a
211 # multipart/digest then the subparts are by default message/rfc822
212 # instead of text/plain. In that case, they'll have an optional
213 # block of MIME headers, then an empty line followed by the
216 linesep
+ re
.escape(separator
) + r
'[ \t]*' + linesep
,
217 payload
[start
:terminator
])
220 if part
.startswith(linesep
):
221 # There's no header block so create an empty message
222 # object as the container, and lop off the newline so
223 # we can parse the sub-subobject
224 msgobj
= self
._class
()
225 part
= part
[len(linesep
):]
227 parthdrs
, part
= part
.split(linesep
+linesep
, 1)
228 # msgobj in this case is the "message/rfc822" container
229 msgobj
= self
.parsestr(parthdrs
, headersonly
=1)
230 # while submsgobj is the message itself
231 msgobj
.set_default_type('message/rfc822')
232 maintype
= msgobj
.get_content_maintype()
233 if maintype
in ('message', 'multipart'):
234 submsgobj
= self
.parsestr(part
)
235 msgobj
.attach(submsgobj
)
237 msgobj
.set_payload(part
)
239 msgobj
= self
.parsestr(part
)
240 container
.preamble
= preamble
241 container
.epilogue
= epilogue
242 container
.attach(msgobj
)
243 elif container
.get_main_type() == 'multipart':
244 # Very bad. A message is a multipart with no boundary!
245 raise Errors
.BoundaryError(
246 'multipart message with no defined boundary')
247 elif container
.get_type() == 'message/delivery-status':
248 # This special kind of type contains blocks of headers separated
249 # by a blank line. We'll represent each header block as a
250 # separate Message object
253 blockmsg
= self
._class
()
254 self
._parseheaders
(blockmsg
, fp
)
255 if not len(blockmsg
):
256 # No more header blocks left
258 blocks
.append(blockmsg
)
259 container
.set_payload(blocks
)
260 elif container
.get_main_type() == 'message':
261 # Create a container for the payload, but watch out for there not
262 # being any headers left
265 except Errors
.HeaderParseError
:
267 self
._parsebody
(msg
, fp
)
268 container
.attach(msg
)
271 if firstbodyline
is not None:
272 text
= firstbodyline
+ '\n' + text
273 container
.set_payload(text
)
277 class HeaderParser(Parser
):
278 """A subclass of Parser, this one only meaningfully parses message headers.
280 This class can be used if all you're interested in is the headers of a
281 message. While it consumes the message body, it does not parse it, but
282 simply makes it available as a string payload.
284 Parsing with this subclass can be considerably faster if all you're
285 interested in is the message headers.
287 def _parsebody(self
, container
, fp
, firstbodyline
=None):
288 # Consume but do not parse, the body
290 if firstbodyline
is not None:
291 text
= firstbodyline
+ '\n' + text
292 container
.set_payload(text
)