Update version number and release date.
[python/dscho.git] / Lib / email / Parser.py
blob09fac4552f9379f8a32d57dd74329527c0478037
1 # Copyright (C) 2001,2002 Python Software Foundation
2 # Author: barry@zope.com (Barry Warsaw)
4 """A parser of RFC 2822 and MIME email messages.
5 """
7 import re
8 from cStringIO import StringIO
9 from types import ListType
11 from email import Errors
12 from email import Message
EMPTYSTRING = ''
NL = '\n'

# Interpreters that predate the True/False builtins raise NameError on the
# lookup below; in that case install integer stand-ins as module globals so
# the rest of the module can use the names unconditionally.
try:
    True, False
except NameError:
    globals()['True'] = 1
    globals()['False'] = 0

# Matches one line terminator: CRLF, a lone CR, or a lone LF.
NLCRE = re.compile('\r\n|\r|\n')
class Parser:
    """Build a tree of message objects from RFC 2822 / MIME text."""

    def __init__(self, _class=Message.Message, strict=False):
        """Parser of RFC 2822 and MIME email messages.

        Creates an in-memory object tree representing the email message, which
        can then be manipulated and turned over to a Generator to return the
        textual representation of the message.

        The string must be formatted as a block of RFC 2822 headers and header
        continuation lines, optionally preceded by a `Unix-from' header.  The
        header block is terminated either by the end of the string or by a
        blank line.

        _class is the class to instantiate for new message objects when they
        must be created.  This class must have a constructor that can take
        zero arguments.  Default is Message.Message.

        Optional strict tells the parser to be strictly RFC compliant or to be
        more forgiving in parsing of ill-formatted MIME documents.  When
        non-strict mode is used, the parser will try to make up for missing or
        erroneous boundaries and other peculiarities seen in the wild.
        Default is non-strict parsing.
        """
        self._class = _class
        self._strict = strict

    def parse(self, fp, headersonly=False):
        """Create a message structure from the data in a file.

        Reads all the data from the file and returns the root of the message
        structure.  Optional headersonly is a flag specifying whether to stop
        parsing after reading the headers or not.  The default is False,
        meaning it parses the entire contents of the file.
        """
        root = self._class()
        firstbodyline = self._parseheaders(root, fp)
        if not headersonly:
            self._parsebody(root, fp, firstbodyline)
        return root

    def parsestr(self, text, headersonly=False):
        """Create a message structure from a string.

        Returns the root of the message structure.  Optional headersonly is a
        flag specifying whether to stop parsing after reading the headers or
        not.  The default is False, meaning it parses the entire contents of
        the file.
        """
        return self.parse(StringIO(text), headersonly=headersonly)

    def _parseheaders(self, container, fp):
        """Read one header block from fp and store it on container.

        Lines are consumed from fp up to and including the blank line that
        ends the block, or up to end of file.  Each header is stored with
        container[name] = value; continuation lines are kept and joined to
        the header's first line with NL.  A Unix-From line seen as the very
        first line is recorded with container.set_unixfrom().

        Returns None, except in non-strict mode when the block is ended by a
        line that is neither a header nor a continuation (i.e. the separating
        blank line is missing): that line has already been consumed, so it is
        returned for the caller to use as the first line of the body.

        Raises Errors.HeaderParseError for a continuation line that precedes
        every header and, in strict mode only, for a Unix-From line after the
        first line or for a line that is not a header.
        """
        lastheader = ''
        lastvalue = []
        lineno = 0
        firstbodyline = None
        while True:
            # Don't strip the line before we test for the end condition,
            # because whitespace-only header lines are RFC compliant
            # continuation lines.
            line = fp.readline()
            if not line:
                break
            # Drop the line terminator, whichever convention it uses.
            line = line.splitlines()[0]
            if not line:
                break
            lineno += 1
            # Check for initial Unix From_ line
            if line.startswith('From '):
                if lineno == 1:
                    container.set_unixfrom(line)
                    continue
                elif self._strict:
                    raise Errors.HeaderParseError(
                        'Unix-from in headers after first rfc822 header')
                else:
                    # ignore the weirdly placed From_ line
                    # XXX: maybe set unixfrom anyway? or only if not already?
                    continue
            # Header continuation line
            if line[0] in ' \t':
                if not lastheader:
                    raise Errors.HeaderParseError(
                        'Continuation line seen before first header')
                lastvalue.append(line)
                continue
            # Normal, non-continuation header.  BAW: this should check to make
            # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
            # should expose the header matching algorithm in the API, and
            # allow for a non-strict parsing mode (that ignores the line
            # instead of raising the exception).
            i = line.find(':')
            if i < 0:
                if self._strict:
                    raise Errors.HeaderParseError(
                        "Not a header, not a continuation: ``%s''" % line)
                elif lineno == 1 and line.startswith('--'):
                    # allow through duplicate boundary tags.
                    continue
                else:
                    # There was no separating blank line as mandated by RFC
                    # 2822, but we're in non-strict mode.  So just offer up
                    # this current line as the first body line.
                    firstbodyline = line
                    break
            # A new header starts here, so the previous one is complete.
            if lastheader:
                container[lastheader] = NL.join(lastvalue)
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Make sure we retain the last header
        if lastheader:
            container[lastheader] = NL.join(lastvalue)
        return firstbodyline

    def _parsebody(self, container, fp, firstbodyline=None):
        """Parse the rest of fp as the body of container.

        The container's headers must already be in place, because they decide
        how the body is handled:

        - a boundary parameter is present: the body is split on the boundary
          and every piece is parsed and attached as a subpart;
        - main type multipart without a boundary: Errors.BoundaryError;
        - message/delivery-status: the payload becomes a list with one
          message object per header block;
        - any other message/* type: the body is parsed as a nested message
          and attached;
        - anything else: the raw text becomes the payload.

        firstbodyline is a body line that _parseheaders() already consumed
        (or None); it is put back in front of the text read from fp.
        """
        boundary = container.get_boundary()
        # Note: any exceptions raised in the recursive parse need to have
        # their line numbers coerced.
        if boundary:
            self._parsemultipart(container, fp, firstbodyline, boundary)
        elif container.get_content_maintype() == 'multipart':
            # Very bad.  A message is a multipart with no boundary!
            raise Errors.BoundaryError(
                'multipart message with no defined boundary')
        elif container.get_content_type() == 'message/delivery-status':
            # This special kind of type contains blocks of headers separated
            # by a blank line.  We'll represent each header block as a
            # separate Message object.
            # NOTE(review): firstbodyline is not replayed on this path or on
            # the message/* path below, so a line handed over by
            # _parseheaders() is dropped here.
            blocks = []
            while True:
                blockmsg = self._class()
                self._parseheaders(blockmsg, fp)
                if not len(blockmsg):
                    # No more header blocks left
                    break
                blocks.append(blockmsg)
            container.set_payload(blocks)
        elif container.get_content_maintype() == 'message':
            # Create a container for the payload, but watch out for there not
            # being any headers left
            try:
                msg = self.parse(fp)
            except Errors.HeaderParseError:
                msg = self._class()
                self._parsebody(msg, fp)
            container.attach(msg)
        else:
            container.set_payload(self._readbody(fp, firstbodyline))

    def _readbody(self, fp, firstbodyline):
        """Return the rest of fp, prefixed by firstbodyline if there is one."""
        text = fp.read()
        if firstbodyline is not None:
            text = firstbodyline + '\n' + text
        return text

    def _parsemultipart(self, container, fp, firstbodyline, boundary):
        """Split the body on boundary and attach each piece to container.

        Sets container.preamble and container.epilogue (None when absent).
        When no starting boundary is found, non-strict mode stores the whole
        text as the payload.  Raises Errors.BoundaryError in strict mode for
        a missing starting or terminating boundary, and in either mode when
        there is neither a terminating boundary nor a trailing newline.
        """
        isdigest = (container.get_content_type() == 'multipart/digest')
        preamble = epilogue = None
        # Split into subparts.  The first boundary we're looking for won't
        # always have a leading newline since we're at the start of the
        # body text, and there's not always a preamble before the first
        # boundary.
        separator = '--' + boundary
        payload = self._readbody(fp, firstbodyline)
        # We use an RE here because boundaries can have trailing
        # whitespace.
        mo = re.search(
            r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
            payload)
        if not mo:
            if self._strict:
                raise Errors.BoundaryError(
                    "Couldn't find starting boundary: %s" % boundary)
            container.set_payload(payload)
            return
        start = mo.start()
        if start > 0:
            # there's some pre-MIME boundary preamble
            preamble = payload[0:start]
        # Step over the boundary line itself, including its line ending.
        start += len(mo.group('sep')) + len(mo.group('ws'))
        mo = NLCRE.search(payload, start)
        if mo:
            start += len(mo.group(0))
        # We create a compiled regexp first because we need to be able to
        # specify the start position, and the module function doesn't
        # support this signature. :(
        cre = re.compile('(?P<sep>\r\n|\r|\n)' +
                         re.escape(separator) + '--')
        mo = cre.search(payload, start)
        if mo:
            terminator = mo.start()
            linesep = mo.group('sep')
            if mo.end() < len(payload):
                # There's some post-MIME boundary epilogue
                epilogue = payload[mo.end():]
        elif self._strict:
            raise Errors.BoundaryError(
                "Couldn't find terminating boundary: %s" % boundary)
        else:
            # Handle the case of no trailing boundary.  Check that it ends
            # in a blank line.  Some cases (spamspamspam) don't even have
            # that!
            mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
            if not mo:
                mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
                if not mo:
                    raise Errors.BoundaryError(
                        'No terminating boundary and no trailing empty line')
            linesep = mo.group('sep')
            terminator = len(payload)
        # We split the textual payload on the boundary separator, which
        # includes the trailing newline.
        parts = re.split(
            linesep + re.escape(separator) + r'[ \t]*' + linesep,
            payload[start:terminator])
        # These do not depend on the individual part, so set them once.
        container.preamble = preamble
        container.epilogue = epilogue
        for part in parts:
            if isdigest:
                msgobj = self._parsedigestpart(part, linesep)
            else:
                msgobj = self.parsestr(part)
            container.attach(msgobj)

    def _parsedigestpart(self, part, linesep):
        """Parse one subpart of a multipart/digest and return its container.

        Digest subparts default to message/rfc822 instead of text/plain, so a
        subpart is an optional block of MIME headers, then an empty line,
        then the enclosed message.  The returned object carries the MIME
        headers; for a message or multipart type the enclosed message is
        parsed and attached to it, otherwise the text is its payload.
        """
        if part.startswith(linesep):
            # There's no header block so create an empty message
            # object as the container, and lop off the newline so
            # we can parse the sub-subobject
            msgobj = self._class()
            part = part[len(linesep):]
        else:
            # A subpart consisting of headers only has no blank line to split
            # on; treat it as headers with an empty body rather than failing
            # on the tuple unpacking.
            pieces = part.split(linesep + linesep, 1)
            parthdrs = pieces[0]
            if len(pieces) == 2:
                part = pieces[1]
            else:
                part = ''
            # msgobj in this case is the "message/rfc822" container
            msgobj = self.parsestr(parthdrs, headersonly=True)
        msgobj.set_default_type('message/rfc822')
        maintype = msgobj.get_content_maintype()
        if maintype in ('message', 'multipart'):
            # while submsgobj is the message itself
            submsgobj = self.parsestr(part)
            msgobj.attach(submsgobj)
        else:
            msgobj.set_payload(part)
        return msgobj
class HeaderParser(Parser):
    """A subclass of Parser, this one only meaningfully parses message headers.

    This class can be used if all you're interested in is the headers of a
    message.  While it consumes the message body, it does not parse it, but
    simply makes it available as a string payload.

    Parsing with this subclass can be considerably faster if all you're
    interested in is the message headers.
    """

    def _parsebody(self, container, fp, firstbodyline=None):
        """Store the unparsed remainder of fp as container's payload.

        firstbodyline, when not None, is a body line that the header parser
        already consumed; it is put back in front of the text read from fp,
        followed by a newline.
        """
        # Consume but do not parse, the body
        text = fp.read()
        if firstbodyline is not None:
            text = firstbodyline + '\n' + text
        container.set_payload(text)