Lib/email/Parser.py

   1 # Copyright (C) 2001,2002 Python Software Foundation
   2 # Author: barry@zope.com (Barry Warsaw)
   3
   4 """A parser of RFC 2822 and MIME email messages.
   5 """
   6
   7 import re
   8 from cStringIO import StringIO
   9 from types import ListType
  10
  11 from email import Errors
  12 from email import Message
  13
# String constants.  NL is the glue used to join the physical lines of a
# folded (multi-line) header value back together in Parser._parseheaders().
EMPTYSTRING = ''
NL = '\n'

# Compatibility shim: if the interpreter does not provide the names True and
# False, the lookup below raises NameError and integer stand-ins are bound at
# module level instead.
try:
    True, False
except NameError:
    True = 1
    False = 0

# Matches exactly one line terminator: CRLF, a bare CR, or a bare LF.  CRLF is
# listed first so that it is consumed as a single terminator rather than as a
# CR followed by a separate LF.
NLCRE = re.compile('\r\n|\r|\n')
  24
  25
  26 \f
  27 class Parser:
  28     def __init__(self, _class=Message.Message, strict=False):
  29         """Parser of RFC 2822 and MIME email messages.
  30
  31         Creates an in-memory object tree representing the email message, which
  32         can then be manipulated and turned over to a Generator to return the
  33         textual representation of the message.
  34
  35         The string must be formatted as a block of RFC 2822 headers and header
  36         continuation lines, optionally preceeded by a `Unix-from' header.  The
  37         header block is terminated either by the end of the string or by a
  38         blank line.
  39
  40         _class is the class to instantiate for new message objects when they
  41         must be created.  This class must have a constructor that can take
  42         zero arguments.  Default is Message.Message.
  43
  44         Optional strict tells the parser to be strictly RFC compliant or to be
  45         more forgiving in parsing of ill-formatted MIME documents.  When
  46         non-strict mode is used, the parser will try to make up for missing or
  47         erroneous boundaries and other peculiarities seen in the wild.
  48         Default is non-strict parsing.
  49         """
  50         self._class = _class
  51         self._strict = strict
  52
  53     def parse(self, fp, headersonly=False):
  54         """Create a message structure from the data in a file.
  55
  56         Reads all the data from the file and returns the root of the message
  57         structure.  Optional headersonly is a flag specifying whether to stop
  58         parsing after reading the headers or not.  The default is False,
  59         meaning it parses the entire contents of the file.
  60         """
  61         root = self._class()
  62         firstbodyline = self._parseheaders(root, fp)
  63         if not headersonly:
  64             self._parsebody(root, fp, firstbodyline)
  65         return root
  66
  67     def parsestr(self, text, headersonly=False):
  68         """Create a message structure from a string.
  69
  70         Returns the root of the message structure.  Optional headersonly is a
  71         flag specifying whether to stop parsing after reading the headers or
  72         not.  The default is False, meaning it parses the entire contents of
  73         the file.
  74         """
  75         return self.parse(StringIO(text), headersonly=headersonly)
  76
  77     def _parseheaders(self, container, fp):
  78         # Parse the headers, returning a list of header/value pairs.  None as
  79         # the header means the Unix-From header.
  80         lastheader = ''
  81         lastvalue = []
  82         lineno = 0
  83         firstbodyline = None
  84         while True:
  85             # Don't strip the line before we test for the end condition,
  86             # because whitespace-only header lines are RFC compliant
  87             # continuation lines.
  88             line = fp.readline()
  89             if not line:
  90                 break
  91             line = line.splitlines()[0]
  92             if not line:
  93                 break
  94             # Ignore the trailing newline
  95             lineno += 1
  96             # Check for initial Unix From_ line
  97             if line.startswith('From '):
  98                 if lineno == 1:
  99                     container.set_unixfrom(line)
 100                     continue
 101                 elif self._strict:
 102                     raise Errors.HeaderParseError(
 103                         'Unix-from in headers after first rfc822 header')
 104                 else:
 105                     # ignore the wierdly placed From_ line
 106                     # XXX: maybe set unixfrom anyway? or only if not already?
 107                     continue
 108             # Header continuation line
 109             if line[0] in ' \t':
 110                 if not lastheader:
 111                     raise Errors.HeaderParseError(
 112                         'Continuation line seen before first header')
 113                 lastvalue.append(line)
 114                 continue
 115             # Normal, non-continuation header.  BAW: this should check to make
 116             # sure it's a legal header, e.g. doesn't contain spaces.  Also, we
 117             # should expose the header matching algorithm in the API, and
 118             # allow for a non-strict parsing mode (that ignores the line
 119             # instead of raising the exception).
 120             i = line.find(':')
 121             if i < 0:
 122                 if self._strict:
 123                     raise Errors.HeaderParseError(
 124                         "Not a header, not a continuation: ``%s''" % line)
 125                 elif lineno == 1 and line.startswith('--'):
 126                     # allow through duplicate boundary tags.
 127                     continue
 128                 else:
 129                     # There was no separating blank line as mandated by RFC
 130                     # 2822, but we're in non-strict mode.  So just offer up
 131                     # this current line as the first body line.
 132                     firstbodyline = line
 133                     break
 134             if lastheader:
 135                 container[lastheader] = NL.join(lastvalue)
 136             lastheader = line[:i]
 137             lastvalue = [line[i+1:].lstrip()]
 138         # Make sure we retain the last header
 139         if lastheader:
 140             container[lastheader] = NL.join(lastvalue)
 141         return firstbodyline
 142
 143     def _parsebody(self, container, fp, firstbodyline=None):
 144         # Parse the body, but first split the payload on the content-type
 145         # boundary if present.
 146         boundary = container.get_boundary()
 147         isdigest = (container.get_content_type() == 'multipart/digest')
 148         # If there's a boundary, split the payload text into its constituent
 149         # parts and parse each separately.  Otherwise, just parse the rest of
 150         # the body as a single message.  Note: any exceptions raised in the
 151         # recursive parse need to have their line numbers coerced.
 152         if boundary:
 153             preamble = epilogue = None
 154             # Split into subparts.  The first boundary we're looking for won't
 155             # always have a leading newline since we're at the start of the
 156             # body text, and there's not always a preamble before the first
 157             # boundary.
 158             separator = '--' + boundary
 159             payload = fp.read()
 160             if firstbodyline is not None:
 161                 payload = firstbodyline + '\n' + payload
 162             # We use an RE here because boundaries can have trailing
 163             # whitespace.
 164             mo = re.search(
 165                 r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
 166                 payload)
 167             if not mo:
 168                 if self._strict:
 169                     raise Errors.BoundaryError(
 170                         "Couldn't find starting boundary: %s" % boundary)
 171                 container.set_payload(payload)
 172                 return
 173             start = mo.start()
 174             if start > 0:
 175                 # there's some pre-MIME boundary preamble
 176                 preamble = payload[0:start]
 177             # Find out what kind of line endings we're using
 178             start += len(mo.group('sep')) + len(mo.group('ws'))
 179             mo = NLCRE.search(payload, start)
 180             if mo:
 181                 start += len(mo.group(0))
 182             # We create a compiled regexp first because we need to be able to
 183             # specify the start position, and the module function doesn't
 184             # support this signature. :(
 185             cre = re.compile('(?P<sep>\r\n|\r|\n)' +
 186                              re.escape(separator) + '--')
 187             mo = cre.search(payload, start)
 188             if mo:
 189                 terminator = mo.start()
 190                 linesep = mo.group('sep')
 191                 if mo.end() < len(payload):
 192                     # There's some post-MIME boundary epilogue
 193                     epilogue = payload[mo.end():]
 194             elif self._strict:
 195                 raise Errors.BoundaryError(
 196                         "Couldn't find terminating boundary: %s" % boundary)
 197             else:
 198                 # Handle the case of no trailing boundary.  Check that it ends
 199                 # in a blank line.  Some cases (spamspamspam) don't even have
 200                 # that!
 201                 mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
 202                 if not mo:
 203                     mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
 204                     if not mo:
 205                         raise Errors.BoundaryError(
 206                           'No terminating boundary and no trailing empty line')
 207                 linesep = mo.group('sep')
 208                 terminator = len(payload)
 209             # We split the textual payload on the boundary separator, which
 210             # includes the trailing newline. If the container is a
 211             # multipart/digest then the subparts are by default message/rfc822
 212             # instead of text/plain.  In that case, they'll have a optional
 213             # block of MIME headers, then an empty line followed by the
 214             # message headers.
 215             parts = re.split(
 216                 linesep + re.escape(separator) + r'[ \t]*' + linesep,
 217                 payload[start:terminator])
 218             for part in parts:
 219                 if isdigest:
 220                     if part.startswith(linesep):
 221                         # There's no header block so create an empty message
 222                         # object as the container, and lop off the newline so
 223                         # we can parse the sub-subobject
 224                         msgobj = self._class()
 225                         part = part[len(linesep):]
 226                     else:
 227                         parthdrs, part = part.split(linesep+linesep, 1)
 228                         # msgobj in this case is the "message/rfc822" container
 229                         msgobj = self.parsestr(parthdrs, headersonly=1)
 230                     # while submsgobj is the message itself
 231                     msgobj.set_default_type('message/rfc822')
 232                     maintype = msgobj.get_content_maintype()
 233                     if maintype in ('message', 'multipart'):
 234                         submsgobj = self.parsestr(part)
 235                         msgobj.attach(submsgobj)
 236                     else:
 237                         msgobj.set_payload(part)
 238                 else:
 239                     msgobj = self.parsestr(part)
 240                 container.preamble = preamble
 241                 container.epilogue = epilogue
 242                 container.attach(msgobj)
 243         elif container.get_main_type() == 'multipart':
 244             # Very bad.  A message is a multipart with no boundary!
 245             raise Errors.BoundaryError(
 246                 'multipart message with no defined boundary')
 247         elif container.get_type() == 'message/delivery-status':
 248             # This special kind of type contains blocks of headers separated
 249             # by a blank line.  We'll represent each header block as a
 250             # separate Message object
 251             blocks = []
 252             while True:
 253                 blockmsg = self._class()
 254                 self._parseheaders(blockmsg, fp)
 255                 if not len(blockmsg):
 256                     # No more header blocks left
 257                     break
 258                 blocks.append(blockmsg)
 259             container.set_payload(blocks)
 260         elif container.get_main_type() == 'message':
 261             # Create a container for the payload, but watch out for there not
 262             # being any headers left
 263             try:
 264                 msg = self.parse(fp)
 265             except Errors.HeaderParseError:
 266                 msg = self._class()
 267                 self._parsebody(msg, fp)
 268             container.attach(msg)
 269         else:
 270             text = fp.read()
 271             if firstbodyline is not None:
 272                 text = firstbodyline + '\n' + text
 273             container.set_payload(text)
 274
 275
 276 \f
 277 class HeaderParser(Parser):
 278     """A subclass of Parser, this one only meaningfully parses message headers.
 279
 280     This class can be used if all you're interested in is the headers of a
 281     message.  While it consumes the message body, it does not parse it, but
 282     simply makes it available as a string payload.
 283
 284     Parsing with this subclass can be considerably faster if all you're
 285     interested in is the message headers.
 286     """
 287     def _parsebody(self, container, fp, firstbodyline=None):
 288         # Consume but do not parse, the body
 289         text = fp.read()
 290         if firstbodyline is not None:
 291             text = firstbodyline + '\n' + text
 292         container.set_payload(text)