Lib/rfc822.py

   1 """RFC 2822 message manipulation.
   2
   3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
   4 the tokenizing of addresses does not adhere to all the quoting rules.
   5
   6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
   7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
   8 effort at RFC 2822 updates has been made, but a thorough audit has not been
   9 performed.  Consider any RFC 2822 non-conformance to be a bug.
  10
  11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
  12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
  13
  14 Directions for use:
  15
  16 To create a Message object: first open a file, e.g.:
  17
  18   fp = open(file, 'r')
  19
  20 You can use any other legal way of getting an open file object, e.g. use
  21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
  22 constructor:
  23
  24   m = Message(fp)
  25
  26 This class can work with any input object that supports a readline method.  If
  27 the input object has seek and tell capability, the rewindbody method will
  28 work; also illegal lines will be pushed back onto the input stream.  If the
  29 input object lacks seek but has an `unread' method that can push back a line
  30 of input, Message will use that to push back illegal lines.  Thus this class
  31 can be used to parse messages coming from a buffered stream.
  32
  33 The optional `seekable' argument is provided as a workaround for certain stdio
  34 libraries in which tell() discards buffered data before discovering that the
  35 lseek() system call doesn't work.  For maximum portability, you should set the
  36 seekable argument to zero to prevent that initial tell() call when passing in
  37 an unseekable object such as a file object created from a socket object.  If
  38 it is 1 on entry -- which it is by default -- the tell() method of the open
  39 file object is called once; if this raises an exception, seekable is reset to
  40 0.  For other nonzero values of seekable, this test is not made.
  41
  42 To get the text of a particular header there are several methods:
  43
  44   str = m.getheader(name)
  45   str = m.getrawheader(name)
  46
  47 where name is the name of the header, e.g. 'Subject'.  The difference is that
  48 getheader() strips the leading and trailing whitespace, while getrawheader()
  49 doesn't.  Both functions retain embedded whitespace (including newlines)
  50 exactly as they are specified in the header, and leave the case of the text
  51 unchanged.
  52
  53 For addresses and address lists there are functions
  54
  55   realname, mailaddress = m.getaddr(name)
  56   list = m.getaddrlist(name)
  57
  58 where the latter returns a list of (realname, mailaddr) tuples.
  59
  60 There is also a method
  61
  62   time = m.getdate(name)
  63
  64 which parses a Date-like field and returns a time-compatible tuple,
  65 i.e. a tuple such as returned by time.localtime() or accepted by
  66 time.mktime().
  67
  68 See the class definition for lower level access methods.
  69
  70 There are also some utility functions here.
  71 """
  72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
  73
  74 import time
  75
  76 __all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
  77
  78 _blanklines = ('\r\n', '\n')            # Optimization for islast()
  79
  80
  81 class Message:
  82     """Represents a single RFC 2822-compliant message."""
  83
  84     def __init__(self, fp, seekable = 1):
  85         """Initialize the class instance and read the headers."""
  86         if seekable == 1:
  87             # Exercise tell() to make sure it works
  88             # (and then assume seek() works, too)
  89             try:
  90                 fp.tell()
  91             except (AttributeError, IOError):
  92                 seekable = 0
  93             else:
  94                 seekable = 1
  95         self.fp = fp
  96         self.seekable = seekable
  97         self.startofheaders = None
  98         self.startofbody = None
  99         #
 100         if self.seekable:
 101             try:
 102                 self.startofheaders = self.fp.tell()
 103             except IOError:
 104                 self.seekable = 0
 105         #
 106         self.readheaders()
 107         #
 108         if self.seekable:
 109             try:
 110                 self.startofbody = self.fp.tell()
 111             except IOError:
 112                 self.seekable = 0
 113
 114     def rewindbody(self):
 115         """Rewind the file to the start of the body (if seekable)."""
 116         if not self.seekable:
 117             raise IOError, "unseekable file"
 118         self.fp.seek(self.startofbody)
 119
 120     def readheaders(self):
 121         """Read header lines.
 122
 123         Read header lines up to the entirely blank line that terminates them.
 124         The (normally blank) line that ends the headers is skipped, but not
 125         included in the returned list.  If a non-header line ends the headers,
 126         (which is an error), an attempt is made to backspace over it; it is
 127         never included in the returned list.
 128
 129         The variable self.status is set to the empty string if all went well,
 130         otherwise it is an error message.  The variable self.headers is a
 131         completely uninterpreted list of lines contained in the header (so
 132         printing them will reproduce the header exactly as it appears in the
 133         file).
 134         """
 135         self.dict = {}
 136         self.unixfrom = ''
 137         self.headers = list = []
 138         self.status = ''
 139         headerseen = ""
 140         firstline = 1
 141         startofline = unread = tell = None
 142         if hasattr(self.fp, 'unread'):
 143             unread = self.fp.unread
 144         elif self.seekable:
 145             tell = self.fp.tell
 146         while 1:
 147             if tell:
 148                 try:
 149                     startofline = tell()
 150                 except IOError:
 151                     startofline = tell = None
 152                     self.seekable = 0
 153             line = self.fp.readline()
 154             if not line:
 155                 self.status = 'EOF in headers'
 156                 break
 157             # Skip unix From name time lines
 158             if firstline and line.startswith('From '):
 159                 self.unixfrom = self.unixfrom + line
 160                 continue
 161             firstline = 0
 162             if headerseen and line[0] in ' \t':
 163                 # It's a continuation line.
 164                 list.append(line)
 165                 x = (self.dict[headerseen] + "\n " + line.strip())
 166                 self.dict[headerseen] = x.strip()
 167                 continue
 168             elif self.iscomment(line):
 169                 # It's a comment.  Ignore it.
 170                 continue
 171             elif self.islast(line):
 172                 # Note! No pushback here!  The delimiter line gets eaten.
 173                 break
 174             headerseen = self.isheader(line)
 175             if headerseen:
 176                 # It's a legal header line, save it.
 177                 list.append(line)
 178                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
 179                 continue
 180             else:
 181                 # It's not a header line; throw it back and stop here.
 182                 if not self.dict:
 183                     self.status = 'No headers'
 184                 else:
 185                     self.status = 'Non-header line where header expected'
 186                 # Try to undo the read.
 187                 if unread:
 188                     unread(line)
 189                 elif tell:
 190                     self.fp.seek(startofline)
 191                 else:
 192                     self.status = self.status + '; bad seek'
 193                 break
 194
 195     def isheader(self, line):
 196         """Determine whether a given line is a legal header.
 197
 198         This method should return the header name, suitably canonicalized.
 199         You may override this method in order to use Message parsing on tagged
 200         data in RFC 2822-like formats with special header formats.
 201         """
 202         i = line.find(':')
 203         if i > 0:
 204             return line[:i].lower()
 205         else:
 206             return None
 207
 208     def islast(self, line):
 209         """Determine whether a line is a legal end of RFC 2822 headers.
 210
 211         You may override this method if your application wants to bend the
 212         rules, e.g. to strip trailing whitespace, or to recognize MH template
 213         separators ('--------').  For convenience (e.g. for code reading from
 214         sockets) a line consisting of \r\n also matches.
 215         """
 216         return line in _blanklines
 217
 218     def iscomment(self, line):
 219         """Determine whether a line should be skipped entirely.
 220
 221         You may override this method in order to use Message parsing on tagged
 222         data in RFC 2822-like formats that support embedded comments or
 223         free-text data.
 224         """
 225         return False
 226
 227     def getallmatchingheaders(self, name):
 228         """Find all header lines matching a given header name.
 229
 230         Look through the list of headers and find all lines matching a given
 231         header name (and their continuation lines).  A list of the lines is
 232         returned, without interpretation.  If the header does not occur, an
 233         empty list is returned.  If the header occurs multiple times, all
 234         occurrences are returned.  Case is not important in the header name.
 235         """
 236         name = name.lower() + ':'
 237         n = len(name)
 238         list = []
 239         hit = 0
 240         for line in self.headers:
 241             if line[:n].lower() == name:
 242                 hit = 1
 243             elif not line[:1].isspace():
 244                 hit = 0
 245             if hit:
 246                 list.append(line)
 247         return list
 248
 249     def getfirstmatchingheader(self, name):
 250         """Get the first header line matching name.
 251
 252         This is similar to getallmatchingheaders, but it returns only the
 253         first matching header (and its continuation lines).
 254         """
 255         name = name.lower() + ':'
 256         n = len(name)
 257         list = []
 258         hit = 0
 259         for line in self.headers:
 260             if hit:
 261                 if not line[:1].isspace():
 262                     break
 263             elif line[:n].lower() == name:
 264                 hit = 1
 265             if hit:
 266                 list.append(line)
 267         return list
 268
 269     def getrawheader(self, name):
 270         """A higher-level interface to getfirstmatchingheader().
 271
 272         Return a string containing the literal text of the header but with the
 273         keyword stripped.  All leading, trailing and embedded whitespace is
 274         kept in the string, however.  Return None if the header does not
 275         occur.
 276         """
 277
 278         list = self.getfirstmatchingheader(name)
 279         if not list:
 280             return None
 281         list[0] = list[0][len(name) + 1:]
 282         return ''.join(list)
 283
 284     def getheader(self, name, default=None):
 285         """Get the header value for a name.
 286
 287         This is the normal interface: it returns a stripped version of the
 288         header value for a given header name, or None if it doesn't exist.
 289         This uses the dictionary version which finds the *last* such header.
 290         """
 291         try:
 292             return self.dict[name.lower()]
 293         except KeyError:
 294             return default
 295     get = getheader
 296
 297     def getheaders(self, name):
 298         """Get all values for a header.
 299
 300         This returns a list of values for headers given more than once; each
 301         value in the result list is stripped in the same way as the result of
 302         getheader().  If the header is not given, return an empty list.
 303         """
 304         result = []
 305         current = ''
 306         have_header = 0
 307         for s in self.getallmatchingheaders(name):
 308             if s[0].isspace():
 309                 if current:
 310                     current = "%s\n %s" % (current, s.strip())
 311                 else:
 312                     current = s.strip()
 313             else:
 314                 if have_header:
 315                     result.append(current)
 316                 current = s[s.find(":") + 1:].strip()
 317                 have_header = 1
 318         if have_header:
 319             result.append(current)
 320         return result
 321
 322     def getaddr(self, name):
 323         """Get a single address from a header, as a tuple.
 324
 325         An example return value:
 326         ('Guido van Rossum', 'guido@cwi.nl')
 327         """
 328         # New, by Ben Escoto
 329         alist = self.getaddrlist(name)
 330         if alist:
 331             return alist[0]
 332         else:
 333             return (None, None)
 334
 335     def getaddrlist(self, name):
 336         """Get a list of addresses from a header.
 337
 338         Retrieves a list of addresses from a header, where each address is a
 339         tuple as returned by getaddr().  Scans all named headers, so it works
 340         properly with multiple To: or Cc: headers for example.
 341         """
 342         raw = []
 343         for h in self.getallmatchingheaders(name):
 344             if h[0] in ' \t':
 345                 raw.append(h)
 346             else:
 347                 if raw:
 348                     raw.append(', ')
 349                 i = h.find(':')
 350                 if i > 0:
 351                     addr = h[i+1:]
 352                 raw.append(addr)
 353         alladdrs = ''.join(raw)
 354         a = AddressList(alladdrs)
 355         return a.addresslist
 356
 357     def getdate(self, name):
 358         """Retrieve a date field from a header.
 359
 360         Retrieves a date field from the named header, returning a tuple
 361         compatible with time.mktime().
 362         """
 363         try:
 364             data = self[name]
 365         except KeyError:
 366             return None
 367         return parsedate(data)
 368
 369     def getdate_tz(self, name):
 370         """Retrieve a date field from a header as a 10-tuple.
 371
 372         The first 9 elements make up a tuple compatible with time.mktime(),
 373         and the 10th is the offset of the poster's time zone from GMT/UTC.
 374         """
 375         try:
 376             data = self[name]
 377         except KeyError:
 378             return None
 379         return parsedate_tz(data)
 380
 381
 382     # Access as a dictionary (only finds *last* header of each type):
 383
 384     def __len__(self):
 385         """Get the number of headers in a message."""
 386         return len(self.dict)
 387
 388     def __getitem__(self, name):
 389         """Get a specific header, as from a dictionary."""
 390         return self.dict[name.lower()]
 391
 392     def __setitem__(self, name, value):
 393         """Set the value of a header.
 394
 395         Note: This is not a perfect inversion of __getitem__, because any
 396         changed headers get stuck at the end of the raw-headers list rather
 397         than where the altered header was.
 398         """
 399         del self[name] # Won't fail if it doesn't exist
 400         self.dict[name.lower()] = value
 401         text = name + ": " + value
 402         lines = text.split("\n")
 403         for line in lines:
 404             self.headers.append(line + "\n")
 405
 406     def __delitem__(self, name):
 407         """Delete all occurrences of a specific header, if it is present."""
 408         name = name.lower()
 409         if not name in self.dict:
 410             return
 411         del self.dict[name]
 412         name = name + ':'
 413         n = len(name)
 414         list = []
 415         hit = 0
 416         for i in range(len(self.headers)):
 417             line = self.headers[i]
 418             if line[:n].lower() == name:
 419                 hit = 1
 420             elif not line[:1].isspace():
 421                 hit = 0
 422             if hit:
 423                 list.append(i)
 424         list.reverse()
 425         for i in list:
 426             del self.headers[i]
 427
 428     def setdefault(self, name, default=""):
 429         lowername = name.lower()
 430         if lowername in self.dict:
 431             return self.dict[lowername]
 432         else:
 433             text = name + ": " + default
 434             lines = text.split("\n")
 435             for line in lines:
 436                 self.headers.append(line + "\n")
 437             self.dict[lowername] = default
 438             return default
 439
 440     def has_key(self, name):
 441         """Determine whether a message contains the named header."""
 442         return name.lower() in self.dict
 443
 444     def __contains__(self, name):
 445         """Determine whether a message contains the named header."""
 446         return name.lower() in self.dict
 447
 448     def keys(self):
 449         """Get all of a message's header field names."""
 450         return self.dict.keys()
 451
 452     def values(self):
 453         """Get all of a message's header field values."""
 454         return self.dict.values()
 455
 456     def items(self):
 457         """Get all of a message's headers.
 458
 459         Returns a list of name, value tuples.
 460         """
 461         return self.dict.items()
 462
 463     def __str__(self):
 464         str = ''
 465         for hdr in self.headers:
 466             str = str + hdr
 467         return str
 468
 469
 470 # Utility functions
 471 # -----------------
 472
 473 # XXX Should fix unquote() and quote() to be really conformant.
 474 # XXX The inverses of the parse functions may also be useful.
 475
 476
 477 def unquote(str):
 478     """Remove quotes from a string."""
 479     if len(str) > 1:
 480         if str.startswith('"') and str.endswith('"'):
 481             return str[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 482         if str.startswith('<') and str.endswith('>'):
 483             return str[1:-1]
 484     return str
 485
 486
 487 def quote(str):
 488     """Add quotes around a string."""
 489     return str.replace('\\', '\\\\').replace('"', '\\"')
 490
 491
 492 def parseaddr(address):
 493     """Parse an address into a (realname, mailaddr) tuple."""
 494     a = AddressList(address)
 495     list = a.addresslist
 496     if not list:
 497         return (None, None)
 498     else:
 499         return list[0]
 500
 501
 502 class AddrlistClass:
 503     """Address parser class by Ben Escoto.
 504
 505     To understand what this class does, it helps to have a copy of
 506     RFC 2822 in front of you.
 507
 508     http://www.faqs.org/rfcs/rfc2822.html
 509
 510     Note: this class interface is deprecated and may be removed in the future.
 511     Use rfc822.AddressList instead.
 512     """
 513
 514     def __init__(self, field):
 515         """Initialize a new instance.
 516
 517         `field' is an unparsed address header field, containing one or more
 518         addresses.
 519         """
 520         self.specials = '()<>@,:;.\"[]'
 521         self.pos = 0
 522         self.LWS = ' \t'
 523         self.CR = '\r\n'
 524         self.atomends = self.specials + self.LWS + self.CR
 525         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 526         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 527         # syntax, so allow dots in phrases.
 528         self.phraseends = self.atomends.replace('.', '')
 529         self.field = field
 530         self.commentlist = []
 531
 532     def gotonext(self):
 533         """Parse up to the start of the next address."""
 534         while self.pos < len(self.field):
 535             if self.field[self.pos] in self.LWS + '\n\r':
 536                 self.pos = self.pos + 1
 537             elif self.field[self.pos] == '(':
 538                 self.commentlist.append(self.getcomment())
 539             else: break
 540
 541     def getaddrlist(self):
 542         """Parse all addresses.
 543
 544         Returns a list containing all of the addresses.
 545         """
 546         result = []
 547         while 1:
 548             ad = self.getaddress()
 549             if ad:
 550                 result += ad
 551             else:
 552                 break
 553         return result
 554
 555     def getaddress(self):
 556         """Parse the next address."""
 557         self.commentlist = []
 558         self.gotonext()
 559
 560         oldpos = self.pos
 561         oldcl = self.commentlist
 562         plist = self.getphraselist()
 563
 564         self.gotonext()
 565         returnlist = []
 566
 567         if self.pos >= len(self.field):
 568             # Bad email address technically, no domain.
 569             if plist:
 570                 returnlist = [(' '.join(self.commentlist), plist[0])]
 571
 572         elif self.field[self.pos] in '.@':
 573             # email address is just an addrspec
 574             # this isn't very efficient since we start over
 575             self.pos = oldpos
 576             self.commentlist = oldcl
 577             addrspec = self.getaddrspec()
 578             returnlist = [(' '.join(self.commentlist), addrspec)]
 579
 580         elif self.field[self.pos] == ':':
 581             # address is a group
 582             returnlist = []
 583
 584             fieldlen = len(self.field)
 585             self.pos = self.pos + 1
 586             while self.pos < len(self.field):
 587                 self.gotonext()
 588                 if self.pos < fieldlen and self.field[self.pos] == ';':
 589                     self.pos = self.pos + 1
 590                     break
 591                 returnlist = returnlist + self.getaddress()
 592
 593         elif self.field[self.pos] == '<':
 594             # Address is a phrase then a route addr
 595             routeaddr = self.getrouteaddr()
 596
 597             if self.commentlist:
 598                 returnlist = [(' '.join(plist) + ' (' + \
 599                          ' '.join(self.commentlist) + ')', routeaddr)]
 600             else: returnlist = [(' '.join(plist), routeaddr)]
 601
 602         else:
 603             if plist:
 604                 returnlist = [(' '.join(self.commentlist), plist[0])]
 605             elif self.field[self.pos] in self.specials:
 606                 self.pos = self.pos + 1
 607
 608         self.gotonext()
 609         if self.pos < len(self.field) and self.field[self.pos] == ',':
 610             self.pos = self.pos + 1
 611         return returnlist
 612
 613     def getrouteaddr(self):
 614         """Parse a route address (Return-path value).
 615
 616         This method just skips all the route stuff and returns the addrspec.
 617         """
 618         if self.field[self.pos] != '<':
 619             return
 620
 621         expectroute = 0
 622         self.pos = self.pos + 1
 623         self.gotonext()
 624         adlist = ""
 625         while self.pos < len(self.field):
 626             if expectroute:
 627                 self.getdomain()
 628                 expectroute = 0
 629             elif self.field[self.pos] == '>':
 630                 self.pos = self.pos + 1
 631                 break
 632             elif self.field[self.pos] == '@':
 633                 self.pos = self.pos + 1
 634                 expectroute = 1
 635             elif self.field[self.pos] == ':':
 636                 self.pos = self.pos + 1
 637             else:
 638                 adlist = self.getaddrspec()
 639                 self.pos = self.pos + 1
 640                 break
 641             self.gotonext()
 642
 643         return adlist
 644
 645     def getaddrspec(self):
 646         """Parse an RFC 2822 addr-spec."""
 647         aslist = []
 648
 649         self.gotonext()
 650         while self.pos < len(self.field):
 651             if self.field[self.pos] == '.':
 652                 aslist.append('.')
 653                 self.pos = self.pos + 1
 654             elif self.field[self.pos] == '"':
 655                 aslist.append('"%s"' % self.getquote())
 656             elif self.field[self.pos] in self.atomends:
 657                 break
 658             else: aslist.append(self.getatom())
 659             self.gotonext()
 660
 661         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 662             return ''.join(aslist)
 663
 664         aslist.append('@')
 665         self.pos = self.pos + 1
 666         self.gotonext()
 667         return ''.join(aslist) + self.getdomain()
 668
 669     def getdomain(self):
 670         """Get the complete domain name from an address."""
 671         sdlist = []
 672         while self.pos < len(self.field):
 673             if self.field[self.pos] in self.LWS:
 674                 self.pos = self.pos + 1
 675             elif self.field[self.pos] == '(':
 676                 self.commentlist.append(self.getcomment())
 677             elif self.field[self.pos] == '[':
 678                 sdlist.append(self.getdomainliteral())
 679             elif self.field[self.pos] == '.':
 680                 self.pos = self.pos + 1
 681                 sdlist.append('.')
 682             elif self.field[self.pos] in self.atomends:
 683                 break
 684             else: sdlist.append(self.getatom())
 685         return ''.join(sdlist)
 686
 687     def getdelimited(self, beginchar, endchars, allowcomments = 1):
 688         """Parse a header fragment delimited by special characters.
 689
 690         `beginchar' is the start character for the fragment.  If self is not
 691         looking at an instance of `beginchar' then getdelimited returns the
 692         empty string.
 693
 694         `endchars' is a sequence of allowable end-delimiting characters.
 695         Parsing stops when one of these is encountered.
 696
 697         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 698         within the parsed fragment.
 699         """
 700         if self.field[self.pos] != beginchar:
 701             return ''
 702
 703         slist = ['']
 704         quote = 0
 705         self.pos = self.pos + 1
 706         while self.pos < len(self.field):
 707             if quote == 1:
 708                 slist.append(self.field[self.pos])
 709                 quote = 0
 710             elif self.field[self.pos] in endchars:
 711                 self.pos = self.pos + 1
 712                 break
 713             elif allowcomments and self.field[self.pos] == '(':
 714                 slist.append(self.getcomment())
 715             elif self.field[self.pos] == '\\':
 716                 quote = 1
 717             else:
 718                 slist.append(self.field[self.pos])
 719             self.pos = self.pos + 1
 720
 721         return ''.join(slist)
 722
 723     def getquote(self):
 724         """Get a quote-delimited fragment from self's field."""
 725         return self.getdelimited('"', '"\r', 0)
 726
 727     def getcomment(self):
 728         """Get a parenthesis-delimited fragment from self's field."""
 729         return self.getdelimited('(', ')\r', 1)
 730
 731     def getdomainliteral(self):
 732         """Parse an RFC 2822 domain-literal."""
 733         return '[%s]' % self.getdelimited('[', ']\r', 0)
 734
 735     def getatom(self, atomends=None):
 736         """Parse an RFC 2822 atom.
 737
 738         Optional atomends specifies a different set of end token delimiters
 739         (the default is to use self.atomends).  This is used e.g. in
 740         getphraselist() since phrase endings must not include the `.' (which
 741         is legal in phrases)."""
 742         atomlist = ['']
 743         if atomends is None:
 744             atomends = self.atomends
 745
 746         while self.pos < len(self.field):
 747             if self.field[self.pos] in atomends:
 748                 break
 749             else: atomlist.append(self.field[self.pos])
 750             self.pos = self.pos + 1
 751
 752         return ''.join(atomlist)
 753
 754     def getphraselist(self):
 755         """Parse a sequence of RFC 2822 phrases.
 756
 757         A phrase is a sequence of words, which are in turn either RFC 2822
 758         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 759         runs of continuous whitespace into one space.
 760         """
 761         plist = []
 762
 763         while self.pos < len(self.field):
 764             if self.field[self.pos] in self.LWS:
 765                 self.pos = self.pos + 1
 766             elif self.field[self.pos] == '"':
 767                 plist.append(self.getquote())
 768             elif self.field[self.pos] == '(':
 769                 self.commentlist.append(self.getcomment())
 770             elif self.field[self.pos] in self.phraseends:
 771                 break
 772             else:
 773                 plist.append(self.getatom(self.phraseends))
 774
 775         return plist
 776
 777 class AddressList(AddrlistClass):
 778     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 779     def __init__(self, field):
 780         AddrlistClass.__init__(self, field)
 781         if field:
 782             self.addresslist = self.getaddrlist()
 783         else:
 784             self.addresslist = []
 785
 786     def __len__(self):
 787         return len(self.addresslist)
 788
 789     def __str__(self):
 790         return ", ".join(map(dump_address_pair, self.addresslist))
 791
 792     def __add__(self, other):
 793         # Set union
 794         newaddr = AddressList(None)
 795         newaddr.addresslist = self.addresslist[:]
 796         for x in other.addresslist:
 797             if not x in self.addresslist:
 798                 newaddr.addresslist.append(x)
 799         return newaddr
 800
 801     def __iadd__(self, other):
 802         # Set union, in-place
 803         for x in other.addresslist:
 804             if not x in self.addresslist:
 805                 self.addresslist.append(x)
 806         return self
 807
 808     def __sub__(self, other):
 809         # Set difference
 810         newaddr = AddressList(None)
 811         for x in self.addresslist:
 812             if not x in other.addresslist:
 813                 newaddr.addresslist.append(x)
 814         return newaddr
 815
 816     def __isub__(self, other):
 817         # Set difference, in-place
 818         for x in other.addresslist:
 819             if x in self.addresslist:
 820                 self.addresslist.remove(x)
 821         return self
 822
 823     def __getitem__(self, index):
 824         # Make indexing, slices, and 'in' work
 825         return self.addresslist[index]
 826
 827 def dump_address_pair(pair):
 828     """Dump a (name, address) pair in a canonicalized form."""
 829     if pair[0]:
 830         return '"' + pair[0] + '" <' + pair[1] + '>'
 831     else:
 832         return pair[1]
 833
 834 # Parse a date field
 835
# Lowercased month names accepted by parsedate_tz(): the twelve
# three-letter abbreviations followed by the twelve full names ('may'
# therefore appears twice).
# NOTE(review): the position in this list presumably determines the month
# number in the date parser -- confirm against the consumer before
# reordering or inserting entries.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
# Lowercased three-letter day names; parsedate_tz() uses this list to
# recognize (and skip) a leading day name.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# NOTE(review): the values look like signed HHMM-style offsets from UTC
# (e.g. -500 for EST) rather than minutes or seconds -- confirm against
# the code that consumes this table.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
 855
 856
 857 def parsedate_tz(data):
 858     """Convert a date string to a time tuple.
 859
 860     Accounts for military timezones.
 861     """
 862     if not data:
 863         return None
 864     data = data.split()
 865     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
 866         # There's a dayname here. Skip it
 867         del data[0]
 868     if len(data) == 3: # RFC 850 date, deprecated
 869         stuff = data[0].split('-')
 870         if len(stuff) == 3:
 871             data = stuff + data[1:]
 872     if len(data) == 4:
 873         s = data[3]
 874         i = s.find('+')
 875         if i > 0:
 876             data[3:] = [s[:i], s[i+1:]]
 877         else:
 878             data.append('') # Dummy tz
 879     if len(data) < 5:
 880         return None
 881     data = data[:5]
 882     [dd, mm, yy, tm, tz] = data
 883     mm = mm.lower()
 884     if not mm in _monthnames:
 885         dd, mm = mm, dd.lower()
 886         if not mm in _monthnames:
 887             return None
 888     mm = _monthnames.index(mm)+1
 889     if mm > 12: mm = mm - 12
 890     if dd[-1] == ',':
 891         dd = dd[:-1]
 892     i = yy.find(':')
 893     if i > 0:
 894         yy, tm = tm, yy
 895     if yy[-1] == ',':
 896         yy = yy[:-1]
 897     if not yy[0].isdigit():
 898         yy, tz = tz, yy
 899     if tm[-1] == ',':
 900         tm = tm[:-1]
 901     tm = tm.split(':')
 902     if len(tm) == 2:
 903         [thh, tmm] = tm
 904         tss = '0'
 905     elif len(tm) == 3:
 906         [thh, tmm, tss] = tm
 907     else:
 908         return None
 909     try:
 910         yy = int(yy)
 911         dd = int(dd)
 912         thh = int(thh)
 913         tmm = int(tmm)
 914         tss = int(tss)
 915     except ValueError:
 916         return None
 917     tzoffset = None
 918     tz = tz.upper()
 919     if tz in _timezones:
 920         tzoffset = _timezones[tz]
 921     else:
 922         try:
 923             tzoffset = int(tz)
 924         except ValueError:
 925             pass
 926     # Convert a timezone offset into seconds ; -0500 -> -18000
 927     if tzoffset:
 928         if tzoffset < 0:
 929             tzsign = -1
 930             tzoffset = -tzoffset
 931         else:
 932             tzsign = 1
 933         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 934     tuple = (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
 935     return tuple
 936
 937
 938 def parsedate(data):
 939     """Convert a time string to a time tuple."""
 940     t = parsedate_tz(data)
 941     if type(t) == type( () ):
 942         return t[:9]
 943     else: return t
 944
 945
 946 def mktime_tz(data):
 947     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 948     if data[9] is None:
 949         # No zone info, so localtime is better assumption than GMT
 950         return time.mktime(data[:8] + (-1,))
 951     else:
 952         t = time.mktime(data[:8] + (0,))
 953         return t - data[9] - time.timezone
 954
 955 def formatdate(timeval=None):
 956     """Returns time format preferred for Internet standards.
 957
 958     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
 959
 960     According to RFC 1123, day and month names must always be in
 961     English.  If not for that, this code could use strftime().  It
 962     can't because strftime() honors the locale and could generated
 963     non-English names.
 964     """
 965     if timeval is None:
 966         timeval = time.time()
 967     timeval = time.gmtime(timeval)
 968     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
 969             ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][timeval[6]],
 970             timeval[2],
 971             ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
 972              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][timeval[1]-1],
 973                                 timeval[0], timeval[3], timeval[4], timeval[5])
 974
 975
# When used as script, run a small test program.
# The first command line argument, if given, names a file containing one
# message in RFC-822 format; the default is $HOME/Mail/inbox/1.
 979
 980 if __name__ == '__main__':
 981     import sys, os
 982     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
 983     if sys.argv[1:]: file = sys.argv[1]
 984     f = open(file, 'r')
 985     m = Message(f)
 986     print 'From:', m.getaddr('from')
 987     print 'To:', m.getaddrlist('to')
 988     print 'Subject:', m.getheader('subject')
 989     print 'Date:', m.getheader('date')
 990     date = m.getdate_tz('date')
 991     tz = date[-1]
 992     date = time.localtime(mktime_tz(date))
 993     if date:
 994         print 'ParsedDate:', time.asctime(date),
 995         hhmmss = tz
 996         hhmm, ss = divmod(hhmmss, 60)
 997         hh, mm = divmod(hhmm, 60)
 998         print "%+03d%02d" % (hh, mm),
 999         if ss: print ".%02d" % ss,
1000         print
1001     else:
1002         print 'ParsedDate:', None
1003     m.rewindbody()
1004     n = 0
1005     while f.readline():
1006         n = n + 1
1007     print 'Lines:', n
1008     print '-'*70
1009     print 'len =', len(m)
1010     if 'Date' in m: print 'Date =', m['Date']
1011     if 'X-Nonsense' in m: pass
1012     print 'keys =', m.keys()
1013     print 'values =', m.values()
1014     print 'items =', m.items()