Lib/rfc822.py

   1 """RFC 2822 message manipulation.
   2
   3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
   4 the tokenizing of addresses does not adhere to all the quoting rules.
   5
   6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
   7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
   8 effort at RFC 2822 updates has been made, but a thorough audit has not been
   9 performed.  Consider any RFC 2822 non-conformance to be a bug.
  10
  11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
  12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
  13
  14 Directions for use:
  15
  16 To create a Message object: first open a file, e.g.:
  17
  18   fp = open(file, 'r')
  19
  20 You can use any other legal way of getting an open file object, e.g. use
  21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
  22 constructor:
  23
  24   m = Message(fp)
  25
  26 This class can work with any input object that supports a readline method.  If
  27 the input object has seek and tell capability, the rewindbody method will
  28 work; also illegal lines will be pushed back onto the input stream.  If the
  29 input object lacks seek but has an `unread' method that can push back a line
  30 of input, Message will use that to push back illegal lines.  Thus this class
  31 can be used to parse messages coming from a buffered stream.
  32
  33 The optional `seekable' argument is provided as a workaround for certain stdio
  34 libraries in which tell() discards buffered data before discovering that the
  35 lseek() system call doesn't work.  For maximum portability, you should set the
  36 seekable argument to zero to prevent that initial tell() call when passing in
  37 an unseekable object such as a file object created from a socket object.  If
  38 it is 1 on entry -- which it is by default -- the tell() method of the open
  39 file object is called once; if this raises an exception, seekable is reset to
  40 0.  For other nonzero values of seekable, this test is not made.
  41
  42 To get the text of a particular header there are several methods:
  43
  44   str = m.getheader(name)
  45   str = m.getrawheader(name)
  46
  47 where name is the name of the header, e.g. 'Subject'.  The difference is that
  48 getheader() strips the leading and trailing whitespace, while getrawheader()
  49 doesn't.  Both functions retain embedded whitespace (including newlines)
  50 exactly as they are specified in the header, and leave the case of the text
  51 unchanged.
  52
  53 For addresses and address lists there are functions
  54
  55   realname, mailaddress = m.getaddr(name)
  56   list = m.getaddrlist(name)
  57
  58 where the latter returns a list of (realname, mailaddr) tuples.
  59
  60 There is also a method
  61
  62   time = m.getdate(name)
  63
  64 which parses a Date-like field and returns a time-compatible tuple,
  65 i.e. a tuple such as returned by time.localtime() or accepted by
  66 time.mktime().
  67
  68 See the class definition for lower level access methods.
  69
  70 There are also some utility functions here.
  71 """
  72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
  73
  74 import time
  75
# Names exported by "from rfc822 import *".
__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

# The two line values that Message.islast() treats as the blank line that
# terminates the headers: a bare newline, or CR LF (e.g. when reading from a
# socket).  Kept as a module-level tuple so islast() is a single `in' test.
_blanklines = ('\r\n', '\n')            # Optimization for islast()
  79
  80
  81 class Message:
  82     """Represents a single RFC 2822-compliant message."""
  83
  84     def __init__(self, fp, seekable = 1):
  85         """Initialize the class instance and read the headers."""
  86         if seekable == 1:
  87             # Exercise tell() to make sure it works
  88             # (and then assume seek() works, too)
  89             try:
  90                 fp.tell()
  91             except (AttributeError, IOError):
  92                 seekable = 0
  93             else:
  94                 seekable = 1
  95         self.fp = fp
  96         self.seekable = seekable
  97         self.startofheaders = None
  98         self.startofbody = None
  99         #
 100         if self.seekable:
 101             try:
 102                 self.startofheaders = self.fp.tell()
 103             except IOError:
 104                 self.seekable = 0
 105         #
 106         self.readheaders()
 107         #
 108         if self.seekable:
 109             try:
 110                 self.startofbody = self.fp.tell()
 111             except IOError:
 112                 self.seekable = 0
 113
 114     def rewindbody(self):
 115         """Rewind the file to the start of the body (if seekable)."""
 116         if not self.seekable:
 117             raise IOError, "unseekable file"
 118         self.fp.seek(self.startofbody)
 119
 120     def readheaders(self):
 121         """Read header lines.
 122
 123         Read header lines up to the entirely blank line that terminates them.
 124         The (normally blank) line that ends the headers is skipped, but not
 125         included in the returned list.  If a non-header line ends the headers,
 126         (which is an error), an attempt is made to backspace over it; it is
 127         never included in the returned list.
 128
 129         The variable self.status is set to the empty string if all went well,
 130         otherwise it is an error message.  The variable self.headers is a
 131         completely uninterpreted list of lines contained in the header (so
 132         printing them will reproduce the header exactly as it appears in the
 133         file).
 134         """
 135         self.dict = {}
 136         self.unixfrom = ''
 137         self.headers = list = []
 138         self.status = ''
 139         headerseen = ""
 140         firstline = 1
 141         startofline = unread = tell = None
 142         if hasattr(self.fp, 'unread'):
 143             unread = self.fp.unread
 144         elif self.seekable:
 145             tell = self.fp.tell
 146         while 1:
 147             if tell:
 148                 try:
 149                     startofline = tell()
 150                 except IOError:
 151                     startofline = tell = None
 152                     self.seekable = 0
 153             line = self.fp.readline()
 154             if not line:
 155                 self.status = 'EOF in headers'
 156                 break
 157             # Skip unix From name time lines
 158             if firstline and line.startswith('From '):
 159                 self.unixfrom = self.unixfrom + line
 160                 continue
 161             firstline = 0
 162             if headerseen and line[0] in ' \t':
 163                 # It's a continuation line.
 164                 list.append(line)
 165                 x = (self.dict[headerseen] + "\n " + line.strip())
 166                 self.dict[headerseen] = x.strip()
 167                 continue
 168             elif self.iscomment(line):
 169                 # It's a comment.  Ignore it.
 170                 continue
 171             elif self.islast(line):
 172                 # Note! No pushback here!  The delimiter line gets eaten.
 173                 break
 174             headerseen = self.isheader(line)
 175             if headerseen:
 176                 # It's a legal header line, save it.
 177                 list.append(line)
 178                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
 179                 continue
 180             else:
 181                 # It's not a header line; throw it back and stop here.
 182                 if not self.dict:
 183                     self.status = 'No headers'
 184                 else:
 185                     self.status = 'Non-header line where header expected'
 186                 # Try to undo the read.
 187                 if unread:
 188                     unread(line)
 189                 elif tell:
 190                     self.fp.seek(startofline)
 191                 else:
 192                     self.status = self.status + '; bad seek'
 193                 break
 194
 195     def isheader(self, line):
 196         """Determine whether a given line is a legal header.
 197
 198         This method should return the header name, suitably canonicalized.
 199         You may override this method in order to use Message parsing on tagged
 200         data in RFC 2822-like formats with special header formats.
 201         """
 202         i = line.find(':')
 203         if i > 0:
 204             return line[:i].lower()
 205         else:
 206             return None
 207
 208     def islast(self, line):
 209         """Determine whether a line is a legal end of RFC 2822 headers.
 210
 211         You may override this method if your application wants to bend the
 212         rules, e.g. to strip trailing whitespace, or to recognize MH template
 213         separators ('--------').  For convenience (e.g. for code reading from
 214         sockets) a line consisting of \r\n also matches.
 215         """
 216         return line in _blanklines
 217
 218     def iscomment(self, line):
 219         """Determine whether a line should be skipped entirely.
 220
 221         You may override this method in order to use Message parsing on tagged
 222         data in RFC 2822-like formats that support embedded comments or
 223         free-text data.
 224         """
 225         return None
 226
 227     def getallmatchingheaders(self, name):
 228         """Find all header lines matching a given header name.
 229
 230         Look through the list of headers and find all lines matching a given
 231         header name (and their continuation lines).  A list of the lines is
 232         returned, without interpretation.  If the header does not occur, an
 233         empty list is returned.  If the header occurs multiple times, all
 234         occurrences are returned.  Case is not important in the header name.
 235         """
 236         name = name.lower() + ':'
 237         n = len(name)
 238         list = []
 239         hit = 0
 240         for line in self.headers:
 241             if line[:n].lower() == name:
 242                 hit = 1
 243             elif not line[:1].isspace():
 244                 hit = 0
 245             if hit:
 246                 list.append(line)
 247         return list
 248
 249     def getfirstmatchingheader(self, name):
 250         """Get the first header line matching name.
 251
 252         This is similar to getallmatchingheaders, but it returns only the
 253         first matching header (and its continuation lines).
 254         """
 255         name = name.lower() + ':'
 256         n = len(name)
 257         list = []
 258         hit = 0
 259         for line in self.headers:
 260             if hit:
 261                 if not line[:1].isspace():
 262                     break
 263             elif line[:n].lower() == name:
 264                 hit = 1
 265             if hit:
 266                 list.append(line)
 267         return list
 268
 269     def getrawheader(self, name):
 270         """A higher-level interface to getfirstmatchingheader().
 271
 272         Return a string containing the literal text of the header but with the
 273         keyword stripped.  All leading, trailing and embedded whitespace is
 274         kept in the string, however.  Return None if the header does not
 275         occur.
 276         """
 277
 278         list = self.getfirstmatchingheader(name)
 279         if not list:
 280             return None
 281         list[0] = list[0][len(name) + 1:]
 282         return ''.join(list)
 283
 284     def getheader(self, name, default=None):
 285         """Get the header value for a name.
 286
 287         This is the normal interface: it returns a stripped version of the
 288         header value for a given header name, or None if it doesn't exist.
 289         This uses the dictionary version which finds the *last* such header.
 290         """
 291         try:
 292             return self.dict[name.lower()]
 293         except KeyError:
 294             return default
 295     get = getheader
 296
 297     def getheaders(self, name):
 298         """Get all values for a header.
 299
 300         This returns a list of values for headers given more than once; each
 301         value in the result list is stripped in the same way as the result of
 302         getheader().  If the header is not given, return an empty list.
 303         """
 304         result = []
 305         current = ''
 306         have_header = 0
 307         for s in self.getallmatchingheaders(name):
 308             if s[0].isspace():
 309                 if current:
 310                     current = "%s\n %s" % (current, s.strip())
 311                 else:
 312                     current = s.strip()
 313             else:
 314                 if have_header:
 315                     result.append(current)
 316                 current = s[s.find(":") + 1:].strip()
 317                 have_header = 1
 318         if have_header:
 319             result.append(current)
 320         return result
 321
 322     def getaddr(self, name):
 323         """Get a single address from a header, as a tuple.
 324
 325         An example return value:
 326         ('Guido van Rossum', 'guido@cwi.nl')
 327         """
 328         # New, by Ben Escoto
 329         alist = self.getaddrlist(name)
 330         if alist:
 331             return alist[0]
 332         else:
 333             return (None, None)
 334
 335     def getaddrlist(self, name):
 336         """Get a list of addresses from a header.
 337
 338         Retrieves a list of addresses from a header, where each address is a
 339         tuple as returned by getaddr().  Scans all named headers, so it works
 340         properly with multiple To: or Cc: headers for example.
 341         """
 342         raw = []
 343         for h in self.getallmatchingheaders(name):
 344             if h[0] in ' \t':
 345                 raw.append(h)
 346             else:
 347                 if raw:
 348                     raw.append(', ')
 349                 i = h.find(':')
 350                 if i > 0:
 351                     addr = h[i+1:]
 352                 raw.append(addr)
 353         alladdrs = ''.join(raw)
 354         a = AddrlistClass(alladdrs)
 355         return a.getaddrlist()
 356
 357     def getdate(self, name):
 358         """Retrieve a date field from a header.
 359
 360         Retrieves a date field from the named header, returning a tuple
 361         compatible with time.mktime().
 362         """
 363         try:
 364             data = self[name]
 365         except KeyError:
 366             return None
 367         return parsedate(data)
 368
 369     def getdate_tz(self, name):
 370         """Retrieve a date field from a header as a 10-tuple.
 371
 372         The first 9 elements make up a tuple compatible with time.mktime(),
 373         and the 10th is the offset of the poster's time zone from GMT/UTC.
 374         """
 375         try:
 376             data = self[name]
 377         except KeyError:
 378             return None
 379         return parsedate_tz(data)
 380
 381
 382     # Access as a dictionary (only finds *last* header of each type):
 383
 384     def __len__(self):
 385         """Get the number of headers in a message."""
 386         return len(self.dict)
 387
 388     def __getitem__(self, name):
 389         """Get a specific header, as from a dictionary."""
 390         return self.dict[name.lower()]
 391
 392     def __setitem__(self, name, value):
 393         """Set the value of a header.
 394
 395         Note: This is not a perfect inversion of __getitem__, because any
 396         changed headers get stuck at the end of the raw-headers list rather
 397         than where the altered header was.
 398         """
 399         del self[name] # Won't fail if it doesn't exist
 400         self.dict[name.lower()] = value
 401         text = name + ": " + value
 402         lines = text.split("\n")
 403         for line in lines:
 404             self.headers.append(line + "\n")
 405
 406     def __delitem__(self, name):
 407         """Delete all occurrences of a specific header, if it is present."""
 408         name = name.lower()
 409         if not self.dict.has_key(name):
 410             return
 411         del self.dict[name]
 412         name = name + ':'
 413         n = len(name)
 414         list = []
 415         hit = 0
 416         for i in range(len(self.headers)):
 417             line = self.headers[i]
 418             if line[:n].lower() == name:
 419                 hit = 1
 420             elif not line[:1].isspace():
 421                 hit = 0
 422             if hit:
 423                 list.append(i)
 424         list.reverse()
 425         for i in list:
 426             del self.headers[i]
 427
 428     def get(self, name, default=""):
 429         name = name.lower()
 430         if self.dict.has_key(name):
 431             return self.dict[name]
 432         else:
 433             return default
 434
 435     def setdefault(self, name, default=""):
 436         lowername = name.lower()
 437         if self.dict.has_key(lowername):
 438             return self.dict[lowername]
 439         else:
 440             text = name + ": " + default
 441             lines = text.split("\n")
 442             for line in lines:
 443                 self.headers.append(line + "\n")
 444             self.dict[lowername] = default
 445             return default
 446
 447     def has_key(self, name):
 448         """Determine whether a message contains the named header."""
 449         return self.dict.has_key(name.lower())
 450
 451     def keys(self):
 452         """Get all of a message's header field names."""
 453         return self.dict.keys()
 454
 455     def values(self):
 456         """Get all of a message's header field values."""
 457         return self.dict.values()
 458
 459     def items(self):
 460         """Get all of a message's headers.
 461
 462         Returns a list of name, value tuples.
 463         """
 464         return self.dict.items()
 465
 466     def __str__(self):
 467         str = ''
 468         for hdr in self.headers:
 469             str = str + hdr
 470         return str
 471
 472
 473 # Utility functions
 474 # -----------------
 475
 476 # XXX Should fix unquote() and quote() to be really conformant.
 477 # XXX The inverses of the parse functions may also be useful.
 478
 479
 480 def unquote(str):
 481     """Remove quotes from a string."""
 482     if len(str) > 1:
 483         if str[0] == '"' and str[-1:] == '"':
 484             return str[1:-1]
 485         if str[0] == '<' and str[-1:] == '>':
 486             return str[1:-1]
 487     return str
 488
 489
 490 def quote(str):
 491     """Add quotes around a string."""
 492     return str.replace('\\', '\\\\').replace('"', '\\"')
 493
 494
 495 def parseaddr(address):
 496     """Parse an address into a (realname, mailaddr) tuple."""
 497     a = AddrlistClass(address)
 498     list = a.getaddrlist()
 499     if not list:
 500         return (None, None)
 501     else:
 502         return list[0]
 503
 504
 505 class AddrlistClass:
 506     """Address parser class by Ben Escoto.
 507
 508     To understand what this class does, it helps to have a copy of
 509     RFC 2822 in front of you.
 510
 511     http://www.faqs.org/rfcs/rfc2822.html
 512
 513     Note: this class interface is deprecated and may be removed in the future.
 514     Use rfc822.AddressList instead.
 515     """
 516
 517     def __init__(self, field):
 518         """Initialize a new instance.
 519
 520         `field' is an unparsed address header field, containing one or more
 521         addresses.
 522         """
 523         self.specials = '()<>@,:;.\"[]'
 524         self.pos = 0
 525         self.LWS = ' \t'
 526         self.CR = '\r\n'
 527         self.atomends = self.specials + self.LWS + self.CR
 528         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 529         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 530         # syntax, so allow dots in phrases.
 531         self.phraseends = self.atomends.replace('.', '')
 532         self.field = field
 533         self.commentlist = []
 534
 535     def gotonext(self):
 536         """Parse up to the start of the next address."""
 537         while self.pos < len(self.field):
 538             if self.field[self.pos] in self.LWS + '\n\r':
 539                 self.pos = self.pos + 1
 540             elif self.field[self.pos] == '(':
 541                 self.commentlist.append(self.getcomment())
 542             else: break
 543
 544     def getaddrlist(self):
 545         """Parse all addresses.
 546
 547         Returns a list containing all of the addresses.
 548         """
 549         result = []
 550         while 1:
 551             ad = self.getaddress()
 552             if ad:
 553                 result += ad
 554             else:
 555                 break
 556         return result
 557
 558     def getaddress(self):
 559         """Parse the next address."""
 560         self.commentlist = []
 561         self.gotonext()
 562
 563         oldpos = self.pos
 564         oldcl = self.commentlist
 565         plist = self.getphraselist()
 566
 567         self.gotonext()
 568         returnlist = []
 569
 570         if self.pos >= len(self.field):
 571             # Bad email address technically, no domain.
 572             if plist:
 573                 returnlist = [(' '.join(self.commentlist), plist[0])]
 574
 575         elif self.field[self.pos] in '.@':
 576             # email address is just an addrspec
 577             # this isn't very efficient since we start over
 578             self.pos = oldpos
 579             self.commentlist = oldcl
 580             addrspec = self.getaddrspec()
 581             returnlist = [(' '.join(self.commentlist), addrspec)]
 582
 583         elif self.field[self.pos] == ':':
 584             # address is a group
 585             returnlist = []
 586
 587             fieldlen = len(self.field)
 588             self.pos = self.pos + 1
 589             while self.pos < len(self.field):
 590                 self.gotonext()
 591                 if self.pos < fieldlen and self.field[self.pos] == ';':
 592                     self.pos = self.pos + 1
 593                     break
 594                 returnlist = returnlist + self.getaddress()
 595
 596         elif self.field[self.pos] == '<':
 597             # Address is a phrase then a route addr
 598             routeaddr = self.getrouteaddr()
 599
 600             if self.commentlist:
 601                 returnlist = [(' '.join(plist) + ' (' + \
 602                          ' '.join(self.commentlist) + ')', routeaddr)]
 603             else: returnlist = [(' '.join(plist), routeaddr)]
 604
 605         else:
 606             if plist:
 607                 returnlist = [(' '.join(self.commentlist), plist[0])]
 608             elif self.field[self.pos] in self.specials:
 609                 self.pos = self.pos + 1
 610
 611         self.gotonext()
 612         if self.pos < len(self.field) and self.field[self.pos] == ',':
 613             self.pos = self.pos + 1
 614         return returnlist
 615
 616     def getrouteaddr(self):
 617         """Parse a route address (Return-path value).
 618
 619         This method just skips all the route stuff and returns the addrspec.
 620         """
 621         if self.field[self.pos] != '<':
 622             return
 623
 624         expectroute = 0
 625         self.pos = self.pos + 1
 626         self.gotonext()
 627         adlist = ""
 628         while self.pos < len(self.field):
 629             if expectroute:
 630                 self.getdomain()
 631                 expectroute = 0
 632             elif self.field[self.pos] == '>':
 633                 self.pos = self.pos + 1
 634                 break
 635             elif self.field[self.pos] == '@':
 636                 self.pos = self.pos + 1
 637                 expectroute = 1
 638             elif self.field[self.pos] == ':':
 639                 self.pos = self.pos + 1
 640             else:
 641                 adlist = self.getaddrspec()
 642                 self.pos = self.pos + 1
 643                 break
 644             self.gotonext()
 645
 646         return adlist
 647
 648     def getaddrspec(self):
 649         """Parse an RFC 2822 addr-spec."""
 650         aslist = []
 651
 652         self.gotonext()
 653         while self.pos < len(self.field):
 654             if self.field[self.pos] == '.':
 655                 aslist.append('.')
 656                 self.pos = self.pos + 1
 657             elif self.field[self.pos] == '"':
 658                 aslist.append('"%s"' % self.getquote())
 659             elif self.field[self.pos] in self.atomends:
 660                 break
 661             else: aslist.append(self.getatom())
 662             self.gotonext()
 663
 664         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 665             return ''.join(aslist)
 666
 667         aslist.append('@')
 668         self.pos = self.pos + 1
 669         self.gotonext()
 670         return ''.join(aslist) + self.getdomain()
 671
 672     def getdomain(self):
 673         """Get the complete domain name from an address."""
 674         sdlist = []
 675         while self.pos < len(self.field):
 676             if self.field[self.pos] in self.LWS:
 677                 self.pos = self.pos + 1
 678             elif self.field[self.pos] == '(':
 679                 self.commentlist.append(self.getcomment())
 680             elif self.field[self.pos] == '[':
 681                 sdlist.append(self.getdomainliteral())
 682             elif self.field[self.pos] == '.':
 683                 self.pos = self.pos + 1
 684                 sdlist.append('.')
 685             elif self.field[self.pos] in self.atomends:
 686                 break
 687             else: sdlist.append(self.getatom())
 688         return ''.join(sdlist)
 689
 690     def getdelimited(self, beginchar, endchars, allowcomments = 1):
 691         """Parse a header fragment delimited by special characters.
 692
 693         `beginchar' is the start character for the fragment.  If self is not
 694         looking at an instance of `beginchar' then getdelimited returns the
 695         empty string.
 696
 697         `endchars' is a sequence of allowable end-delimiting characters.
 698         Parsing stops when one of these is encountered.
 699
 700         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 701         within the parsed fragment.
 702         """
 703         if self.field[self.pos] != beginchar:
 704             return ''
 705
 706         slist = ['']
 707         quote = 0
 708         self.pos = self.pos + 1
 709         while self.pos < len(self.field):
 710             if quote == 1:
 711                 slist.append(self.field[self.pos])
 712                 quote = 0
 713             elif self.field[self.pos] in endchars:
 714                 self.pos = self.pos + 1
 715                 break
 716             elif allowcomments and self.field[self.pos] == '(':
 717                 slist.append(self.getcomment())
 718             elif self.field[self.pos] == '\\':
 719                 quote = 1
 720             else:
 721                 slist.append(self.field[self.pos])
 722             self.pos = self.pos + 1
 723
 724         return ''.join(slist)
 725
 726     def getquote(self):
 727         """Get a quote-delimited fragment from self's field."""
 728         return self.getdelimited('"', '"\r', 0)
 729
 730     def getcomment(self):
 731         """Get a parenthesis-delimited fragment from self's field."""
 732         return self.getdelimited('(', ')\r', 1)
 733
 734     def getdomainliteral(self):
 735         """Parse an RFC 2822 domain-literal."""
 736         return '[%s]' % self.getdelimited('[', ']\r', 0)
 737
 738     def getatom(self, atomends=None):
 739         """Parse an RFC 2822 atom.
 740
 741         Optional atomends specifies a different set of end token delimiters
 742         (the default is to use self.atomends).  This is used e.g. in
 743         getphraselist() since phrase endings must not include the `.' (which
 744         is legal in phrases)."""
 745         atomlist = ['']
 746         if atomends is None:
 747             atomends = self.atomends
 748
 749         while self.pos < len(self.field):
 750             if self.field[self.pos] in atomends:
 751                 break
 752             else: atomlist.append(self.field[self.pos])
 753             self.pos = self.pos + 1
 754
 755         return ''.join(atomlist)
 756
 757     def getphraselist(self):
 758         """Parse a sequence of RFC 2822 phrases.
 759
 760         A phrase is a sequence of words, which are in turn either RFC 2822
 761         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 762         runs of continuous whitespace into one space.
 763         """
 764         plist = []
 765
 766         while self.pos < len(self.field):
 767             if self.field[self.pos] in self.LWS:
 768                 self.pos = self.pos + 1
 769             elif self.field[self.pos] == '"':
 770                 plist.append(self.getquote())
 771             elif self.field[self.pos] == '(':
 772                 self.commentlist.append(self.getcomment())
 773             elif self.field[self.pos] in self.phraseends:
 774                 break
 775             else:
 776                 plist.append(self.getatom(self.phraseends))
 777
 778         return plist
 779
 780 class AddressList(AddrlistClass):
 781     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 782     def __init__(self, field):
 783         AddrlistClass.__init__(self, field)
 784         if field:
 785             self.addresslist = self.getaddrlist()
 786         else:
 787             self.addresslist = []
 788
 789     def __len__(self):
 790         return len(self.addresslist)
 791
 792     def __str__(self):
 793         return ", ".join(map(dump_address_pair, self.addresslist))
 794
 795     def __add__(self, other):
 796         # Set union
 797         newaddr = AddressList(None)
 798         newaddr.addresslist = self.addresslist[:]
 799         for x in other.addresslist:
 800             if not x in self.addresslist:
 801                 newaddr.addresslist.append(x)
 802         return newaddr
 803
 804     def __iadd__(self, other):
 805         # Set union, in-place
 806         for x in other.addresslist:
 807             if not x in self.addresslist:
 808                 self.addresslist.append(x)
 809         return self
 810
 811     def __sub__(self, other):
 812         # Set difference
 813         newaddr = AddressList(None)
 814         for x in self.addresslist:
 815             if not x in other.addresslist:
 816                 newaddr.addresslist.append(x)
 817         return newaddr
 818
 819     def __isub__(self, other):
 820         # Set difference, in-place
 821         for x in other.addresslist:
 822             if x in self.addresslist:
 823                 self.addresslist.remove(x)
 824         return self
 825
 826     def __getitem__(self, index):
 827         # Make indexing, slices, and 'in' work
 828         return self.addresslist[index]
 829
 830 def dump_address_pair(pair):
 831     """Dump a (name, address) pair in a canonicalized form."""
 832     if pair[0]:
 833         return '"' + pair[0] + '" <' + pair[1] + '>'
 834     else:
 835         return pair[1]
 836
# Parse a date field

# Lower-cased month names: the twelve three-letter abbreviations first, then
# the twelve full names.  'may' is its own abbreviation, so it appears in
# both halves of the list.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
# Lower-cased three-letter day names, Monday first.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Maps a zone name to its offset from UTC written as a signed HHMM number
# (e.g. -500 for EST).
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
 858
 859
 860 def parsedate_tz(data):
 861     """Convert a date string to a time tuple.
 862
 863     Accounts for military timezones.
 864     """
 865     if not data:
 866         return None
 867     data = data.split()
 868     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
 869         # There's a dayname here. Skip it
 870         del data[0]
 871     if len(data) == 3: # RFC 850 date, deprecated
 872         stuff = data[0].split('-')
 873         if len(stuff) == 3:
 874             data = stuff + data[1:]
 875     if len(data) == 4:
 876         s = data[3]
 877         i = s.find('+')
 878         if i > 0:
 879             data[3:] = [s[:i], s[i+1:]]
 880         else:
 881             data.append('') # Dummy tz
 882     if len(data) < 5:
 883         return None
 884     data = data[:5]
 885     [dd, mm, yy, tm, tz] = data
 886     mm = mm.lower()
 887     if not mm in _monthnames:
 888         dd, mm = mm, dd.lower()
 889         if not mm in _monthnames:
 890             return None
 891     mm = _monthnames.index(mm)+1
 892     if mm > 12: mm = mm - 12
 893     if dd[-1] == ',':
 894         dd = dd[:-1]
 895     i = yy.find(':')
 896     if i > 0:
 897         yy, tm = tm, yy
 898     if yy[-1] == ',':
 899         yy = yy[:-1]
 900     if not yy[0].isdigit():
 901         yy, tz = tz, yy
 902     if tm[-1] == ',':
 903         tm = tm[:-1]
 904     tm = tm.split(':')
 905     if len(tm) == 2:
 906         [thh, tmm] = tm
 907         tss = '0'
 908     elif len(tm) == 3:
 909         [thh, tmm, tss] = tm
 910     else:
 911         return None
 912     try:
 913         yy = int(yy)
 914         dd = int(dd)
 915         thh = int(thh)
 916         tmm = int(tmm)
 917         tss = int(tss)
 918     except ValueError:
 919         return None
 920     tzoffset = None
 921     tz = tz.upper()
 922     if _timezones.has_key(tz):
 923         tzoffset = _timezones[tz]
 924     else:
 925         try:
 926             tzoffset = int(tz)
 927         except ValueError:
 928             pass
 929     # Convert a timezone offset into seconds ; -0500 -> -18000
 930     if tzoffset:
 931         if tzoffset < 0:
 932             tzsign = -1
 933             tzoffset = -tzoffset
 934         else:
 935             tzsign = 1
 936         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 937     tuple = (yy, mm, dd, thh, tmm, tss, 0, 0, 0, tzoffset)
 938     return tuple
 939
 940
 941 def parsedate(data):
 942     """Convert a time string to a time tuple."""
 943     t = parsedate_tz(data)
 944     if type(t) == type( () ):
 945         return t[:9]
 946     else: return t
 947
 948
 949 def mktime_tz(data):
 950     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 951     if data[9] is None:
 952         # No zone info, so localtime is better assumption than GMT
 953         return time.mktime(data[:8] + (-1,))
 954     else:
 955         t = time.mktime(data[:8] + (0,))
 956         return t - data[9] - time.timezone
 957
 958 def formatdate(timeval=None):
 959     """Returns time format preferred for Internet standards.
 960
 961     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
 962
 963     According to RFC 1123, day and month names must always be in
 964     English.  If not for that, this code could use strftime().  It
 965     can't because strftime() honors the locale and could generated
 966     non-English names.
 967     """
 968     if timeval is None:
 969         timeval = time.time()
 970     timeval = time.gmtime(timeval)
 971     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
 972             ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][timeval[6]],
 973             timeval[2],
 974             ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
 975              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"][timeval[1]-1],
 976                                 timeval[0], timeval[3], timeval[4], timeval[5])
 977
 978
 979 # When used as script, run a small test program.
 980 # The first command line argument must be a filename containing one
 981 # message in RFC-822 format.
 982
 983 if __name__ == '__main__':
 984     import sys, os
 985     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
 986     if sys.argv[1:]: file = sys.argv[1]
 987     f = open(file, 'r')
 988     m = Message(f)
 989     print 'From:', m.getaddr('from')
 990     print 'To:', m.getaddrlist('to')
 991     print 'Subject:', m.getheader('subject')
 992     print 'Date:', m.getheader('date')
 993     date = m.getdate_tz('date')
 994     tz = date[-1]
 995     date = time.localtime(mktime_tz(date))
 996     if date:
 997         print 'ParsedDate:', time.asctime(date),
 998         hhmmss = tz
 999         hhmm, ss = divmod(hhmmss, 60)
1000         hh, mm = divmod(hhmm, 60)
1001         print "%+03d%02d" % (hh, mm),
1002         if ss: print ".%02d" % ss,
1003         print
1004     else:
1005         print 'ParsedDate:', None
1006     m.rewindbody()
1007     n = 0
1008     while f.readline():
1009         n = n + 1
1010     print 'Lines:', n
1011     print '-'*70
1012     print 'len =', len(m)
1013     if m.has_key('Date'): print 'Date =', m['Date']
1014     if m.has_key('X-Nonsense'): pass
1015     print 'keys =', m.keys()
1016     print 'values =', m.values()
1017     print 'items =', m.items()