1 """RFC-822 message manipulation class.
3 XXX This is only a very rough sketch of a full RFC-822 parser;
4 in particular the tokenizing of addresses does not adhere to all the
9 To create a Message object: first open a file, e.g.:
11 You can use any other legal way of getting an open file object, e.g. use
12 sys.stdin or call os.popen().
13 Then pass the open file object to the Message() constructor:
16 This class can work with any input object that supports a readline
17 method. If the input object has seek and tell capability, the
18 rewindbody method will work; also illegal lines will be pushed back
19 onto the input stream. If the input object lacks seek but has an
20 `unread' method that can push back a line of input, Message will use
21 that to push back illegal lines. Thus this class can be used to parse
22 messages coming from a buffered stream.
24 The optional `seekable' argument is provided as a workaround for
25 certain stdio libraries in which tell() discards buffered data before
26 discovering that the lseek() system call doesn't work. For maximum
27 portability, you should set the seekable argument to zero to prevent
28 that initial tell() call when passing in an unseekable object such as
29 a file object created from a socket object. If it is 1 on entry --
30 which it is by default -- the tell() method of the open file object is
31 called once; if this raises an exception, seekable is reset to 0. For
32 other nonzero values of seekable, this test is not made.
34 To get the text of a particular header there are several methods:
35 str = m.getheader(name)
36 str = m.getrawheader(name)
37 where name is the name of the header, e.g. 'Subject'.
38 The difference is that getheader() strips the leading and trailing
39 whitespace, while getrawheader() doesn't. Both functions retain
40 embedded whitespace (including newlines) exactly as they are
41 specified in the header, and leave the case of the text unchanged.
43 For addresses and address lists there are functions
44 realname, mailaddress = m.getaddr(name) and
45 list = m.getaddrlist(name)
46 where the latter returns a list of (realname, mailaddr) tuples.
48 There is also a method
49 time = m.getdate(name)
50 which parses a Date-like field and returns a time-compatible tuple,
51 i.e. a tuple such as returned by time.localtime() or accepted by
54 See the class definition for lower level access methods.
56 There are also some utility functions here.
58 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
64 _blanklines
= ('\r\n', '\n') # Optimization for islast()
68 """Represents a single RFC-822-compliant message."""
70 def __init__(self
, fp
, seekable
= 1):
71 """Initialize the class instance and read the headers."""
73 # Exercise tell() to make sure it works
74 # (and then assume seek() works, too)
82 self
.seekable
= seekable
83 self
.startofheaders
= None
84 self
.startofbody
= None
88 self
.startofheaders
= self
.fp
.tell()
96 self
.startofbody
= self
.fp
.tell()
def rewindbody(self):
    """Rewind the file to the start of the body (if seekable).

    Repositions self.fp at the offset held in self.startofbody.

    Returns:
        None.

    Raises:
        IOError: if self.seekable is false, i.e. the input object was
            flagged as not supporting seek/tell.
    """
    if not self.seekable:
        # Call-style raise works on both Python 2 and Python 3; the
        # comma form (raise IOError, "...") is a syntax error on 3.
        raise IOError("unseekable file")
    self.fp.seek(self.startofbody)
106 def readheaders(self
):
107 """Read header lines.
109 Read header lines up to the entirely blank line that
110 terminates them. The (normally blank) line that ends the
111 headers is skipped, but not included in the returned list.
112 If a non-header line ends the headers, (which is an error),
113 an attempt is made to backspace over it; it is never
114 included in the returned list.
116 The variable self.status is set to the empty string if all
117 went well, otherwise it is an error message.
118 The variable self.headers is a completely uninterpreted list
119 of lines contained in the header (so printing them will
120 reproduce the header exactly as it appears in the file).
124 self
.headers
= list = []
128 startofline
= unread
= tell
= None
129 if hasattr(self
.fp
, 'unread'):
130 unread
= self
.fp
.unread
136 line
= self
.fp
.readline()
138 self
.status
= 'EOF in headers'
140 # Skip unix From name time lines
141 if firstline
and line
[:5] == 'From ':
142 self
.unixfrom
= self
.unixfrom
+ line
145 if headerseen
and line
[0] in ' \t':
146 # It's a continuation line.
148 x
= (self
.dict[headerseen
] + "\n " + string
.strip(line
))
149 self
.dict[headerseen
] = string
.strip(x
)
151 elif self
.iscomment(line
):
152 # It's a comment. Ignore it.
154 elif self
.islast(line
):
155 # Note! No pushback here! The delimiter line gets eaten.
157 headerseen
= self
.isheader(line
)
159 # It's a legal header line, save it.
161 self
.dict[headerseen
] = string
.strip(line
[len(headerseen
)+2:])
164 # It's not a header line; throw it back and stop here.
166 self
.status
= 'No headers'
168 self
.status
= 'Non-header line where header expected'
169 # Try to undo the read.
173 self
.fp
.seek(startofline
)
175 self
.status
= self
.status
+ '; bad seek'
178 def isheader(self
, line
):
179 """Determine whether a given line is a legal header.
181 This method should return the header name, suitably canonicalized.
182 You may override this method in order to use Message parsing
183 on tagged data in RFC822-like formats with special header formats.
185 i
= string
.find(line
, ':')
187 return string
.lower(line
[:i
])
def islast(self, line):
    """Tell whether `line` is a legal end of the RFC-822 headers.

    A line qualifies when it is one of the entries of the module-level
    _blanklines tuple: a bare newline or, as a convenience for code
    reading from sockets, a carriage return followed by a newline.

    Subclasses may override this method to bend the rules, e.g. to
    strip trailing whitespace or to recognise MH template separators
    ('--------').
    """
    ends_headers = line in _blanklines
    return ends_headers
202 def iscomment(self
, line
):
203 """Determine whether a line should be skipped entirely.
205 You may override this method in order to use Message parsing
206 on tagged data in RFC822-like formats that support embedded
207 comments or free-text data.
211 def getallmatchingheaders(self
, name
):
212 """Find all header lines matching a given header name.
214 Look through the list of headers and find all lines
215 matching a given header name (and their continuation
216 lines). A list of the lines is returned, without
217 interpretation. If the header does not occur, an
218 empty list is returned. If the header occurs multiple
219 times, all occurrences are returned. Case is not
220 important in the header name.
222 name
= string
.lower(name
) + ':'
226 for line
in self
.headers
:
227 if string
.lower(line
[:n
]) == name
:
229 elif line
[:1] not in string
.whitespace
:
235 def getfirstmatchingheader(self
, name
):
236 """Get the first header line matching name.
238 This is similar to getallmatchingheaders, but it returns
239 only the first matching header (and its continuation
242 name
= string
.lower(name
) + ':'
246 for line
in self
.headers
:
248 if line
[:1] not in string
.whitespace
:
250 elif string
.lower(line
[:n
]) == name
:
256 def getrawheader(self
, name
):
257 """A higher-level interface to getfirstmatchingheader().
259 Return a string containing the literal text of the
260 header but with the keyword stripped. All leading,
261 trailing and embedded whitespace is kept in the
263 Return None if the header does not occur.
266 list = self
.getfirstmatchingheader(name
)
269 list[0] = list[0][len(name
) + 1:]
270 return string
.joinfields(list, '')
272 def getheader(self
, name
, default
=None):
273 """Get the header value for a name.
275 This is the normal interface: it returns a stripped
276 version of the header value for a given header name,
277 or None if it doesn't exist. This uses the dictionary
278 version which finds the *last* such header.
281 return self
.dict[string
.lower(name
)]
286 def getaddr(self
, name
):
287 """Get a single address from a header, as a tuple.
289 An example return value:
290 ('Guido van Rossum', 'guido@cwi.nl')
293 alist
= self
.getaddrlist(name
)
299 def getaddrlist(self
, name
):
300 """Get a list of addresses from a header.
302 Retrieves a list of addresses from a header, where each address is a
303 tuple as returned by getaddr(). Scans all named headers, so it works
304 properly with multiple To: or Cc: headers for example.
308 for h
in self
.getallmatchingheaders(name
):
314 i
= string
.find(h
, ':')
318 alladdrs
= string
.join(raw
, '')
319 a
= AddrlistClass(alladdrs
)
320 return a
.getaddrlist()
322 def getdate(self
, name
):
323 """Retrieve a date field from a header.
325 Retrieves a date field from the named header, returning
326 a tuple compatible with time.mktime().
332 return parsedate(data
)
334 def getdate_tz(self
, name
):
335 """Retrieve a date field from a header as a 10-tuple.
337 The first 9 elements make up a tuple compatible with
338 time.mktime(), and the 10th is the offset of the poster's
339 time zone from GMT/UTC.
345 return parsedate_tz(data
)
348 # Access as a dictionary (only finds *last* header of each type):
351 """Get the number of headers in a message."""
352 return len(self
.dict)
def __getitem__(self, name):
    """Get a specific header, as from a dictionary.

    The header name is lower-cased before it is looked up in self.dict,
    so 'Subject', 'subject' and 'SUBJECT' all address the same entry.

    Raises:
        KeyError: propagated from the self.dict lookup when the header
            is not present.
    """
    # str.lower() gives the same result as the deprecated
    # string.lower() module function and also exists on Python 3.
    return self.dict[name.lower()]
358 def __setitem__(self
, name
, value
):
359 """Set the value of a header.
361 Note: This is not a perfect inversion of __getitem__, because
362 any changed headers get stuck at the end of the raw-headers list
363 rather than where the altered header was.
365 del self
[name
] # Won't fail if it doesn't exist
366 self
.dict[string
.lower(name
)] = value
367 text
= name
+ ": " + value
368 lines
= string
.split(text
, "\n")
370 self
.headers
.append(line
+ "\n")
372 def __delitem__(self
, name
):
373 """Delete all occurrences of a specific header, if it is present."""
374 name
= string
.lower(name
)
375 if not self
.dict.has_key(name
):
382 for i
in range(len(self
.headers
)):
383 line
= self
.headers
[i
]
384 if string
.lower(line
[:n
]) == name
:
386 elif line
[:1] not in string
.whitespace
:
def has_key(self, name):
    """Determine whether a message contains the named header.

    The name is lower-cased before it is tested against self.dict, so
    the check is independent of the case the caller used.

    Returns:
        A true value if the header is present, a false value otherwise.
    """
    # The `in` operator and str.lower() replace dict.has_key() and
    # string.lower(), both of which are gone on Python 3.
    return name.lower() in self.dict
399 """Get all of a message's header field names."""
400 return self
.dict.keys()
403 """Get all of a message's header field values."""
404 return self
.dict.values()
407 """Get all of a message's headers.
409 Returns a list of name, value tuples.
411 return self
.dict.items()
415 for hdr
in self
.headers
:
423 # XXX Should fix unquote() and quote() to be really conformant.
424 # XXX The inverses of the parse functions may also be useful.
428 """Remove quotes from a string."""
430 if str[0] == '"' and str[-1:] == '"':
432 if str[0] == '<' and str[-1:] == '>':
438 """Add quotes around a string."""
439 return '"%s"' % string
.join(
442 string
.split(str, '\\'),
448 def parseaddr(address
):
449 """Parse an address into a (realname, mailaddr) tuple."""
450 a
= AddrlistClass(address
)
451 list = a
.getaddrlist()
459 """Address parser class by Ben Escoto.
461 To understand what this class does, it helps to have a copy of
462 RFC-822 in front of you.
464 Note: this class interface is deprecated and may be removed in the future.
465 Use rfc822.AddressList instead.
468 def __init__(self
, field
):
469 """Initialize a new instance.
471 `field' is an unparsed address header field, containing
472 one or more addresses.
474 self
.specials
= '()<>@,:;.\"[]'
478 self
.atomends
= self
.specials
+ self
.LWS
+ self
.CR
480 self
.commentlist
= []
483 """Parse up to the start of the next address."""
484 while self
.pos
< len(self
.field
):
485 if self
.field
[self
.pos
] in self
.LWS
+ '\n\r':
486 self
.pos
= self
.pos
+ 1
487 elif self
.field
[self
.pos
] == '(':
488 self
.commentlist
.append(self
.getcomment())
491 def getaddrlist(self
):
492 """Parse all addresses.
494 Returns a list containing all of the addresses.
496 ad
= self
.getaddress()
498 return ad
+ self
.getaddrlist()
501 def getaddress(self
):
502 """Parse the next address."""
503 self
.commentlist
= []
507 oldcl
= self
.commentlist
508 plist
= self
.getphraselist()
513 if self
.pos
>= len(self
.field
):
514 # Bad email address technically, no domain.
516 returnlist
= [(string
.join(self
.commentlist
), plist
[0])]
518 elif self
.field
[self
.pos
] in '.@':
519 # email address is just an addrspec
520 # this isn't very efficient since we start over
522 self
.commentlist
= oldcl
523 addrspec
= self
.getaddrspec()
524 returnlist
= [(string
.join(self
.commentlist
), addrspec
)]
526 elif self
.field
[self
.pos
] == ':':
530 self
.pos
= self
.pos
+ 1
531 while self
.pos
< len(self
.field
):
533 if self
.field
[self
.pos
] == ';':
534 self
.pos
= self
.pos
+ 1
536 returnlist
= returnlist
+ self
.getaddress()
538 elif self
.field
[self
.pos
] == '<':
539 # Address is a phrase then a route addr
540 routeaddr
= self
.getrouteaddr()
543 returnlist
= [(string
.join(plist
) + ' (' + \
544 string
.join(self
.commentlist
) + ')', routeaddr
)]
545 else: returnlist
= [(string
.join(plist
), routeaddr
)]
549 returnlist
= [(string
.join(self
.commentlist
), plist
[0])]
550 elif self
.field
[self
.pos
] in self
.specials
:
551 self
.pos
= self
.pos
+ 1
554 if self
.pos
< len(self
.field
) and self
.field
[self
.pos
] == ',':
555 self
.pos
= self
.pos
+ 1
558 def getrouteaddr(self
):
559 """Parse a route address (Return-path value).
561 This method just skips all the route stuff and returns the addrspec.
563 if self
.field
[self
.pos
] != '<':
567 self
.pos
= self
.pos
+ 1
570 while self
.pos
< len(self
.field
):
574 elif self
.field
[self
.pos
] == '>':
575 self
.pos
= self
.pos
+ 1
577 elif self
.field
[self
.pos
] == '@':
578 self
.pos
= self
.pos
+ 1
580 elif self
.field
[self
.pos
] == ':':
581 self
.pos
= self
.pos
+ 1
584 adlist
= self
.getaddrspec()
585 self
.pos
= self
.pos
+ 1
591 def getaddrspec(self
):
592 """Parse an RFC-822 addr-spec."""
596 while self
.pos
< len(self
.field
):
597 if self
.field
[self
.pos
] == '.':
599 self
.pos
= self
.pos
+ 1
600 elif self
.field
[self
.pos
] == '"':
601 aslist
.append(self
.getquote())
602 elif self
.field
[self
.pos
] in self
.atomends
:
604 else: aslist
.append(self
.getatom())
607 if self
.pos
>= len(self
.field
) or self
.field
[self
.pos
] != '@':
608 return string
.join(aslist
, '')
611 self
.pos
= self
.pos
+ 1
613 return string
.join(aslist
, '') + self
.getdomain()
616 """Get the complete domain name from an address."""
618 while self
.pos
< len(self
.field
):
619 if self
.field
[self
.pos
] in self
.LWS
:
620 self
.pos
= self
.pos
+ 1
621 elif self
.field
[self
.pos
] == '(':
622 self
.commentlist
.append(self
.getcomment())
623 elif self
.field
[self
.pos
] == '[':
624 sdlist
.append(self
.getdomainliteral())
625 elif self
.field
[self
.pos
] == '.':
626 self
.pos
= self
.pos
+ 1
628 elif self
.field
[self
.pos
] in self
.atomends
:
630 else: sdlist
.append(self
.getatom())
631 return string
.join(sdlist
, '')
633 def getdelimited(self
, beginchar
, endchars
, allowcomments
= 1):
634 """Parse a header fragment delimited by special characters.
636 `beginchar' is the start character for the fragment.
637 If self is not looking at an instance of `beginchar' then
638 getdelimited returns the empty string.
640 `endchars' is a sequence of allowable end-delimiting characters.
641 Parsing stops when one of these is encountered.
643 If `allowcomments' is non-zero, embedded RFC-822 comments
644 are allowed within the parsed fragment.
646 if self
.field
[self
.pos
] != beginchar
:
651 self
.pos
= self
.pos
+ 1
652 while self
.pos
< len(self
.field
):
654 slist
.append(self
.field
[self
.pos
])
656 elif self
.field
[self
.pos
] in endchars
:
657 self
.pos
= self
.pos
+ 1
659 elif allowcomments
and self
.field
[self
.pos
] == '(':
660 slist
.append(self
.getcomment())
661 elif self
.field
[self
.pos
] == '\\':
664 slist
.append(self
.field
[self
.pos
])
665 self
.pos
= self
.pos
+ 1
667 return string
.join(slist
, '')
670 """Get a quote-delimited fragment from self's field."""
671 return self
.getdelimited('"', '"\r', 0)
def getcomment(self):
    """Get a parenthesis-delimited fragment from self's field.

    Delegates to getdelimited(): the fragment must begin with '(' and
    ends at ')' or a carriage return; embedded comments are allowed.
    """
    opener = '('
    closers = ')\r'
    allow_embedded = 1
    return self.getdelimited(opener, closers, allow_embedded)
def getdomainliteral(self):
    """Parse an RFC-822 domain-literal.

    Delegates to getdelimited(): the literal must begin with '[' and
    ends at ']' or a carriage return; embedded comments are not
    recognised inside it.
    """
    opener = '['
    closers = ']\r'
    allow_embedded = 0
    return self.getdelimited(opener, closers, allow_embedded)
682 """Parse an RFC-822 atom."""
685 while self
.pos
< len(self
.field
):
686 if self
.field
[self
.pos
] in self
.atomends
:
688 else: atomlist
.append(self
.field
[self
.pos
])
689 self
.pos
= self
.pos
+ 1
691 return string
.join(atomlist
, '')
693 def getphraselist(self
):
694 """Parse a sequence of RFC-822 phrases.
696 A phrase is a sequence of words, which are in turn either
697 RFC-822 atoms or quoted-strings. Phrases are canonicalized
698 by squeezing all runs of continuous whitespace into one space.
702 while self
.pos
< len(self
.field
):
703 if self
.field
[self
.pos
] in self
.LWS
:
704 self
.pos
= self
.pos
+ 1
705 elif self
.field
[self
.pos
] == '"':
706 plist
.append(self
.getquote())
707 elif self
.field
[self
.pos
] == '(':
708 self
.commentlist
.append(self
.getcomment())
709 elif self
.field
[self
.pos
] in self
.atomends
:
711 else: plist
.append(self
.getatom())
715 class AddressList(AddrlistClass
):
716 """An AddressList encapsulates a list of parsed RFC822 addresses."""
717 def __init__(self
, field
):
718 AddrlistClass
.__init
__(self
, field
)
720 self
.addresslist
= self
.getaddrlist()
722 self
.addresslist
= []
725 return len(self
.addresslist
)
728 return string
.joinfields(map(dump_address_pair
, self
.addresslist
),", ")
730 def __add__(self
, other
):
732 newaddr
= AddressList(None)
733 newaddr
.addresslist
= self
.addresslist
[:]
734 for x
in other
.addresslist
:
735 if not x
in self
.addresslist
:
736 newaddr
.addresslist
.append(x
)
739 def __sub__(self
, other
):
741 newaddr
= AddressList(None)
742 for x
in self
.addresslist
:
743 if not x
in other
.addresslist
:
744 newaddr
.addresslist
.append(x
)
def __getitem__(self, index):
    """Return the entry (or slice of entries) at `index`.

    Defining this makes indexing, slicing and the 'in' operator work on
    the object by delegating to the self.addresslist list.

    Raises:
        IndexError: propagated from the list when `index` is out of
            range.
    """
    # The parsed addresses are stored in self.addresslist, the attribute
    # that __init__, __add__ and __sub__ use. Reading self.addrlist
    # (the old spelling here) raised AttributeError, since nothing in the
    # visible class assigns an attribute of that name.
    return self.addresslist[index]
751 def dump_address_pair(pair
):
752 """Dump a (name, address) pair in a canonicalized form."""
754 return '"' + pair
[0] + '" <' + pair
[1] + '>'
760 _monthnames
= ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
761 'aug', 'sep', 'oct', 'nov', 'dec',
762 'january', 'february', 'march', 'april', 'may', 'june', 'july',
763 'august', 'september', 'october', 'november', 'december']
764 _daynames
= ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
766 # The timezone table does not include the military time zones defined
767 # in RFC822, other than Z. According to RFC1123, the description in
768 # RFC822 gets the signs wrong, so we can't rely on any such time
769 # zones. RFC1123 recommends that numeric timezone indicators be used
770 # instead of timezone names.
772 _timezones
= {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
773 'AST': -400, 'ADT': -300, # Atlantic (used in Canada)
774 'EST': -500, 'EDT': -400, # Eastern
775 'CST': -600, 'CDT': -500, # Central
776 'MST': -700, 'MDT': -600, # Mountain
777 'PST': -800, 'PDT': -700 # Pacific
781 def parsedate_tz(data
):
782 """Convert a date string to a time tuple.
784 Accounts for military timezones.
786 data
= string
.split(data
)
787 if data
[0][-1] in (',', '.') or string
.lower(data
[0]) in _daynames
:
788 # There's a dayname here. Skip it
790 if len(data
) == 3: # RFC 850 date, deprecated
791 stuff
= string
.split(data
[0], '-')
793 data
= stuff
+ data
[1:]
796 i
= string
.find(s
, '+')
798 data
[3:] = [s
[:i
], s
[i
+1:]]
800 data
.append('') # Dummy tz
804 [dd
, mm
, yy
, tm
, tz
] = data
805 mm
= string
.lower(mm
)
806 if not mm
in _monthnames
:
807 dd
, mm
= mm
, string
.lower(dd
)
808 if not mm
in _monthnames
:
810 mm
= _monthnames
.index(mm
)+1
813 i
= string
.find(yy
, ':')
818 if yy
[0] not in string
.digits
:
822 tm
= string
.splitfields(tm
, ':')
833 thh
= string
.atoi(thh
)
834 tmm
= string
.atoi(tmm
)
835 tss
= string
.atoi(tss
)
836 except string
.atoi_error
:
840 if _timezones
.has_key(tz
):
841 tzoffset
=_timezones
[tz
]
844 tzoffset
=string
.atoi(tz
)
845 except string
.atoi_error
:
847 # Convert a timezone offset into seconds ; -0500 -> -18000
854 tzoffset
= tzsign
* ( (tzoffset
/100)*3600 + (tzoffset
% 100)*60)
855 tuple = (yy
, mm
, dd
, thh
, tmm
, tss
, 0, 0, 0, tzoffset
)
860 """Convert a time string to a time tuple."""
862 if type(t
)==type( () ):
868 """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
870 # No zone info, so localtime is better assumption than GMT
871 return time
.mktime(data
[:8] + (-1,))
873 t
= time
.mktime(data
[:8] + (0,))
874 return t
- data
[9] - time
.timezone
877 # When used as script, run a small test program.
878 # The first command line argument must be a filename containing one
879 # message in RFC-822 format.
881 if __name__
== '__main__':
883 file = os
.path
.join(os
.environ
['HOME'], 'Mail/inbox/1')
884 if sys
.argv
[1:]: file = sys
.argv
[1]
887 print 'From:', m
.getaddr('from')
888 print 'To:', m
.getaddrlist('to')
889 print 'Subject:', m
.getheader('subject')
890 print 'Date:', m
.getheader('date')
891 date
= m
.getdate_tz('date')
893 print 'ParsedDate:', time
.asctime(date
[:-1]),
895 hhmm
, ss
= divmod(hhmmss
, 60)
896 hh
, mm
= divmod(hhmm
, 60)
897 print "%+03d%02d" % (hh
, mm
),
898 if ss
: print ".%02d" % ss
,
901 print 'ParsedDate:', None
908 print 'len =', len(m
)
909 if m
.has_key('Date'): print 'Date =', m
['Date']
910 if m
.has_key('X-Nonsense'): pass
911 print 'keys =', m
.keys()
912 print 'values =', m
.values()
913 print 'items =', m
.items()