1 # RFC-822 message manipulation class.
# XXX This is only a very rough sketch of a full RFC-822 parser;
# in particular the tokenizing of addresses does not adhere to all the
# quoting rules of the RFC.
9 # To create a Message object: first open a file, e.g.:
10 # fp = open(file, 'r')
11 # (or use any other legal way of getting an open file object, e.g. use
12 # sys.stdin or call os.popen()).
# Then pass the open file object to the Message() constructor:
# m = Message(fp)
16 # To get the text of a particular header there are several methods:
17 # str = m.getheader(name)
18 # str = m.getrawheader(name)
19 # where name is the name of the header, e.g. 'Subject'.
20 # The difference is that getheader() strips the leading and trailing
21 # whitespace, while getrawheader() doesn't. Both functions retain
22 # embedded whitespace (including newlines) exactly as they are
23 # specified in the header, and leave the case of the text unchanged.
25 # For addresses and address lists there are functions
26 # realname, mailaddress = m.getaddr(name) and
27 # list = m.getaddrlist(name)
28 # where the latter returns a list of (realname, mailaddr) tuples.
30 # There is also a method
31 # time = m.getdate(name)
# which parses a Date-like field and returns a time-compatible tuple,
# i.e. a tuple such as returned by time.localtime() or accepted by
# time.mktime().
36 # See the class definition for lower level access methods.
38 # There are also some utility functions here.
48 # Initialize the class instance and read the headers.
50 def __init__(self
, fp
):
54 self
.startofheaders
= self
.fp
.tell()
56 self
.startofheaders
= None
61 self
.startofbody
= self
.fp
.tell()
63 self
.startofbody
= None
66 # Rewind the file to the start of the body (if seekable).
69 self
.fp
.seek(self
.startofbody
)
72 # Read header lines up to the entirely blank line that
73 # terminates them. The (normally blank) line that ends the
74 # headers is skipped, but not included in the returned list.
75 # If a non-header line ends the headers, (which is an error),
76 # an attempt is made to backspace over it; it is never
77 # included in the returned list.
79 # The variable self.status is set to the empty string if all
80 # went well, otherwise it is an error message.
81 # The variable self.headers is a completely uninterpreted list
82 # of lines contained in the header (so printing them will
83 # reproduce the header exactly as it appears in the file).
85 def readheaders(self
):
86 self
.headers
= list = []
90 line
= self
.fp
.readline()
92 self
.status
= 'EOF in headers'
96 elif headerseen
and line
[0] in ' \t':
97 # It's a continuation line.
99 elif regex
.match('^[!-9;-~]+:', line
) >= 0:
100 # It's a header line.
104 # It's not a header line; stop here.
106 self
.status
= 'No headers'
108 self
.status
= 'Bad header'
109 # Try to undo the read.
111 self
.fp
.seek(-len(line
), 1)
114 self
.status
+ '; bad seek'
# Decide whether a line is a legal terminator of the RFC-822
# headers.  Subclasses may override this method to bend the rules,
# e.g. to strip trailing whitespace, or to recognise MH template
# separators ('--------').  As a convenience for code that reads
# from sockets, a line consisting of '\r\n' is accepted as well
# as a plain '\n'.
def islast(self, line):
    return line in ('\n', '\r\n')
129 # Look through the list of headers and find all lines matching
130 # a given header name (and their continuation lines).
131 # A list of the lines is returned, without interpretation.
132 # If the header does not occur, an empty list is returned.
133 # If the header occurs multiple times, all occurrences are
134 # returned. Case is not important in the header name.
136 def getallmatchingheaders(self
, name
):
137 name
= string
.lower(name
) + ':'
141 for line
in self
.headers
:
142 if string
.lower(line
[:n
]) == name
:
144 elif line
[:1] not in string
.whitespace
:
151 # Similar, but return only the first matching header (and its
152 # continuation lines).
154 def getfirstmatchingheader(self
, name
):
155 name
= string
.lower(name
) + ':'
159 for line
in self
.headers
:
161 if line
[:1] not in string
.whitespace
:
163 elif string
.lower(line
[:n
]) == name
:
170 # A higher-level interface to getfirstmatchingheader().
171 # Return a string containing the literal text of the header
172 # but with the keyword stripped. All leading, trailing and
173 # embedded whitespace is kept in the string, however.
174 # Return None if the header does not occur.
176 def getrawheader(self
, name
):
177 list = self
.getfirstmatchingheader(name
)
180 list[0] = list[0][len(name
) + 1:]
181 return string
.joinfields(list, '')
# Going one step further: also strip leading and trailing
# whitespace.
187 def getheader(self
, name
):
188 text
= self
.getrawheader(name
)
191 return string
.strip(text
)
194 # Retrieve a single address from a header as a tuple, e.g.
195 # ('Guido van Rossum', 'guido@cwi.nl').
197 def getaddr(self
, name
):
198 data
= self
.getheader(name
)
201 return parseaddr(data
)
203 # Retrieve a list of addresses from a header, where each
204 # address is a tuple as returned by getaddr().
206 def getaddrlist(self
, name
):
207 # XXX This function is not really correct. The split
208 # on ',' might fail in the case of commas within
210 data
= self
.getheader(name
)
213 data
= string
.splitfields(data
, ',')
214 for i
in range(len(data
)):
215 data
[i
] = parseaddr(data
[i
])
218 # Retrieve a date field from a header as a tuple compatible
219 # with time.mktime().
221 def getdate(self
, name
):
222 data
= self
.getheader(name
)
225 return parsedate(data
)
228 # Access as a dictionary (only finds first header of each type):
232 for line
in self
.headers
:
233 if line
[0] in string
.whitespace
: continue
234 i
= string
.find(line
, ':')
236 name
= string
.lower(line
[:i
])
240 def __getitem__(self
, name
):
241 value
= self
.getheader(name
)
242 if value
is None: raise KeyError, name
def has_key(self, name):
    # Dictionary-style membership test: true exactly when
    # getheader() yields something other than None for this name
    # (so an empty-string header value still counts as present).
    return self.getheader(name) is not None
251 for line
in self
.headers
:
252 if line
[0] in string
.whitespace
: continue
253 i
= string
.find(line
, ':')
256 key
= string
.lower(name
)
258 return types
.values()
262 for name
in self
.keys():
263 values
.append(self
[name
])
268 for name
in self
.keys():
269 items
.append(name
, self
[name
])
277 # XXX Should fix these to be really conformant.
278 # XXX The inverses of the parse functions may also be useful.
281 # Remove quotes from a string.
285 if str[0] == '"' and str[-1:] == '"':
287 if str[0] == '<' and str[-1:] == '>':
292 # Parse an address into (name, address) tuple
294 def parseaddr(address
):
295 # This is probably not perfect
296 address
= string
.strip(address
)
297 # Case 1: part of the address is in <xx@xx> form.
298 pos
= regex
.search('<.*>', address
)
301 address
= address
[pos
:]
302 length
= regex
.match('<.*>', address
)
303 name
= name
+ address
[length
:]
304 address
= address
[:length
]
306 # Case 2: part of the address is in (comment) form
307 pos
= regex
.search('(.*)', address
)
310 address
= address
[:pos
]
311 length
= regex
.match('(.*)', name
)
312 address
= address
+ name
[length
:]
315 # Case 3: neither. Only an address
317 name
= string
.strip(name
)
318 address
= string
.strip(address
)
319 if address
and address
[0] == '<' and address
[-1] == '>':
320 address
= address
[1:-1]
321 if name
and name
[0] == '(' and name
[-1] == ')':
328 _monthnames
= ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
329 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
332 # XXX This still mostly ignores timezone matters at the moment...
333 data
= string
.split(data
)
334 if data
[0][-1] == ',':
335 # There's a dayname here. Skip it
339 i
= string
.find(s
, '+')
341 data
[3:] = [s
[:i
], s
[i
+1:]]
343 data
.append('') # Dummy tz
347 [dd
, mm
, yy
, tm
, tz
] = data
348 if not mm
in _monthnames
:
350 mm
= _monthnames
.index(mm
)+1
351 tm
= string
.splitfields(tm
, ':')
360 thh
= string
.atoi(thh
)
361 tmm
= string
.atoi(tmm
)
362 tss
= string
.atoi(tss
)
363 except string
.atoi_error
:
365 tuple = (yy
, mm
, dd
, thh
, tmm
, tss
, 0, 0, 0)
369 # When used as script, run a small test program.
370 # The first command line argument must be a filename containing one
371 # message in RFC-822 format.
373 if __name__
== '__main__':
375 file = '/ufs/guido/Mail/drafts/,1'
376 if sys
.argv
[1:]: file = sys
.argv
[1]
379 print 'From:', m
.getaddr('from')
380 print 'To:', m
.getaddrlist('to')
381 print 'Subject:', m
.getheader('subject')
382 print 'Date:', m
.getheader('date')
383 date
= m
.getdate('date')
385 print 'ParsedDate:', time
.asctime(date
)
387 print 'ParsedDate:', None
394 print 'len =', len(m
)
395 if m
.has_key('Date'): print 'Date =', m
['Date']
396 if m
.has_key('X-Nonsense'): pass
397 print 'keys =', m
.keys()
398 print 'values =', m
.values()
399 print 'items =', m
.items()