1 # RFC-822 message manipulation class.
3 # XXX This is only a very rough sketch of a full RFC-822 parser;
4 # in particular the tokenizing of addresses does not adhere to all the
9 # To create a Message object: first open a file, e.g.:
10 # fp = open(file, 'r')
11 # (or use any other legal way of getting an open file object, e.g. use
12 # sys.stdin or call os.popen()).
13 # Then pass the open file object to the Message() constructor:
16 # To get the text of a particular header there are several methods:
17 # str = m.getheader(name)
18 # str = m.getrawheader(name)
19 # where name is the name of the header, e.g. 'Subject'.
20 # The difference is that getheader() strips the leading and trailing
21 # whitespace, while getrawheader() doesn't. Both functions retain
22 # embedded whitespace (including newlines) exactly as they are
23 # specified in the header, and leave the case of the text unchanged.
25 # For addresses and address lists there are functions
26 # realname, mailaddress = m.getaddr(name) and
27 # list = m.getaddrlist(name)
28 # where the latter returns a list of (realname, mailaddr) tuples.
30 # There is also a method
31 # time = m.getdate(name)
32 # which parses a Date-like field and returns a time-compatible tuple,
33 # i.e. a tuple such as returned by time.localtime() or accepted by
36 # See the class definition for lower level access methods.
38 # There are also some utility functions here.
46 _blanklines
= ('\r\n', '\n') # Optimization for islast()
51 # Initialize the class instance and read the headers.
53 def __init__(self
, fp
, seekable
= 1):
55 self
.seekable
= seekable
56 self
.startofheaders
= None
57 self
.startofbody
= None
61 self
.startofheaders
= self
.fp
.tell()
69 self
.startofbody
= self
.fp
.tell()
74 # Rewind the file to the start of the body (if seekable).
78 raise IOError, "unseekable file"
79 self
.fp
.seek(self
.startofbody
)
82 # Read header lines up to the entirely blank line that
83 # terminates them. The (normally blank) line that ends the
84 # headers is skipped, but not included in the returned list.
85 # If a non-header line ends the headers, (which is an error),
86 # an attempt is made to backspace over it; it is never
87 # included in the returned list.
89 # The variable self.status is set to the empty string if all
90 # went well, otherwise it is an error message.
91 # The variable self.headers is a completely uninterpreted list
92 # of lines contained in the header (so printing them will
93 # reproduce the header exactly as it appears in the file).
95 def readheaders(self
):
97 self
.headers
= list = []
102 line
= self
.fp
.readline()
104 self
.status
= 'EOF in headers'
106 # Skip unix From name time lines
107 if firstline
and line
[:5] == 'From ':
108 self
.unixfrom
= self
.unixfrom
+ line
111 if self
.islast(line
):
113 elif headerseen
and line
[0] in ' \t':
114 # It's a continuation line.
116 elif regex
.match('^[!-9;-~]+:', line
) >= 0:
117 # It's a header line.
121 # It's not a header line; stop here.
123 self
.status
= 'No headers'
125 self
.status
= 'Bad header'
126 # Try to undo the read.
128 self
.fp
.seek(-len(line
), 1)
131 self
.status
+ '; bad seek'
135 # Method to determine whether a line is a legal end of
136 # RFC-822 headers. You may override this method if your
137 # application wants to bend the rules, e.g. to strip trailing
138 # whitespace, or to recognise MH template separators
139 # ('--------'). For convenience (e.g. for code reading from
140 # sockets) a line consisting of \r\n also matches.
142 def islast(self
, line
):
143 return line
in _blanklines
146 # Look through the list of headers and find all lines matching
147 # a given header name (and their continuation lines).
148 # A list of the lines is returned, without interpretation.
149 # If the header does not occur, an empty list is returned.
150 # If the header occurs multiple times, all occurrences are
151 # returned. Case is not important in the header name.
153 def getallmatchingheaders(self
, name
):
154 name
= string
.lower(name
) + ':'
158 for line
in self
.headers
:
159 if string
.lower(line
[:n
]) == name
:
161 elif line
[:1] not in string
.whitespace
:
168 # Similar, but return only the first matching header (and its
169 # continuation lines).
171 def getfirstmatchingheader(self
, name
):
172 name
= string
.lower(name
) + ':'
176 for line
in self
.headers
:
178 if line
[:1] not in string
.whitespace
:
180 elif string
.lower(line
[:n
]) == name
:
187 # A higher-level interface to getfirstmatchingheader().
188 # Return a string containing the literal text of the header
189 # but with the keyword stripped. All leading, trailing and
190 # embedded whitespace is kept in the string, however.
191 # Return None if the header does not occur.
193 def getrawheader(self
, name
):
194 list = self
.getfirstmatchingheader(name
)
197 list[0] = list[0][len(name
) + 1:]
198 return string
.joinfields(list, '')
201 # Going one step further: also strip leading and trailing
204 def getheader(self
, name
):
205 text
= self
.getrawheader(name
)
208 return string
.strip(text
)
211 # Retrieve a single address from a header as a tuple, e.g.
212 # ('Guido van Rossum', 'guido@cwi.nl').
214 def getaddr(self
, name
):
215 data
= self
.getheader(name
)
218 return parseaddr(data
)
220 # Retrieve a list of addresses from a header, where each
221 # address is a tuple as returned by getaddr().
223 def getaddrlist(self
, name
):
224 # XXX This function is not really correct. The split
225 # on ',' might fail in the case of commas within
227 data
= self
.getheader(name
)
230 data
= string
.splitfields(data
, ',')
231 for i
in range(len(data
)):
232 data
[i
] = parseaddr(data
[i
])
235 # Retrieve a date field from a header as a tuple compatible
236 # with time.mktime().
238 def getdate(self
, name
):
239 data
= self
.getheader(name
)
242 return parsedate(data
)
245 # Access as a dictionary (only finds first header of each type):
249 for line
in self
.headers
:
250 if line
[0] in string
.whitespace
: continue
251 i
= string
.find(line
, ':')
253 name
= string
.lower(line
[:i
])
257 def __getitem__(self
, name
):
258 value
= self
.getheader(name
)
259 if value
is None: raise KeyError, name
262 def has_key(self
, name
):
263 value
= self
.getheader(name
)
264 return value
is not None
268 for line
in self
.headers
:
269 if line
[0] in string
.whitespace
: continue
270 i
= string
.find(line
, ':')
273 key
= string
.lower(name
)
275 return types
.values()
279 for name
in self
.keys():
280 values
.append(self
[name
])
285 for name
in self
.keys():
286 items
.append(name
, self
[name
])
294 # XXX Should fix these to be really conformant.
295 # XXX The inverses of the parse functions may also be useful.
298 # Remove quotes from a string.
302 if str[0] == '"' and str[-1:] == '"':
304 if str[0] == '<' and str[-1:] == '>':
309 # Parse an address into (name, address) tuple
311 def parseaddr(address
):
312 # This is probably not perfect
313 address
= string
.strip(address
)
314 # Case 1: part of the address is in <xx@xx> form.
315 pos
= regex
.search('<.*>', address
)
318 address
= address
[pos
:]
319 length
= regex
.match('<.*>', address
)
320 name
= name
+ address
[length
:]
321 address
= address
[:length
]
323 # Case 2: part of the address is in (comment) form
324 pos
= regex
.search('(.*)', address
)
327 address
= address
[:pos
]
328 length
= regex
.match('(.*)', name
)
329 address
= address
+ name
[length
:]
332 # Case 3: neither. Only an address
334 name
= string
.strip(name
)
335 address
= string
.strip(address
)
336 if address
and address
[0] == '<' and address
[-1] == '>':
337 address
= address
[1:-1]
338 if name
and name
[0] == '(' and name
[-1] == ')':
345 _monthnames
= ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
346 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
349 # XXX This still mostly ignores timezone matters at the moment...
350 data
= string
.split(data
)
351 if data
[0][-1] == ',':
352 # There's a dayname here. Skip it
356 i
= string
.find(s
, '+')
358 data
[3:] = [s
[:i
], s
[i
+1:]]
360 data
.append('') # Dummy tz
364 [dd
, mm
, yy
, tm
, tz
] = data
365 if not mm
in _monthnames
:
367 mm
= _monthnames
.index(mm
)+1
368 tm
= string
.splitfields(tm
, ':')
377 thh
= string
.atoi(thh
)
378 tmm
= string
.atoi(tmm
)
379 tss
= string
.atoi(tss
)
380 except string
.atoi_error
:
382 tuple = (yy
, mm
, dd
, thh
, tmm
, tss
, 0, 0, 0)
386 # When used as script, run a small test program.
387 # The first command line argument must be a filename containing one
388 # message in RFC-822 format.
390 if __name__
== '__main__':
392 file = '/ufs/guido/Mail/drafts/,1'
393 if sys
.argv
[1:]: file = sys
.argv
[1]
396 print 'From:', m
.getaddr('from')
397 print 'To:', m
.getaddrlist('to')
398 print 'Subject:', m
.getheader('subject')
399 print 'Date:', m
.getheader('date')
400 date
= m
.getdate('date')
402 print 'ParsedDate:', time
.asctime(date
)
404 print 'ParsedDate:', None
411 print 'len =', len(m
)
412 if m
.has_key('Date'): print 'Date =', m
['Date']
413 if m
.has_key('X-Nonsense'): pass
414 print 'keys =', m
.keys()
415 print 'values =', m
.values()
416 print 'items =', m
.items()