- Got rid of newmodule.c
[python/dscho.git] / Doc / lib / librfc822.tex
blob6451b3e932870fd554bfe6a784b45ceb0c62ad7e
1 \section{\module{rfc822} ---
2 Parse RFC 2822 mail headers}
4 \declaremodule{standard}{rfc822}
5 \modulesynopsis{Parse \rfc{2822} style mail messages.}
7 This module defines a class, \class{Message}, which represents an
8 ``email message'' as defined by the Internet standard
9 \rfc{2822}.\footnote{This module originally conformed to \rfc{822},
10 hence the name. Since then, \rfc{2822} has been released as an
11 update to \rfc{822}. This module should be considered
12 \rfc{2822}-conformant, especially in cases where the
13 syntax or semantics have changed since \rfc{822}.} Such messages
14 consist of a collection of message headers, and a message body. This
15 module also defines a helper class
16 \class{AddressList} for parsing \rfc{2822} addresses. Please refer to
17 the RFC for information on the specific syntax of \rfc{2822} messages.
19 The \refmodule{mailbox}\refstmodindex{mailbox} module provides classes
20 to read mailboxes produced by various end-user mail programs.
22 \begin{classdesc}{Message}{file\optional{, seekable}}
23 A \class{Message} instance is instantiated with an input object as
24 parameter. \class{Message} relies only on the input object having a
25 \method{readline()} method; in particular, ordinary file objects
26 qualify. Instantiation reads headers from the input object up to a
27 delimiter line (normally a blank line) and stores them in the
28 instance. The message body, following the headers, is not consumed.
30 This class can work with any input object that supports a
31 \method{readline()} method. If the input object has seek and tell
32 capability, the \method{rewindbody()} method will work; also, illegal
33 lines will be pushed back onto the input stream. If the input object
34 lacks seek but has an \method{unread()} method that can push back a
35 line of input, \class{Message} will use that to push back illegal
36 lines. Thus this class can be used to parse messages coming from a
37 buffered stream.
39 The optional \var{seekable} argument is provided as a workaround for
40 certain stdio libraries in which \cfunction{tell()} discards buffered
41 data before discovering that the \cfunction{lseek()} system call
42 doesn't work. For maximum portability, you should set the \var{seekable}
43 argument to zero to prevent that initial \method{tell()} when passing
44 in an unseekable object such as a file object created from a socket
45 object.
47 Input lines as read from the file may either be terminated by CR-LF or
48 by a single linefeed; a terminating CR-LF is replaced by a single
49 linefeed before the line is stored.
51 All header matching is done independent of upper or lower case;
52 e.g.\ \code{\var{m}['From']}, \code{\var{m}['from']} and
53 \code{\var{m}['FROM']} all yield the same result.
54 \end{classdesc}
56 \begin{classdesc}{AddressList}{field}
57 You may instantiate the \class{AddressList} helper class using a single
58 string parameter, a comma-separated list of \rfc{2822} addresses to be
59 parsed. (The parameter \code{None} yields an empty list.)
60 \end{classdesc}
62 \begin{funcdesc}{quote}{str}
63 Return a new string with backslashes in \var{str} replaced by two
64 backslashes and double quotes replaced by backslash-double quote.
65 \end{funcdesc}
67 \begin{funcdesc}{unquote}{str}
68 Return a new string which is an \emph{unquoted} version of \var{str}.
69 If \var{str} ends and begins with double quotes, they are stripped
70 off. Likewise if \var{str} ends and begins with angle brackets, they
71 are stripped off.
72 \end{funcdesc}
74 \begin{funcdesc}{parseaddr}{address}
75 Parse \var{address}, which should be the value of some
76 address-containing field such as \mailheader{To} or \mailheader{Cc},
77 into its constituent ``realname'' and ``email address'' parts.
78 Returns a tuple of that information, unless the parse fails, in which
79 case a 2-tuple \code{(None, None)} is returned.
80 \end{funcdesc}
82 \begin{funcdesc}{dump_address_pair}{pair}
83 The inverse of \function{parseaddr()}, this takes a 2-tuple of the form
84 \code{(\var{realname}, \var{email_address})} and returns the string
85 value suitable for a \mailheader{To} or \mailheader{Cc} header. If
86 the first element of \var{pair} is false, then the second element is
87 returned unmodified.
88 \end{funcdesc}
90 \begin{funcdesc}{parsedate}{date}
91 Attempts to parse a date according to the rules in \rfc{2822}.
92 However, some mailers don't follow that format as specified, so
93 \function{parsedate()} tries to guess correctly in such cases.
94 \var{date} is a string containing an \rfc{2822} date, such as
95 \code{'Mon, 20 Nov 1995 19:12:08 -0500'}. If it succeeds in parsing
96 the date, \function{parsedate()} returns a 9-tuple that can be passed
97 directly to \function{time.mktime()}; otherwise \code{None} will be
98 returned. Note that fields 6, 7, and 8 of the result tuple are not
99 usable.
100 \end{funcdesc}
102 \begin{funcdesc}{parsedate_tz}{date}
103 Performs the same function as \function{parsedate()}, but returns
104 either \code{None} or a 10-tuple; the first 9 elements make up a tuple
105 that can be passed directly to \function{time.mktime()}, and the tenth
106 is the offset of the date's timezone from UTC (which is the official
107 term for Greenwich Mean Time). (Note that the sign of the timezone
108 offset is the opposite of the sign of the \code{time.timezone}
109 variable for the same timezone; the latter variable follows the
110 \POSIX{} standard while this module follows \rfc{2822}.) If the input
111 string has no timezone, the last element of the tuple returned is
112 \code{None}. Note that fields 6, 7, and 8 of the result tuple are not
113 usable.
114 \end{funcdesc}
116 \begin{funcdesc}{mktime_tz}{tuple}
117 Turn a 10-tuple as returned by \function{parsedate_tz()} into a UTC
118 timestamp. If the timezone item in the tuple is \code{None}, assume
119 local time. Minor deficiency: this first interprets the first 8
120 elements as a local time and then compensates for the timezone
121 difference; this may yield a slight error around daylight saving time
122 switch dates. Not enough to worry about for common use.
123 \end{funcdesc}
126 \begin{seealso}
127 \seemodule{mailbox}{Classes to read various mailbox formats produced
128 by end-user mail programs.}
129 \seemodule{mimetools}{Subclass of \class{rfc822.Message} that handles MIME encoded
130 messages.}
131 \end{seealso}
134 \subsection{Message Objects \label{message-objects}}
136 A \class{Message} instance has the following methods:
138 \begin{methoddesc}{rewindbody}{}
139 Seek to the start of the message body. This only works if the file
140 object is seekable.
141 \end{methoddesc}
143 \begin{methoddesc}{isheader}{line}
144 Returns a line's canonicalized fieldname (the dictionary key that will
145 be used to index it) if the line is a legal \rfc{2822} header; otherwise
146 returns \code{None} (implying that parsing should stop here and the
147 line be pushed back on the input stream). It is sometimes useful to
148 override this method in a subclass.
149 \end{methoddesc}
151 \begin{methoddesc}{islast}{line}
152 Return true if the given line is a delimiter on which \class{Message} should
153 stop. The delimiter line is consumed, and the file object's read
154 location positioned immediately after it. By default this method just
155 checks that the line is blank, but you can override it in a subclass.
156 \end{methoddesc}
158 \begin{methoddesc}{iscomment}{line}
159 Return \code{True} if the given line should be ignored entirely, just skipped.
160 By default this is a stub that always returns \code{False}, but you can
161 override it in a subclass.
162 \end{methoddesc}
164 \begin{methoddesc}{getallmatchingheaders}{name}
165 Return a list of lines consisting of all headers matching
166 \var{name}, if any. Each physical line, whether it is a continuation
167 line or not, is a separate list item. Return the empty list if no
168 header matches \var{name}.
169 \end{methoddesc}
171 \begin{methoddesc}{getfirstmatchingheader}{name}
172 Return a list of lines comprising the first header matching
173 \var{name}, and its continuation line(s), if any. Return
174 \code{None} if there is no header matching \var{name}.
175 \end{methoddesc}
177 \begin{methoddesc}{getrawheader}{name}
178 Return a single string consisting of the text after the colon in the
179 first header matching \var{name}. This includes leading whitespace,
180 the trailing linefeed, and internal linefeeds and whitespace if
181 any continuation line(s) were present. Return \code{None} if there is
182 no header matching \var{name}.
183 \end{methoddesc}
185 \begin{methoddesc}{getheader}{name\optional{, default}}
186 Like \code{getrawheader(\var{name})}, but strip leading and trailing
187 whitespace. Internal whitespace is not stripped. The optional
188 \var{default} argument can be used to specify a different default to
189 be returned when there is no header matching \var{name}.
190 \end{methoddesc}
192 \begin{methoddesc}{get}{name\optional{, default}}
193 An alias for \method{getheader()}, to make the interface more compatible
194 with regular dictionaries.
195 \end{methoddesc}
197 \begin{methoddesc}{getaddr}{name}
198 Return a pair \code{(\var{full name}, \var{email address})} parsed
199 from the string returned by \code{getheader(\var{name})}. If no
200 header matching \var{name} exists, return \code{(None, None)};
201 otherwise both the full name and the address are (possibly empty)
202 strings.
204 Example: If \var{m}'s first \mailheader{From} header contains the
205 string \code{'jack@cwi.nl (Jack Jansen)'}, then
206 \code{m.getaddr('From')} will yield the pair
207 \code{('Jack Jansen', 'jack@cwi.nl')}.
208 If the header contained
209 \code{'Jack Jansen <jack@cwi.nl>'} instead, it would yield the
210 exact same result.
211 \end{methoddesc}
213 \begin{methoddesc}{getaddrlist}{name}
214 This is similar to \code{getaddr(\var{name})}, but parses a header
215 containing a list of email addresses (e.g.\ a \mailheader{To} header) and
216 returns a list of \code{(\var{full name}, \var{email address})} pairs
217 (even if there was only one address in the header). If there is no
218 header matching \var{name}, return an empty list.
220 If multiple headers exist that match the named header (e.g.\ if there
221 are several \mailheader{Cc} headers), all are parsed for addresses.
222 Any continuation lines the named headers contain are also parsed.
223 \end{methoddesc}
225 \begin{methoddesc}{getdate}{name}
226 Retrieve a header using \method{getheader()} and parse it into a 9-tuple
227 compatible with \function{time.mktime()}; note that fields 6, 7, and 8
228 are not usable. If there is no header matching
229 \var{name}, or it is unparsable, return \code{None}.
231 Date parsing appears to be a black art, and not all mailers adhere to
232 the standard. While it has been tested and found correct on a large
233 collection of email from many sources, it is still possible that this
234 function may occasionally yield an incorrect result.
235 \end{methoddesc}
237 \begin{methoddesc}{getdate_tz}{name}
238 Retrieve a header using \method{getheader()} and parse it into a
239 10-tuple; the first 9 elements will make a tuple compatible with
240 \function{time.mktime()}, and the 10th is a number giving the offset
241 of the date's timezone from UTC. Note that fields 6, 7, and 8
242 are not usable. Similarly to \method{getdate()}, if
243 there is no header matching \var{name}, or it is unparsable, return
244 \code{None}.
245 \end{methoddesc}
247 \class{Message} instances also support a limited mapping interface.
248 In particular: \code{\var{m}[name]} is like
249 \code{\var{m}.getheader(name)} but raises \exception{KeyError} if
250 there is no matching header; and \code{len(\var{m})},
251 \code{\var{m}.get(name\optional{, default})},
252 \code{\var{m}.has_key(name)}, \code{\var{m}.keys()},
253 \code{\var{m}.values()}, \code{\var{m}.items()}, and
254 \code{\var{m}.setdefault(name\optional{, default})} act as expected,
255 with the one difference that \method{get()} and \method{setdefault()}
256 use an empty string as the default value. \class{Message} instances
257 also support the mapping writable interface \code{\var{m}[name] =
258 value} and \code{del \var{m}[name]}. \class{Message} objects do not
259 support the \method{clear()}, \method{copy()}, \method{popitem()}, or
260 \method{update()} methods of the mapping interface. (Support for
261 \method{get()} and \method{setdefault()} was only added in Python
262 2.2.)
264 Finally, \class{Message} instances have some public instance variables:
266 \begin{memberdesc}{headers}
267 A list containing the entire set of header lines, in the order in
268 which they were read (except that setitem calls may disturb this
269 order). Each line contains a trailing newline. The
270 blank line terminating the headers is not contained in the list.
271 \end{memberdesc}
273 \begin{memberdesc}{fp}
274 The file or file-like object passed at instantiation time. This can
275 be used to read the message content.
276 \end{memberdesc}
278 \begin{memberdesc}{unixfrom}
279 The \UNIX{} \samp{From~} line, if the message had one, or an empty
280 string. This is needed to regenerate the message in some contexts,
281 such as an \code{mbox}-style mailbox file.
282 \end{memberdesc}
285 \subsection{AddressList Objects \label{addresslist-objects}}
287 An \class{AddressList} instance has the following methods:
289 \begin{methoddesc}{__len__}{}
290 Return the number of addresses in the address list.
291 \end{methoddesc}
293 \begin{methoddesc}{__str__}{}
294 Return a canonicalized string representation of the address list.
295 Addresses are rendered in \code{"name" <host@domain>} form, comma-separated.
296 \end{methoddesc}
298 \begin{methoddesc}{__add__}{alist}
299 Return a new \class{AddressList} instance that contains all addresses
300 in both \class{AddressList} operands, with duplicates removed (set
301 union).
302 \end{methoddesc}
304 \begin{methoddesc}{__iadd__}{alist}
305 In-place version of \method{__add__()}; turns this \class{AddressList}
306 instance into the union of itself and the right-hand instance,
307 \var{alist}.
308 \end{methoddesc}
310 \begin{methoddesc}{__sub__}{alist}
311 Return a new \class{AddressList} instance that contains every address
312 in the left-hand \class{AddressList} operand that is not present in
313 the right-hand address operand (set difference).
314 \end{methoddesc}
316 \begin{methoddesc}{__isub__}{alist}
317 In-place version of \method{__sub__()}, removing addresses in this
318 list which are also in \var{alist}.
319 \end{methoddesc}
322 Finally, \class{AddressList} instances have one public instance variable:
324 \begin{memberdesc}{addresslist}
325 A list of 2-tuples of strings, one per address. In each member, the
326 first is the canonicalized name part, the second is the
327 actual route-address (\character{@}-separated username-host.domain
328 pair).
329 \end{memberdesc}