Doc/lib/emailcharsets.tex

   1 \declaremodule{standard}{email.Charset}
   2 \modulesynopsis{Character Sets}
   3
   4 This module provides a class \class{Charset} for representing
   5 character sets and character set conversions in email messages, as
   6 well as a character set registry and several convenience methods for
   7 manipulating this registry.  Instances of \class{Charset} are used in
   8 several other modules within the \module{email} package.
   9
  10 \versionadded{2.2.2}
  11
  12 \begin{classdesc}{Charset}{\optional{input_charset}}
  13 Map character sets to their email properties.
  14
  15 This class provides information about the requirements imposed on
  16 email for a specific character set.  It also provides convenience
  17 routines for converting between character sets, given the availability
  18 of the applicable codecs.  Given a character set, it will do its best
  19 to provide information on how to use that character set in an email
  20 message in an RFC-compliant way.
  21
  22 Certain character sets must be encoded with quoted-printable or base64
  23 when used in email headers or bodies.  Certain character sets must be
  24 converted outright, and are not allowed in email.
  25
  26 Optional \var{input_charset} is as described below; it is always
  27 coerced to lower case.  After being alias normalized it is also used
  28 as a lookup into the registry of character sets to find out the header
  29 encoding, body encoding, and output conversion codec to be used for
  30 the character set.  For example, if
  31 \var{input_charset} is \code{iso-8859-1}, then headers and bodies will
  32 be encoded using quoted-printable and no output conversion codec is
  33 necessary.  If \var{input_charset} is \code{euc-jp}, then headers will
  34 be encoded with base64, bodies will not be encoded, but output text
  35 will be converted from the \code{euc-jp} character set to the
  36 \code{iso-2022-jp} character set.
  37 \end{classdesc}
  38
  39 \class{Charset} instances have the following data attributes:
  40
  41 \begin{datadesc}{input_charset}
  42 The initial character set specified.  Common aliases are converted to
  43 their \emph{official} email names (e.g.\ \code{latin_1} is converted to
  44 \code{iso-8859-1}).  Defaults to 7-bit \code{us-ascii}.
  45 \end{datadesc}
  46
  47 \begin{datadesc}{header_encoding}
  48 If the character set must be encoded before it can be used in an
  49 email header, this attribute will be set to \code{Charset.QP} (for
  50 quoted-printable), \code{Charset.BASE64} (for base64 encoding), or
  51 \code{Charset.SHORTEST} for the shortest of QP or BASE64 encoding.
  52 Otherwise, it will be \code{None}.
  53 \end{datadesc}
  54
  55 \begin{datadesc}{body_encoding}
  56 Same as \var{header_encoding}, but describes the encoding for the
  57 mail message's body, which indeed may be different from the header
  58 encoding.  \code{Charset.SHORTEST} is not allowed for
  59 \var{body_encoding}.
  60 \end{datadesc}
  61
  62 \begin{datadesc}{output_charset}
  63 Some character sets must be converted before they can be used in
  64 email headers or bodies.  If the \var{input_charset} is one of
  65 them, this attribute will contain the name of the character set
  66 output will be converted to.  Otherwise, it will be \code{None}.
  67 \end{datadesc}
  68
  69 \begin{datadesc}{input_codec}
  70 The name of the Python codec used to convert the \var{input_charset} to
  71 Unicode.  If no conversion codec is necessary, this attribute will be
  72 \code{None}.
  73 \end{datadesc}
  74
  75 \begin{datadesc}{output_codec}
  76 The name of the Python codec used to convert Unicode to the
  77 \var{output_charset}.  If no conversion codec is necessary, this
  78 attribute will have the same value as the \var{input_codec}.
  79 \end{datadesc}
  80
  81 \class{Charset} instances also have the following methods:
  82
  83 \begin{methoddesc}[Charset]{get_body_encoding}{}
  84 Return the content transfer encoding used for body encoding.
  85
  86 This is either the string \samp{quoted-printable} or \samp{base64}
  87 depending on the encoding used, or it is a function, in which case you
  88 should call the function with a single argument, the Message object
  89 being encoded.  The function should then set the
  90 \mailheader{Content-Transfer-Encoding} header itself to whatever is
  91 appropriate.
  92
  93 Returns the string \samp{quoted-printable} if
  94 \var{body_encoding} is \code{QP}, returns the string
  95 \samp{base64} if \var{body_encoding} is \code{BASE64}, and returns the
  96 string \samp{7bit} otherwise.
  97 \end{methoddesc}
  98
  99 \begin{methoddesc}{convert}{s}
 100 Convert the string \var{s} from the \var{input_codec} to the
 101 \var{output_codec}.
 102 \end{methoddesc}
 103
 104 \begin{methoddesc}{to_splittable}{s}
 105 Convert a possibly multibyte string to a safely splittable format.
 106 \var{s} is the string to split.
 107
 108 Uses the \var{input_codec} to try and convert the string to Unicode,
 109 so it can be safely split on character boundaries (even for multibyte
 110 characters).
 111
 112 Returns the string as-is if it isn't known how to convert \var{s} to
 113 Unicode with the \var{input_charset}.
 114
 115 Characters that could not be converted to Unicode will be replaced
 116 with the Unicode replacement character \character{U+FFFD}.
 117 \end{methoddesc}
 118
 119 \begin{methoddesc}{from_splittable}{ustr\optional{, to_output}}
 120 Convert a splittable string back into an encoded string.  \var{ustr}
 121 is a Unicode string to ``unsplit''.
 122
 123 This method uses the proper codec to try and convert the string from
 124 Unicode back into an encoded format.  Return the string as-is if it is
 125 not Unicode, or if it could not be converted from Unicode.
 126
 127 Characters that could not be converted from Unicode will be replaced
 128 with an appropriate character (usually \character{?}).
 129
 130 If \var{to_output} is \code{True} (the default), uses
 131 \var{output_codec} to convert to an
 132 encoded format.  If \var{to_output} is \code{False}, it uses
 133 \var{input_codec}.
 134 \end{methoddesc}
 135
 136 \begin{methoddesc}{get_output_charset}{}
 137 Return the output character set.
 138
 139 This is the \var{output_charset} attribute if that is not \code{None},
 140 otherwise it is \var{input_charset}.
 141 \end{methoddesc}
 142
 143 \begin{methoddesc}{encoded_header_len}{s}
 144 Return the length of the string \var{s} once header-encoded, properly
 145 calculating for quoted-printable or base64 encoding.
 146 \end{methoddesc}
 147
 148 \begin{methoddesc}{header_encode}{s\optional{, convert}}
 149 Header-encode the string \var{s}.
 150
 151 If \var{convert} is \code{True}, the string will be converted from the
 152 input charset to the output charset automatically.  This is not useful
 153 for multibyte character sets, which have line length issues (multibyte
 154 characters must be split on a character, not a byte boundary); use the
 155 higher-level \class{Header} class to deal with these issues (see
 156 \refmodule{email.Header}).  \var{convert} defaults to \code{False}.
 157
 158 The type of encoding (base64 or quoted-printable) will be based on
 159 the \var{header_encoding} attribute.
 160 \end{methoddesc}
 161
 162 \begin{methoddesc}{body_encode}{s\optional{, convert}}
 163 Body-encode the string \var{s}.
 164
 165 If \var{convert} is \code{True} (the default), the string will be
 166 converted from the input charset to output charset automatically.
 167 Unlike \method{header_encode()}, there are no issues with byte
 168 boundaries and multibyte charsets in email bodies, so this is usually
 169 pretty safe.
 170
 171 The type of encoding (base64 or quoted-printable) will be based on
 172 the \var{body_encoding} attribute.
 173 \end{methoddesc}
 174
 175 The \class{Charset} class also provides a number of methods to support
 176 standard operations and built-in functions.
 177
 178 \begin{methoddesc}[Charset]{__str__}{}
 179 Returns \var{input_charset} as a string coerced to lower case.
 180 \end{methoddesc}
 181
 182 \begin{methoddesc}[Charset]{__eq__}{other}
 183 This method allows you to compare two \class{Charset} instances for equality.
 184 \end{methoddesc}
 185
 186 \begin{methoddesc}[Charset]{__ne__}{other}
 187 This method allows you to compare two \class{Charset} instances for inequality.
 188 \end{methoddesc}
 189
 190 The \module{email.Charset} module also provides the following
 191 functions for adding new entries to the global character set, alias,
 192 and codec registries:
 193
 194 \begin{funcdesc}{add_charset}{charset\optional{, header_enc\optional{,
 195     body_enc\optional{, output_charset}}}}
 196 Add character set properties to the global registry.
 197
 198 \var{charset} is the input character set, and must be the canonical
 199 name of a character set.
 200
 201 Optional \var{header_enc} and \var{body_enc} are each one of
 202 \code{Charset.QP} for quoted-printable, \code{Charset.BASE64} for
 203 base64 encoding, \code{Charset.SHORTEST} for the shortest of
 204 quoted-printable or base64 encoding, or \code{None} for no encoding.
 205 \code{SHORTEST} is only valid for \var{header_enc}. The default is
 206 \code{None} for no encoding.
 207
 208 Optional \var{output_charset} is the character set that the output
 209 should be in.  Conversions will proceed from input charset, to
 210 Unicode, to the output charset when the method
 211 \method{Charset.convert()} is called.  The default is to output in the
 212 same character set as the input.
 213
 214 Both \var{charset} and \var{output_charset} must have Unicode
 215 codec entries in the module's character set-to-codec mapping; use
 216 \function{add_codec()} to add codecs the module does
 217 not know about.  See the \refmodule{codecs} module's documentation for
 218 more information.
 219
 220 The global character set registry is kept in the module global
 221 dictionary \code{CHARSETS}.
 222 \end{funcdesc}
 223
 224 \begin{funcdesc}{add_alias}{alias, canonical}
 225 Add a character set alias.  \var{alias} is the alias name,
 226 e.g.\ \code{latin-1}.  \var{canonical} is the character set's canonical
 227 name, e.g.\ \code{iso-8859-1}.
 228
 229 The global charset alias registry is kept in the module global
 230 dictionary \code{ALIASES}.
 231 \end{funcdesc}
 232
 233 \begin{funcdesc}{add_codec}{charset, codecname}
 234 Add a codec that maps characters in the given character set to and from
 235 Unicode.
 236
 237 \var{charset} is the canonical name of a character set.
 238 \var{codecname} is the name of a Python codec, as appropriate for the
 239 second argument to the \function{unicode()} built-in, or to the
 240 \method{encode()} method of a Unicode string.
 241 \end{funcdesc}