Lib/email/Header.py

   1 # Copyright (C) 2002 Python Software Foundation
   2 # Author: che@debian.org (Ben Gertzfield)
   3
   4 """Header encoding and decoding functionality."""
   5
   6 import re
   7 import email.quopriMIME
   8 import email.base64MIME
   9 from email.Charset import Charset
  10
  11 try:
  12     from email._compat22 import _floordiv
  13 except SyntaxError:
  14     # Python 2.1 spells integer division differently
  15     from email._compat21 import _floordiv
  16
# Line-ending strings.  Of these, only NLSPACE is referenced by the code in
# this module: Header.encode_chunks() joins the output lines with it.
CRLFSPACE = '\r\n '
CRLF = '\r\n'
NLSPACE = '\n '

# Default maximum header line length, used by Header when no explicit
# maxlinelen is given and as the basis for Header.guess_maxlinelen().
MAXLINELEN = 76

# NOTE(review): ENCODE and DECODE are not referenced anywhere in the visible
# part of this module -- presumably kept for external callers; confirm.
ENCODE = 1
DECODE = 2

# Match encoded-word strings in the form =?charset?q?Hello_World?=
# The pattern has three named groups (charset, encoding, encoded), so
# ecre.split() yields the surrounding plain text followed by those three
# values for every match; decode_header() relies on that layout.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)
  36
  37
  38 \f
# Helpers
# Module-level alias for email.quopriMIME._max_append.  Header.encode_chunks()
# calls it as _max_append(lines, text, maxlinelen, ' ') to accumulate output
# lines.  NOTE(review): presumably it appends `text' to the last entry when
# the result still fits in maxlinelen and starts a new entry otherwise --
# confirm against email.quopriMIME.
_max_append = email.quopriMIME._max_append
  41
  42
  43 \f
  44 def decode_header(header):
  45     """Decode a message header value without converting charset.
  46
  47     Returns a list of (decoded_string, charset) pairs containing each of the
  48     decoded parts of the header.  Charset is None for non-encoded parts of the
  49     header, otherwise a lower-case string containing the name of the character
  50     set specified in the encoded string.
  51     """
  52     # If no encoding, just return the header
  53     header = str(header)
  54     if not ecre.search(header):
  55         return [(header, None)]
  56
  57     decoded = []
  58     dec = ''
  59     for line in header.splitlines():
  60         # This line might not have an encoding in it
  61         if not ecre.search(line):
  62             decoded.append((line, None))
  63             continue
  64
  65         parts = ecre.split(line)
  66         while parts:
  67             unenc = parts.pop(0).strip()
  68             if unenc:
  69                 # Should we continue a long line?
  70                 if decoded and decoded[-1][1] is None:
  71                     decoded[-1] = (decoded[-1][0] + dec, None)
  72                 else:
  73                     decoded.append((unenc, None))
  74             if parts:
  75                 charset, encoding = [s.lower() for s in parts[0:2]]
  76                 encoded = parts[2]
  77                 dec = ''
  78                 if encoding == 'q':
  79                     dec = email.quopriMIME.header_decode(encoded)
  80                 elif encoding == 'b':
  81                     dec = email.base64MIME.decode(encoded)
  82                 else:
  83                     dec = encoded
  84
  85                 if decoded and decoded[-1][1] == charset:
  86                     decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
  87                 else:
  88                     decoded.append((dec, charset))
  89             del parts[0:3]
  90     return decoded
  91
  92
  93 \f
  94 class Header:
  95     def __init__(self, s, charset=None, maxlinelen=None, header_name=None):
  96         """Create a MIME-compliant header that can contain many languages.
  97
  98         Specify the initial header value in s.  Specify its character set as a
  99         Charset object in the charset argument.  If none, a default Charset
 100         instance will be used.
 101
 102         You can later append to the header with append(s, charset) below;
 103         charset does not have to be the same as the one initially specified
 104         here.  In fact, it's optional, and if not given, defaults to the
 105         charset specified in the constructor.
 106
 107         The maximum line length can be specified explicitly via maxlinelen.
 108         You can also pass None for maxlinelen and the name of a header field
 109         (e.g. "Subject") to let the constructor guess the best line length to
 110         use.  The default maxlinelen is 76.
 111         """
 112         if charset is None:
 113             charset = Charset()
 114         self._charset = charset
 115         # BAW: I believe `chunks' and `maxlinelen' should be non-public.
 116         self._chunks = []
 117         self.append(s, charset)
 118         if maxlinelen is None:
 119             if header_name is None:
 120                 self._maxlinelen = MAXLINELEN
 121             else:
 122                 self.guess_maxlinelen(header_name)
 123         else:
 124             self._maxlinelen = maxlinelen
 125
 126     def __str__(self):
 127         """A synonym for self.encode()."""
 128         return self.encode()
 129
 130     def guess_maxlinelen(self, s=None):
 131         """Guess the maximum length to make each header line.
 132
 133         Given a header name (e.g. "Subject"), set this header's maximum line
 134         length to an appropriate length to avoid line wrapping.  If s is not
 135         given, return the previous maximum line length and don't set it.
 136
 137         Returns the new maximum line length.
 138         """
 139         # BAW: is this semantic necessary?
 140         if s is not None:
 141             self._maxlinelen = MAXLINELEN - len(s) - 2
 142         return self._maxlinelen
 143
 144     def append(self, s, charset=None):
 145         """Append string s with Charset charset to the MIME header.
 146
 147         charset defaults to the one given in the class constructor.
 148         """
 149         if charset is None:
 150             charset = self._charset
 151         self._chunks.append((s, charset))
 152
 153     def _split(self, s, charset):
 154         # Split up a header safely for use with encode_chunks.  BAW: this
 155         # appears to be a private convenience method.
 156         splittable = charset.to_splittable(s)
 157         encoded = charset.from_splittable(splittable)
 158         elen = charset.encoded_header_len(encoded)
 159
 160         if elen <= self._maxlinelen:
 161             return [(encoded, charset)]
 162         # BAW: should we use encoded?
 163         elif elen == len(s):
 164             # We can split on _maxlinelen boundaries because we know that the
 165             # encoding won't change the size of the string
 166             splitpnt = self._maxlinelen
 167             first = charset.from_splittable(splittable[:splitpnt], 0)
 168             last = charset.from_splittable(splittable[splitpnt:], 0)
 169             return self._split(first, charset) + self._split(last, charset)
 170         else:
 171             # Divide and conquer.
 172             halfway = _floordiv(len(splittable), 2)
 173             first = charset.from_splittable(splittable[:halfway], 0)
 174             last = charset.from_splittable(splittable[halfway:], 0)
 175             return self._split(first, charset) + self._split(last, charset)
 176
 177     def encode(self):
 178         """Encode a message header, possibly converting charset and encoding.
 179
 180         There are many issues involved in converting a given string for use in
 181         an email header.  Only certain character sets are readable in most
 182         email clients, and as header strings can only contain a subset of
 183         7-bit ASCII, care must be taken to properly convert and encode (with
 184         Base64 or quoted-printable) header strings.  In addition, there is a
 185         75-character length limit on any given encoded header field, so
 186         line-wrapping must be performed, even with double-byte character sets.
 187
 188         This method will do its best to convert the string to the correct
 189         character set used in email, and encode and line wrap it safely with
 190         the appropriate scheme for that character set.
 191
 192         If the given charset is not known or an error occurs during
 193         conversion, this function will return the header untouched.
 194         """
 195         newchunks = []
 196         for s, charset in self._chunks:
 197             newchunks += self._split(s, charset)
 198         self._chunks = newchunks
 199         return self.encode_chunks()
 200
 201     def encode_chunks(self):
 202         """MIME-encode a header with many different charsets and/or encodings.
 203
 204         Given a list of pairs (string, charset), return a MIME-encoded string
 205         suitable for use in a header field.  Each pair may have different
 206         charsets and/or encodings, and the resulting header will accurately
 207         reflect each setting.
 208
 209         Each encoding can be email.Utils.QP (quoted-printable, for ASCII-like
 210         character sets like iso-8859-1), email.Utils.BASE64 (Base64, for
 211         non-ASCII like character sets like KOI8-R and iso-2022-jp), or None
 212         (no encoding).
 213
 214         Each pair will be represented on a separate line; the resulting string
 215         will be in the format:
 216
 217         "=?charset1?q?Mar=EDa_Gonz=E1lez_Alonso?=\n
 218           =?charset2?b?SvxyZ2VuIEL2aW5n?="
 219         """
 220         chunks = []
 221         for header, charset in self._chunks:
 222             if charset is None:
 223                 _max_append(chunks, header, self._maxlinelen, ' ')
 224             else:
 225                 _max_append(chunks, charset.header_encode(header, 0),
 226                             self._maxlinelen, ' ')
 227         return NLSPACE.join(chunks)