Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import struct,types,__builtin__
  11
  12 ### Registry and builtin stateless codec functions
  13
  14 try:
  15     from _codecs import *
  16 except ImportError,why:
  17     raise SystemError,\
  18           'Failed to load the builtin codecs: %s' % why
  19
  20 ### Constants
  21
  22 #
  23 # Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
  24 #
  25 BOM = struct.pack('=H',0xFEFF)
  26 #
  27 BOM_BE = BOM32_BE = '\376\377'
  28 #       corresponds to Unicode U+FEFF in UTF-16 on big endian
  29 #       platforms == ZERO WIDTH NO-BREAK SPACE
  30 BOM_LE = BOM32_LE = '\377\376'
  31 #       corresponds to Unicode U+FFFE in UTF-16 on little endian
  32 #       platforms == defined as being an illegal Unicode character
  33
  34 #
  35 # 64-bit Byte Order Marks
  36 #
  37 BOM64_BE = '\000\000\376\377'
  38 #       corresponds to Unicode U+0000FEFF in UCS-4
  39 BOM64_LE = '\377\376\000\000'
  40 #       corresponds to Unicode U+0000FFFE in UCS-4
  41
  42
  43 ### Codec base classes (defining the API)
  44
  45 class Codec:
  46
  47     """ Defines the interface for stateless encoders/decoders.
  48
  49         The .encode()/.decode() methods may implement different error
  50         handling schemes by providing the errors argument. These
  51         string values are defined:
  52
  53          'strict' - raise a ValueError error (or a subclass)
  54          'ignore' - ignore the character and continue with the next
  55          'replace' - replace with a suitable replacement character;
  56                     Python will use the official U+FFFD REPLACEMENT
  57                     CHARACTER for the builtin Unicode codecs.
  58
  59     """
  60     def encode(self,input,errors='strict'):
  61
  62         """ Encodes the object input and returns a tuple (output
  63             object, length consumed).
  64
  65             errors defines the error handling to apply. It defaults to
  66             'strict' handling.
  67
  68             The method may not store state in the Codec instance. Use
  69             StreamCodec for codecs which have to keep state in order to
  70             make encoding/decoding efficient.
  71
  72             The encoder must be able to handle zero length input and
  73             return an empty object of the output object type in this
  74             situation.
  75
  76         """
  77         raise NotImplementedError
  78
  79     def decode(self,input,errors='strict'):
  80
  81         """ Decodes the object input and returns a tuple (output
  82             object, length consumed).
  83
  84             input must be an object which provides the bf_getreadbuf
  85             buffer slot. Python strings, buffer objects and memory
  86             mapped files are examples of objects providing this slot.
  87
  88             errors defines the error handling to apply. It defaults to
  89             'strict' handling.
  90
  91             The method may not store state in the Codec instance. Use
  92             StreamCodec for codecs which have to keep state in order to
  93             make encoding/decoding efficient.
  94
  95             The decoder must be able to handle zero length input and
  96             return an empty object of the output object type in this
  97             situation.
  98
  99         """
 100         raise NotImplementedError
 101
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
 108
 109 class StreamWriter(Codec):
 110
 111     def __init__(self,stream,errors='strict'):
 112
 113         """ Creates a StreamWriter instance.
 114
 115             stream must be a file-like object open for writing
 116             (binary) data.
 117
 118             The StreamWriter may implement different error handling
 119             schemes by providing the errors keyword argument. These
 120             parameters are defined:
 121
 122              'strict' - raise a ValueError (or a subclass)
 123              'ignore' - ignore the character and continue with the next
 124              'replace'- replace with a suitable replacement character
 125
 126         """
 127         self.stream = stream
 128         self.errors = errors
 129
 130     def write(self,object):
 131
 132         """ Writes the object's contents encoded to self.stream.
 133         """
 134         data, consumed = self.encode(object,self.errors)
 135         self.stream.write(data)
 136
 137     # XXX .writelines() ?
 138
 139     def reset(self):
 140
 141         """ Flushes and resets the codec buffers used for keeping state.
 142
 143             Calling this method should ensure that the data on the
 144             output is put into a clean state, that allows appending
 145             of new fresh data without having to rescan the whole
 146             stream to recover state.
 147
 148         """
 149         pass
 150
 151     def __getattr__(self,name,
 152
 153                     getattr=getattr):
 154
 155         """ Inherit all other methods from the underlying stream.
 156         """
 157         return getattr(self.stream,name)
 158
 159 ###
 160
 161 class StreamReader(Codec):
 162
 163     def __init__(self,stream,errors='strict'):
 164
 165         """ Creates a StreamReader instance.
 166
 167             stream must be a file-like object open for reading
 168             (binary) data.
 169
 170             The StreamReader may implement different error handling
 171             schemes by providing the errors keyword argument. These
 172             parameters are defined:
 173
 174              'strict' - raise a ValueError (or a subclass)
 175              'ignore' - ignore the character and continue with the next
 176              'replace'- replace with a suitable replacement character;
 177
 178         """
 179         self.stream = stream
 180         self.errors = errors
 181
 182     def read(self,size=-1):
 183
 184         """ Decodes data from the stream self.stream and returns the
 185             resulting object.
 186
 187             size indicates the approximate maximum number of bytes to
 188             read from the stream for decoding purposes. The decoder
 189             can modify this setting as appropriate. The default value
 190             -1 indicates to read and decode as much as possible.  size
 191             is intended to prevent having to decode huge files in one
 192             step.
 193
 194             The method should use a greedy read strategy meaning that
 195             it should read as much data as is allowed within the
 196             definition of the encoding and the given size, e.g.  if
 197             optional encoding endings or state markers are available
 198             on the stream, these should be read too.
 199
 200         """
 201         # Unsliced reading:
 202         if size < 0:
 203             return self.decode(self.stream.read())[0]
 204
 205         # Sliced reading:
 206         read = self.stream.read
 207         decode = self.decode
 208         data = read(size)
 209         i = 0
 210         while 1:
 211             try:
 212                 object, decodedbytes = decode(data)
 213             except ValueError,why:
 214                 # This method is slow but should work under pretty much
 215                 # all conditions; at most 10 tries are made
 216                 i = i + 1
 217                 newdata = read(1)
 218                 if not newdata or i > 10:
 219                     raise
 220                 data = data + newdata
 221             else:
 222                 return object
 223
 224     # XXX .readline() and .readlines() (these are hard to implement
 225     #     without using buffers for keeping read-ahead data)
 226
 227     def reset(self):
 228
 229         """ Resets the codec buffers used for keeping state.
 230
 231             Note that no stream repositioning should take place.
 232             This method is primarely intended to be able to recover
 233             from decoding errors.
 234
 235         """
 236         pass
 237
 238     def __getattr__(self,name,
 239
 240                     getattr=getattr):
 241
 242         """ Inherit all other methods from the underlying stream.
 243         """
 244         return getattr(self.stream,name)
 245
 246 ###
 247
 248 class StreamReaderWriter:
 249
 250     def __init__(self,stream,Reader,Writer,errors='strict'):
 251
 252         """ Creates a StreamReaderWriter instance.
 253
 254             stream must be a Stream-like object.
 255
 256             Reader, Writer must be factory functions or classes
 257             providing the StreamReader, StreamWriter interface resp.
 258
 259             Error handling is done in the same way as defined for the
 260             StreamWriter/Readers.
 261
 262         """
 263         self.stream = stream
 264         self.reader = Reader(stream, errors)
 265         self.writer = Writer(stream, errors)
 266         self.errors = errors
 267
 268     def read(self,size=-1):
 269
 270         return self.reader.read(size)
 271
 272     def write(self,data):
 273
 274         return self.writer.write(data)
 275
 276     def reset(self):
 277
 278         self.reader.reset()
 279         self.writer.reset()
 280
 281     def __getattr__(self,name,
 282
 283                     getattr=getattr):
 284
 285         """ Inherit all other methods from the underlying stream.
 286         """
 287         return getattr(self.stream,name)
 288
 289 ###
 290
 291 class StreamRecoder:
 292
 293     def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
 294
 295         """ Creates a StreamRecoder instance which implements a two-way
 296             conversion: encode and decode work on the frontend (the
 297             input to .read() and output of .write()) while
 298             Reader and Writer work on the backend (reading and
 299             writing to the stream).
 300
 301             You can use these objects to do transparent direct
 302             recodings from e.g. latin-1 to utf-8 and back.
 303
 304             stream must be a file-like object.
 305
 306             encode, decode must adhere to the Codec interface, Reader,
 307             Writer must be factory functions or classes providing the
 308             StreamReader, StreamWriter interface resp.
 309
 310             encode and decode are needed for the frontend translation,
 311             Reader and Writer for the backend translation. Unicode is
 312             used as intermediate encoding.
 313
 314             Error handling is done in the same way as defined for the
 315             StreamWriter/Readers.
 316
 317         """
 318         self.stream = stream
 319         self.encode = encode
 320         self.decode = decode
 321         self.reader = Reader(stream, errors)
 322         self.writer = Writer(stream, errors)
 323         self.errors = errors
 324
 325     def read(self,size=-1):
 326
 327         data = self.reader.read(size)
 328         data, bytesencoded = self.encode(data, self.errors)
 329         return data
 330
 331     def write(self,data):
 332
 333         data, bytesdecoded = self.decode(data, self.errors)
 334         return self.writer.write(data)
 335
 336     # .writelines(), .readline() and .readlines() ... see notes
 337     # above.
 338
 339     def reset(self):
 340
 341         self.reader.reset()
 342         self.writer.reset()
 343
 344     def __getattr__(self,name,
 345
 346                     getattr=getattr):
 347
 348         """ Inherit all other methods from the underlying stream.
 349         """
 350         return getattr(self.stream,name)
 351
 352 ### Shortcuts
 353
 354 def open(filename, mode, encoding=None, errors='strict', buffering=1):
 355
 356     """ Open an encoded file using the given mode and return
 357         a wrapped version providing transparent encoding/decoding.
 358
 359         Note: The wrapped version will only accept the object format
 360         defined by the codecs, i.e. Unicode objects for most builtin
 361         codecs. Output is also codec dependent and will usually by
 362         Unicode as well.
 363
 364         encoding specifies the encoding which is to be used for the
 365         the file.
 366
 367         errors may be given to define the error handling. It defaults
 368         to 'strict' which causes ValueErrors to be raised in case an
 369         encoding error occurs.
 370
 371         buffering has the same meaning as for the builtin open() API.
 372         It defaults to line buffered.
 373
 374     """
 375     if encoding is not None and \
 376        'b' not in mode:
 377         # Force opening of the file in binary mode
 378         mode = mode + 'b'
 379     file = __builtin__.open(filename, mode, buffering)
 380     if encoding is None:
 381         return file
 382     (e,d,sr,sw) = lookup(encoding)
 383     return StreamReaderWriter(file, sr, sw, errors)
 384
 385 def EncodedFile(file, input, output=None, errors='strict'):
 386
 387     """ Return a wrapped version of file which provides transparent
 388         encoding translation.
 389
 390         Strings written to the wrapped file are interpreted according
 391         to the given input encoding and then written to the original
 392         file as string using the output encoding. The intermediate
 393         encoding will usually be Unicode but depends on the specified
 394         codecs.
 395
 396         If output is not given, it defaults to input.
 397
 398         errors may be given to define the error handling. It defaults
 399         to 'strict' which causes ValueErrors to be raised in case an
 400         encoding error occurs.
 401
 402     """
 403     if output is None:
 404         output = input
 405     encode, decode = lookup(input)[:2]
 406     Reader, Writer = lookup(output)[2:]
 407     return StreamRecoder(file,
 408                          encode,decode,Reader,Writer,
 409                          errors)
 410
 411 ### Tests
 412
 413 if __name__ == '__main__':
 414
 415     import sys
 416
 417     # Make stdout translate Latin-1 into Unicode-Escape
 418     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')