Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import struct,types,__builtin__
  11
  12 ### Registry and builtin stateless codec functions
  13
  14 try:
  15     from _codecs import *
  16 except ImportError,why:
  17     raise SystemError,\
  18           'Failed to load the builtin codecs: %s' % why
  19
  20 ### Constants
  21
  22 #
  23 # Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
  24 #
  25 BOM = struct.pack('=H',0xFEFF)
  26 #
  27 BOM_BE = BOM32_BE = '\376\377'
  28 #       corresponds to Unicode U+FEFF in UTF-16 on big endian
  29 #       platforms == ZERO WIDTH NO-BREAK SPACE
  30 BOM_LE = BOM32_LE = '\377\376'
  31 #       corresponds to Unicode U+FFFE in UTF-16 on little endian
  32 #       platforms == defined as being an illegal Unicode character
  33
  34 #
  35 # 64-bit Byte Order Marks
  36 #
  37 BOM64_BE = '\000\000\376\377'
  38 #       corresponds to Unicode U+0000FEFF in UCS-4
  39 BOM64_LE = '\377\376\000\000'
  40 #       corresponds to Unicode U+0000FFFE in UCS-4
  41
  42
  43 ### Codec base classes (defining the API)
  44
  45 class Codec:
  46
  47     """ Defines the interface for stateless encoders/decoders.
  48
  49         The .encode()/.decode() methods may implement different error
  50         handling schemes by providing the errors argument. These
  51         string values are defined:
  52
  53          'strict' - raise a ValueError error (or a subclass)
  54          'ignore' - ignore the character and continue with the next
  55          'replace' - replace with a suitable replacement character;
  56                     Python will use the official U+FFFD REPLACEMENT
  57                     CHARACTER for the builtin Unicode codecs.
  58
  59     """
  60     def encode(self,input,errors='strict'):
  61
  62         """ Encodes the object input and returns a tuple (output
  63             object, length consumed).
  64
  65             errors defines the error handling to apply. It defaults to
  66             'strict' handling.
  67
  68             The method may not store state in the Codec instance. Use
  69             StreamCodec for codecs which have to keep state in order to
  70             make encoding/decoding efficient.
  71
  72             The encoder must be able to handle zero length input and
  73             return an empty object of the output object type in this
  74             situation.
  75
  76         """
  77         raise NotImplementedError
  78
  79     def decode(self,input,errors='strict'):
  80
  81         """ Decodes the object input and returns a tuple (output
  82             object, length consumed).
  83
  84             input must be an object which provides the bf_getreadbuf
  85             buffer slot. Python strings, buffer objects and memory
  86             mapped files are examples of objects providing this slot.
  87
  88             errors defines the error handling to apply. It defaults to
  89             'strict' handling.
  90
  91             The method may not store state in the Codec instance. Use
  92             StreamCodec for codecs which have to keep state in order to
  93             make encoding/decoding efficient.
  94
  95             The decoder must be able to handle zero length input and
  96             return an empty object of the output object type in this
  97             situation.
  98
  99         """
 100         raise NotImplementedError
 101
 102 #
 103 # The StreamWriter and StreamReader class provide generic working
 104 # interfaces which can be used to implement new encodings submodules
 105 # very easily. See encodings/utf_8.py for an example on how this is
 106 # done.
 107 #
 108
 109 class StreamWriter(Codec):
 110
 111     def __init__(self,stream,errors='strict'):
 112
 113         """ Creates a StreamWriter instance.
 114
 115             stream must be a file-like object open for writing
 116             (binary) data.
 117
 118             The StreamWriter may implement different error handling
 119             schemes by providing the errors keyword argument. These
 120             parameters are defined:
 121
 122              'strict' - raise a ValueError (or a subclass)
 123              'ignore' - ignore the character and continue with the next
 124              'replace'- replace with a suitable replacement character
 125
 126         """
 127         self.stream = stream
 128         self.errors = errors
 129
 130     def write(self, object):
 131
 132         """ Writes the object's contents encoded to self.stream.
 133         """
 134         data, consumed = self.encode(object,self.errors)
 135         self.stream.write(data)
 136
 137     def writelines(self, list):
 138
 139         """ Writes the concatenated list of strings to the stream
 140             using .write().
 141         """
 142         self.write(''.join(list))
 143
 144     def reset(self):
 145
 146         """ Flushes and resets the codec buffers used for keeping state.
 147
 148             Calling this method should ensure that the data on the
 149             output is put into a clean state, that allows appending
 150             of new fresh data without having to rescan the whole
 151             stream to recover state.
 152
 153         """
 154         pass
 155
 156     def __getattr__(self,name,
 157
 158                     getattr=getattr):
 159
 160         """ Inherit all other methods from the underlying stream.
 161         """
 162         return getattr(self.stream,name)
 163
 164 ###
 165
 166 class StreamReader(Codec):
 167
 168     def __init__(self,stream,errors='strict'):
 169
 170         """ Creates a StreamReader instance.
 171
 172             stream must be a file-like object open for reading
 173             (binary) data.
 174
 175             The StreamReader may implement different error handling
 176             schemes by providing the errors keyword argument. These
 177             parameters are defined:
 178
 179              'strict' - raise a ValueError (or a subclass)
 180              'ignore' - ignore the character and continue with the next
 181              'replace'- replace with a suitable replacement character;
 182
 183         """
 184         self.stream = stream
 185         self.errors = errors
 186
 187     def read(self, size=-1):
 188
 189         """ Decodes data from the stream self.stream and returns the
 190             resulting object.
 191
 192             size indicates the approximate maximum number of bytes to
 193             read from the stream for decoding purposes. The decoder
 194             can modify this setting as appropriate. The default value
 195             -1 indicates to read and decode as much as possible.  size
 196             is intended to prevent having to decode huge files in one
 197             step.
 198
 199             The method should use a greedy read strategy meaning that
 200             it should read as much data as is allowed within the
 201             definition of the encoding and the given size, e.g.  if
 202             optional encoding endings or state markers are available
 203             on the stream, these should be read too.
 204
 205         """
 206         # Unsliced reading:
 207         if size < 0:
 208             return self.decode(self.stream.read())[0]
 209
 210         # Sliced reading:
 211         read = self.stream.read
 212         decode = self.decode
 213         data = read(size)
 214         i = 0
 215         while 1:
 216             try:
 217                 object, decodedbytes = decode(data)
 218             except ValueError,why:
 219                 # This method is slow but should work under pretty much
 220                 # all conditions; at most 10 tries are made
 221                 i = i + 1
 222                 newdata = read(1)
 223                 if not newdata or i > 10:
 224                     raise
 225                 data = data + newdata
 226             else:
 227                 return object
 228
 229     def readline(self, size=None):
 230
 231         """ Read one line from the input stream and return the
 232             decoded data.
 233
 234             Note: Unlike the .readlines() method, this method inherits
 235             the line breaking knowledge from the underlying stream's
 236             .readline() method -- there is currently no support for
 237             line breaking using the codec decoder due to lack of line
 238             buffering. Sublcasses should however, if possible, try to
 239             implement this method using their own knowledge of line
 240             breaking.
 241
 242             size, if given, is passed as size argument to the stream's
 243             .readline() method.
 244
 245         """
 246         if size is None:
 247             line = self.stream.readline()
 248         else:
 249             line = self.stream.readline(size)
 250         return self.decode(line)[0]
 251
 252
 253     def readlines(self, sizehint=0):
 254
 255         """ Read all lines available on the input stream
 256             and return them as list of lines.
 257
 258             Line breaks are implemented using the codec's decoder
 259             method and are included in the list entries.
 260
 261             sizehint, if given, is passed as size argument to the
 262             stream's .read() method.
 263
 264         """
 265         if sizehint is None:
 266             data = self.stream.read()
 267         else:
 268             data = self.stream.read(sizehint)
 269         return self.decode(data)[0].splitlines(1)
 270
 271     def reset(self):
 272
 273         """ Resets the codec buffers used for keeping state.
 274
 275             Note that no stream repositioning should take place.
 276             This method is primarily intended to be able to recover
 277             from decoding errors.
 278
 279         """
 280         pass
 281
 282     def __getattr__(self,name,
 283
 284                     getattr=getattr):
 285
 286         """ Inherit all other methods from the underlying stream.
 287         """
 288         return getattr(self.stream,name)
 289
 290 ###
 291
 292 class StreamReaderWriter:
 293
 294     """ StreamReaderWriter instances allow wrapping streams which
 295         work in both read and write modes.
 296
 297         The design is such that one can use the factory functions
 298         returned by the codec.lookup() function to construct the
 299         instance.
 300
 301     """
 302     # Optional attributes set by the file wrappers below
 303     encoding = 'unknown'
 304
 305     def __init__(self,stream,Reader,Writer,errors='strict'):
 306
 307         """ Creates a StreamReaderWriter instance.
 308
 309             stream must be a Stream-like object.
 310
 311             Reader, Writer must be factory functions or classes
 312             providing the StreamReader, StreamWriter interface resp.
 313
 314             Error handling is done in the same way as defined for the
 315             StreamWriter/Readers.
 316
 317         """
 318         self.stream = stream
 319         self.reader = Reader(stream, errors)
 320         self.writer = Writer(stream, errors)
 321         self.errors = errors
 322
 323     def read(self,size=-1):
 324
 325         return self.reader.read(size)
 326
 327     def readline(self, size=None):
 328
 329         return self.reader.readline(size)
 330
 331     def readlines(self, sizehint=None):
 332
 333         return self.reader.readlines(sizehint)
 334
 335     def write(self,data):
 336
 337         return self.writer.write(data)
 338
 339     def writelines(self,list):
 340
 341         return self.writer.writelines(list)
 342
 343     def reset(self):
 344
 345         self.reader.reset()
 346         self.writer.reset()
 347
 348     def __getattr__(self,name,
 349
 350                     getattr=getattr):
 351
 352         """ Inherit all other methods from the underlying stream.
 353         """
 354         return getattr(self.stream,name)
 355
 356 ###
 357
 358 class StreamRecoder:
 359
 360     """ StreamRecoder instances provide a frontend - backend
 361         view of encoding data.
 362
 363         They use the complete set of APIs returned by the
 364         codecs.lookup() function to implement their task.
 365
 366         Data written to the stream is first decoded into an
 367         intermediate format (which is dependent on the given codec
 368         combination) and then written to the stream using an instance
 369         of the provided Writer class.
 370
 371         In the other direction, data is read from the stream using a
 372         Reader instance and then return encoded data to the caller.
 373
 374     """
 375     # Optional attributes set by the file wrappers below
 376     data_encoding = 'unknown'
 377     file_encoding = 'unknown'
 378
 379     def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
 380
 381         """ Creates a StreamRecoder instance which implements a two-way
 382             conversion: encode and decode work on the frontend (the
 383             input to .read() and output of .write()) while
 384             Reader and Writer work on the backend (reading and
 385             writing to the stream).
 386
 387             You can use these objects to do transparent direct
 388             recodings from e.g. latin-1 to utf-8 and back.
 389
 390             stream must be a file-like object.
 391
 392             encode, decode must adhere to the Codec interface, Reader,
 393             Writer must be factory functions or classes providing the
 394             StreamReader, StreamWriter interface resp.
 395
 396             encode and decode are needed for the frontend translation,
 397             Reader and Writer for the backend translation. Unicode is
 398             used as intermediate encoding.
 399
 400             Error handling is done in the same way as defined for the
 401             StreamWriter/Readers.
 402
 403         """
 404         self.stream = stream
 405         self.encode = encode
 406         self.decode = decode
 407         self.reader = Reader(stream, errors)
 408         self.writer = Writer(stream, errors)
 409         self.errors = errors
 410
 411     def read(self,size=-1):
 412
 413         data = self.reader.read(size)
 414         data, bytesencoded = self.encode(data, self.errors)
 415         return data
 416
 417     def readline(self,size=None):
 418
 419         if size is None:
 420             data = self.reader.readline()
 421         else:
 422             data = self.reader.readline(size)
 423         data, bytesencoded = self.encode(data, self.errors)
 424         return data
 425
 426     def readlines(self,sizehint=None):
 427
 428         if sizehint is None:
 429             data = self.reader.read()
 430         else:
 431             data = self.reader.read(sizehint)
 432         data, bytesencoded = self.encode(data, self.errors)
 433         return data.splitlines(1)
 434
 435     def write(self,data):
 436
 437         data, bytesdecoded = self.decode(data, self.errors)
 438         return self.writer.write(data)
 439
 440     def writelines(self,list):
 441
 442         data = ''.join(list)
 443         data, bytesdecoded = self.decode(data, self.errors)
 444         return self.writer.write(data)
 445
 446     def reset(self):
 447
 448         self.reader.reset()
 449         self.writer.reset()
 450
 451     def __getattr__(self,name,
 452
 453                     getattr=getattr):
 454
 455         """ Inherit all other methods from the underlying stream.
 456         """
 457         return getattr(self.stream,name)
 458
 459 ### Shortcuts
 460
 461 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 462
 463     """ Open an encoded file using the given mode and return
 464         a wrapped version providing transparent encoding/decoding.
 465
 466         Note: The wrapped version will only accept the object format
 467         defined by the codecs, i.e. Unicode objects for most builtin
 468         codecs. Output is also codec dependent and will usually by
 469         Unicode as well.
 470
 471         Files are always opened in binary mode, even if no binary mode
 472         was specified. Thisis done to avoid data loss due to encodings
 473         using 8-bit values. The default file mode is 'rb' meaning to
 474         open the file in binary read mode.
 475
 476         encoding specifies the encoding which is to be used for the
 477         the file.
 478
 479         errors may be given to define the error handling. It defaults
 480         to 'strict' which causes ValueErrors to be raised in case an
 481         encoding error occurs.
 482
 483         buffering has the same meaning as for the builtin open() API.
 484         It defaults to line buffered.
 485
 486         The returned wrapped file object provides an extra attribute
 487         .encoding which allows querying the used encoding. This
 488         attribute is only available if an encoding was specified as
 489         parameter.
 490
 491     """
 492     if encoding is not None and \
 493        'b' not in mode:
 494         # Force opening of the file in binary mode
 495         mode = mode + 'b'
 496     file = __builtin__.open(filename, mode, buffering)
 497     if encoding is None:
 498         return file
 499     (e,d,sr,sw) = lookup(encoding)
 500     srw = StreamReaderWriter(file, sr, sw, errors)
 501     # Add attributes to simplify introspection
 502     srw.encoding = encoding
 503     return srw
 504
 505 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 506
 507     """ Return a wrapped version of file which provides transparent
 508         encoding translation.
 509
 510         Strings written to the wrapped file are interpreted according
 511         to the given data_encoding and then written to the original
 512         file as string using file_encoding. The intermediate encoding
 513         will usually be Unicode but depends on the specified codecs.
 514
 515         Strings are read from the file using file_encoding and then
 516         passed back to the caller as string using data_encoding.
 517
 518         If file_encoding is not given, it defaults to data_encoding.
 519
 520         errors may be given to define the error handling. It defaults
 521         to 'strict' which causes ValueErrors to be raised in case an
 522         encoding error occurs.
 523
 524         data_encoding and file_encoding are added to the wrapped file
 525         object as attributes .data_encoding and .file_encoding resp.
 526
 527         The returned wrapped file object provides two extra attributes
 528         .data_encoding and .file_encoding which reflect the given
 529         parameters of the same name. The attributes can be used for
 530         introspection by Python programs.
 531
 532     """
 533     if file_encoding is None:
 534         file_encoding = data_encoding
 535     encode, decode = lookup(data_encoding)[:2]
 536     Reader, Writer = lookup(file_encoding)[2:]
 537     sr = StreamRecoder(file,
 538                        encode,decode,Reader,Writer,
 539                        errors)
 540     # Add attributes to simplify introspection
 541     sr.data_encoding = data_encoding
 542     sr.file_encoding = file_encoding
 543     return sr
 544
 545 ### Tests
 546
 547 if __name__ == '__main__':
 548
 549     import sys
 550
 551     # Make stdout translate Latin-1 output into UTF-8 output
 552     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
 553
 554     # Have stdin translate Latin-1 input into UTF-8 input
 555     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')