Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import struct,types,__builtin__
  11
  12 ### Registry and builtin stateless codec functions
  13
# Pull everything exported by the _codecs module into this namespace.
# The module cannot work without it, so an ImportError is escalated to
# a SystemError that carries the original reason.
try:
    from _codecs import *
except ImportError,why:
    raise SystemError,\
          'Failed to load the builtin codecs: %s' % why

# Public names of this module.
# NOTE(review): "register" and "lookup" are not defined in this file;
# presumably they are provided by the star import above -- confirm.
__all__ = ["register","lookup","open","EncodedFile","BOM","BOM_BE",
           "BOM_LE","BOM32_BE","BOM32_LE","BOM64_BE","BOM64_LE"]
  22
  23 ### Constants
  24
#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#
# BOM is U+FEFF packed as an unsigned 16-bit value in the byte order
# of the machine running this code ('=' selects native byte order), so
# it is equal to either BOM_BE or BOM_LE.
BOM = struct.pack('=H',0xFEFF)
#
BOM_BE = BOM32_BE = '\376\377'
#       U+FEFF (ZERO WIDTH NO-BREAK SPACE) stored as one 16-bit unit
#       in big endian byte order
BOM_LE = BOM32_LE = '\377\376'
#       U+FEFF stored as one 16-bit unit in little endian byte order;
#       a reader assuming big endian order sees U+FFFE instead, which
#       is defined as being an illegal Unicode character

#
# Four-byte Byte Order Marks
#
# NOTE: despite the "64" in their names these values are 4 bytes
# (32 bits) long; the "32" names above refer to 2-byte (16-bit) values.
BOM64_BE = '\000\000\376\377'
#       U+0000FEFF stored as one 32-bit unit (UCS-4), big endian
BOM64_LE = '\377\376\000\000'
#       U+0000FEFF stored as one 32-bit unit (UCS-4), little endian
  44
  45
  46 ### Codec base classes (defining the API)
  47
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        Both methods raise NotImplementedError here; concrete codecs
        are expected to override them.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs.

    """
    def encode(self,input,errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamWriter for codecs which have to keep state in order
            to make encoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self,input,errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamReader for codecs which have to keep state in order
            to make decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
 104
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
 111
 112 class StreamWriter(Codec):
 113
 114     def __init__(self,stream,errors='strict'):
 115
 116         """ Creates a StreamWriter instance.
 117
 118             stream must be a file-like object open for writing
 119             (binary) data.
 120
 121             The StreamWriter may implement different error handling
 122             schemes by providing the errors keyword argument. These
 123             parameters are defined:
 124
 125              'strict' - raise a ValueError (or a subclass)
 126              'ignore' - ignore the character and continue with the next
 127              'replace'- replace with a suitable replacement character
 128
 129         """
 130         self.stream = stream
 131         self.errors = errors
 132
 133     def write(self, object):
 134
 135         """ Writes the object's contents encoded to self.stream.
 136         """
 137         data, consumed = self.encode(object,self.errors)
 138         self.stream.write(data)
 139
 140     def writelines(self, list):
 141
 142         """ Writes the concatenated list of strings to the stream
 143             using .write().
 144         """
 145         self.write(''.join(list))
 146
 147     def reset(self):
 148
 149         """ Flushes and resets the codec buffers used for keeping state.
 150
 151             Calling this method should ensure that the data on the
 152             output is put into a clean state, that allows appending
 153             of new fresh data without having to rescan the whole
 154             stream to recover state.
 155
 156         """
 157         pass
 158
 159     def __getattr__(self,name,
 160
 161                     getattr=getattr):
 162
 163         """ Inherit all other methods from the underlying stream.
 164         """
 165         return getattr(self.stream,name)
 166
 167 ###
 168
 169 class StreamReader(Codec):
 170
 171     def __init__(self,stream,errors='strict'):
 172
 173         """ Creates a StreamReader instance.
 174
 175             stream must be a file-like object open for reading
 176             (binary) data.
 177
 178             The StreamReader may implement different error handling
 179             schemes by providing the errors keyword argument. These
 180             parameters are defined:
 181
 182              'strict' - raise a ValueError (or a subclass)
 183              'ignore' - ignore the character and continue with the next
 184              'replace'- replace with a suitable replacement character;
 185
 186         """
 187         self.stream = stream
 188         self.errors = errors
 189
 190     def read(self, size=-1):
 191
 192         """ Decodes data from the stream self.stream and returns the
 193             resulting object.
 194
 195             size indicates the approximate maximum number of bytes to
 196             read from the stream for decoding purposes. The decoder
 197             can modify this setting as appropriate. The default value
 198             -1 indicates to read and decode as much as possible.  size
 199             is intended to prevent having to decode huge files in one
 200             step.
 201
 202             The method should use a greedy read strategy meaning that
 203             it should read as much data as is allowed within the
 204             definition of the encoding and the given size, e.g.  if
 205             optional encoding endings or state markers are available
 206             on the stream, these should be read too.
 207
 208         """
 209         # Unsliced reading:
 210         if size < 0:
 211             return self.decode(self.stream.read(), self.errors)[0]
 212
 213         # Sliced reading:
 214         read = self.stream.read
 215         decode = self.decode
 216         data = read(size)
 217         i = 0
 218         while 1:
 219             try:
 220                 object, decodedbytes = decode(data, self.errors)
 221             except ValueError,why:
 222                 # This method is slow but should work under pretty much
 223                 # all conditions; at most 10 tries are made
 224                 i = i + 1
 225                 newdata = read(1)
 226                 if not newdata or i > 10:
 227                     raise
 228                 data = data + newdata
 229             else:
 230                 return object
 231
 232     def readline(self, size=None):
 233
 234         """ Read one line from the input stream and return the
 235             decoded data.
 236
 237             Note: Unlike the .readlines() method, this method inherits
 238             the line breaking knowledge from the underlying stream's
 239             .readline() method -- there is currently no support for
 240             line breaking using the codec decoder due to lack of line
 241             buffering. Sublcasses should however, if possible, try to
 242             implement this method using their own knowledge of line
 243             breaking.
 244
 245             size, if given, is passed as size argument to the stream's
 246             .readline() method.
 247
 248         """
 249         if size is None:
 250             line = self.stream.readline()
 251         else:
 252             line = self.stream.readline(size)
 253         return self.decode(line,self.errors)[0]
 254
 255
 256     def readlines(self, sizehint=0):
 257
 258         """ Read all lines available on the input stream
 259             and return them as list of lines.
 260
 261             Line breaks are implemented using the codec's decoder
 262             method and are included in the list entries.
 263
 264             sizehint, if given, is passed as size argument to the
 265             stream's .read() method.
 266
 267         """
 268         if sizehint is None:
 269             data = self.stream.read()
 270         else:
 271             data = self.stream.read(sizehint)
 272         return self.decode(data,self.errors)[0].splitlines(1)
 273
 274     def reset(self):
 275
 276         """ Resets the codec buffers used for keeping state.
 277
 278             Note that no stream repositioning should take place.
 279             This method is primarily intended to be able to recover
 280             from decoding errors.
 281
 282         """
 283         pass
 284
 285     def __getattr__(self,name,
 286
 287                     getattr=getattr):
 288
 289         """ Inherit all other methods from the underlying stream.
 290         """
 291         return getattr(self.stream,name)
 292
 293 ###
 294
 295 class StreamReaderWriter:
 296
 297     """ StreamReaderWriter instances allow wrapping streams which
 298         work in both read and write modes.
 299
 300         The design is such that one can use the factory functions
 301         returned by the codec.lookup() function to construct the
 302         instance.
 303
 304     """
 305     # Optional attributes set by the file wrappers below
 306     encoding = 'unknown'
 307
 308     def __init__(self,stream,Reader,Writer,errors='strict'):
 309
 310         """ Creates a StreamReaderWriter instance.
 311
 312             stream must be a Stream-like object.
 313
 314             Reader, Writer must be factory functions or classes
 315             providing the StreamReader, StreamWriter interface resp.
 316
 317             Error handling is done in the same way as defined for the
 318             StreamWriter/Readers.
 319
 320         """
 321         self.stream = stream
 322         self.reader = Reader(stream, errors)
 323         self.writer = Writer(stream, errors)
 324         self.errors = errors
 325
 326     def read(self,size=-1):
 327
 328         return self.reader.read(size)
 329
 330     def readline(self, size=None):
 331
 332         return self.reader.readline(size)
 333
 334     def readlines(self, sizehint=None):
 335
 336         return self.reader.readlines(sizehint)
 337
 338     def write(self,data):
 339
 340         return self.writer.write(data)
 341
 342     def writelines(self,list):
 343
 344         return self.writer.writelines(list)
 345
 346     def reset(self):
 347
 348         self.reader.reset()
 349         self.writer.reset()
 350
 351     def __getattr__(self,name,
 352
 353                     getattr=getattr):
 354
 355         """ Inherit all other methods from the underlying stream.
 356         """
 357         return getattr(self.stream,name)
 358
 359 ###
 360
 361 class StreamRecoder:
 362
 363     """ StreamRecoder instances provide a frontend - backend
 364         view of encoding data.
 365
 366         They use the complete set of APIs returned by the
 367         codecs.lookup() function to implement their task.
 368
 369         Data written to the stream is first decoded into an
 370         intermediate format (which is dependent on the given codec
 371         combination) and then written to the stream using an instance
 372         of the provided Writer class.
 373
 374         In the other direction, data is read from the stream using a
 375         Reader instance and then return encoded data to the caller.
 376
 377     """
 378     # Optional attributes set by the file wrappers below
 379     data_encoding = 'unknown'
 380     file_encoding = 'unknown'
 381
 382     def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
 383
 384         """ Creates a StreamRecoder instance which implements a two-way
 385             conversion: encode and decode work on the frontend (the
 386             input to .read() and output of .write()) while
 387             Reader and Writer work on the backend (reading and
 388             writing to the stream).
 389
 390             You can use these objects to do transparent direct
 391             recodings from e.g. latin-1 to utf-8 and back.
 392
 393             stream must be a file-like object.
 394
 395             encode, decode must adhere to the Codec interface, Reader,
 396             Writer must be factory functions or classes providing the
 397             StreamReader, StreamWriter interface resp.
 398
 399             encode and decode are needed for the frontend translation,
 400             Reader and Writer for the backend translation. Unicode is
 401             used as intermediate encoding.
 402
 403             Error handling is done in the same way as defined for the
 404             StreamWriter/Readers.
 405
 406         """
 407         self.stream = stream
 408         self.encode = encode
 409         self.decode = decode
 410         self.reader = Reader(stream, errors)
 411         self.writer = Writer(stream, errors)
 412         self.errors = errors
 413
 414     def read(self,size=-1):
 415
 416         data = self.reader.read(size)
 417         data, bytesencoded = self.encode(data, self.errors)
 418         return data
 419
 420     def readline(self,size=None):
 421
 422         if size is None:
 423             data = self.reader.readline()
 424         else:
 425             data = self.reader.readline(size)
 426         data, bytesencoded = self.encode(data, self.errors)
 427         return data
 428
 429     def readlines(self,sizehint=None):
 430
 431         if sizehint is None:
 432             data = self.reader.read()
 433         else:
 434             data = self.reader.read(sizehint)
 435         data, bytesencoded = self.encode(data, self.errors)
 436         return data.splitlines(1)
 437
 438     def write(self,data):
 439
 440         data, bytesdecoded = self.decode(data, self.errors)
 441         return self.writer.write(data)
 442
 443     def writelines(self,list):
 444
 445         data = ''.join(list)
 446         data, bytesdecoded = self.decode(data, self.errors)
 447         return self.writer.write(data)
 448
 449     def reset(self):
 450
 451         self.reader.reset()
 452         self.writer.reset()
 453
 454     def __getattr__(self,name,
 455
 456                     getattr=getattr):
 457
 458         """ Inherit all other methods from the underlying stream.
 459         """
 460         return getattr(self.stream,name)
 461
 462 ### Shortcuts
 463
 464 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 465
 466     """ Open an encoded file using the given mode and return
 467         a wrapped version providing transparent encoding/decoding.
 468
 469         Note: The wrapped version will only accept the object format
 470         defined by the codecs, i.e. Unicode objects for most builtin
 471         codecs. Output is also codec dependent and will usually by
 472         Unicode as well.
 473
 474         Files are always opened in binary mode, even if no binary mode
 475         was specified. Thisis done to avoid data loss due to encodings
 476         using 8-bit values. The default file mode is 'rb' meaning to
 477         open the file in binary read mode.
 478
 479         encoding specifies the encoding which is to be used for the
 480         the file.
 481
 482         errors may be given to define the error handling. It defaults
 483         to 'strict' which causes ValueErrors to be raised in case an
 484         encoding error occurs.
 485
 486         buffering has the same meaning as for the builtin open() API.
 487         It defaults to line buffered.
 488
 489         The returned wrapped file object provides an extra attribute
 490         .encoding which allows querying the used encoding. This
 491         attribute is only available if an encoding was specified as
 492         parameter.
 493
 494     """
 495     if encoding is not None and \
 496        'b' not in mode:
 497         # Force opening of the file in binary mode
 498         mode = mode + 'b'
 499     file = __builtin__.open(filename, mode, buffering)
 500     if encoding is None:
 501         return file
 502     (e,d,sr,sw) = lookup(encoding)
 503     srw = StreamReaderWriter(file, sr, sw, errors)
 504     # Add attributes to simplify introspection
 505     srw.encoding = encoding
 506     return srw
 507
 508 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 509
 510     """ Return a wrapped version of file which provides transparent
 511         encoding translation.
 512
 513         Strings written to the wrapped file are interpreted according
 514         to the given data_encoding and then written to the original
 515         file as string using file_encoding. The intermediate encoding
 516         will usually be Unicode but depends on the specified codecs.
 517
 518         Strings are read from the file using file_encoding and then
 519         passed back to the caller as string using data_encoding.
 520
 521         If file_encoding is not given, it defaults to data_encoding.
 522
 523         errors may be given to define the error handling. It defaults
 524         to 'strict' which causes ValueErrors to be raised in case an
 525         encoding error occurs.
 526
 527         The returned wrapped file object provides two extra attributes
 528         .data_encoding and .file_encoding which reflect the given
 529         parameters of the same name. The attributes can be used for
 530         introspection by Python programs.
 531
 532     """
 533     if file_encoding is None:
 534         file_encoding = data_encoding
 535     encode, decode = lookup(data_encoding)[:2]
 536     Reader, Writer = lookup(file_encoding)[2:]
 537     sr = StreamRecoder(file,
 538                        encode,decode,Reader,Writer,
 539                        errors)
 540     # Add attributes to simplify introspection
 541     sr.data_encoding = data_encoding
 542     sr.file_encoding = file_encoding
 543     return sr
 544
 545 ### Helpers for charmap-based codecs
 546
 547 def make_identity_dict(rng):
 548
 549     """ make_identity_dict(rng) -> dict
 550
 551         Return a dictionary where elements of the rng sequence are
 552         mapped to themselves.
 553
 554     """
 555     res = {}
 556     for i in rng:
 557         res[i]=i
 558     return res
 559
 560 ### Tests
 561
 562 if __name__ == '__main__':
 563
 564     import sys
 565
 566     # Make stdout translate Latin-1 output into UTF-8 output
 567     sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')
 568
 569     # Have stdin translate Latin-1 input into UTF-8 input
 570     sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')