1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
10 import struct
, __builtin__
12 ### Registry and builtin stateless codec functions
16 except ImportError, why
:
18 'Failed to load the builtin codecs: %s' % why
20 __all__
= ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24 "strict_errors", "ignore_errors", "replace_errors",
25 "xmlcharrefreplace_errors",
26 "register_error", "lookup_error"]

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF)

# UTF-32, native endianness
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
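
# Illustrative sketch (not part of this module's API): a hypothetical helper
# showing how the BOM constants above can be used to guess the UTF flavour of
# raw byte data.  The 4-byte UTF-32 marks are tested first, since
# BOM_UTF32_LE starts with the same two bytes as BOM_UTF16_LE.
def _example_sniff_bom(data):
    if data[:4] == BOM_UTF32_LE:
        return 'utf-32-le'
    elif data[:4] == BOM_UTF32_BE:
        return 'utf-32-be'
    elif data[:2] == BOM_UTF16_LE:
        return 'utf-16-le'
    elif data[:2] == BOM_UTF16_BE:
        return 'utf-16-be'
    elif data[:3] == BOM_UTF8:
        return 'utf-8'
    else:
        return None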

### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
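
# Illustrative sketch (not part of this module's API): the smallest possible
# Codec subclass, an identity transformation for byte strings.  It shows the
# (output object, length consumed) tuple contract and satisfies the zero
# length input requirement described above.  The class name is hypothetical.
class _ExampleIdentityCodec(Codec):

    def encode(self, input, errors='strict'):
        # Nothing to transform; report that all input was consumed.
        return (input, len(input))

    def decode(self, input, errors='strict'):
        return (input, len(input))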

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                object, decodedbytes = decode(data, self.errors)
            except ValueError, why:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return object

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
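
# Illustrative sketch (not part of this module): encoding submodules such as
# encodings/utf_8.py typically combine a concrete Codec with the stream
# classes above through multiple inheritance, so the stream wrappers pick up
# .encode()/.decode() automatically.  utf_8_encode/utf_8_decode come from the
# builtin _codecs module imported above; the class names are hypothetical.
class _ExampleUTF8Codec(Codec):

    def encode(self, input, errors='strict'):
        return utf_8_encode(input, errors)

    def decode(self, input, errors='strict'):
        return utf_8_decode(input, errors)

class _ExampleUTF8StreamWriter(_ExampleUTF8Codec, StreamWriter):
    pass

class _ExampleUTF8StreamReader(_ExampleUTF8Codec, StreamReader):
    pass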

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
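
# Illustrative sketch (not part of this module): wiring a raw byte stream
# into a StreamReaderWriter by hand, using the factory functions returned by
# lookup().  This is essentially what the open() helper further below does;
# the helper name and default arguments are only examples.
def _example_wrap_stream(stream, encoding='utf-8', errors='strict'):
    (e, d, sr, sw) = lookup(encoding)
    return StreamReaderWriter(stream, sr, sw, errors)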
378 """ StreamRecoder instances provide a frontend - backend
379 view of encoding data.
381 They use the complete set of APIs returned by the
382 codecs.lookup() function to implement their task.
384 Data written to the stream is first decoded into an
385 intermediate format (which is dependent on the given codec
386 combination) and then written to the stream using an instance
387 of the provided Writer class.
389 In the other direction, data is read from the stream using a
390 Reader instance and then return encoded data to the caller.
393 # Optional attributes set by the file wrappers below
394 data_encoding
= 'unknown'
395 file_encoding
= 'unknown'
397 def __init__(self
, stream
, encode
, decode
, Reader
, Writer
,
400 """ Creates a StreamRecoder instance which implements a two-way
401 conversion: encode and decode work on the frontend (the
402 input to .read() and output of .write()) while
403 Reader and Writer work on the backend (reading and
404 writing to the stream).
406 You can use these objects to do transparent direct
407 recodings from e.g. latin-1 to utf-8 and back.
409 stream must be a file-like object.
411 encode, decode must adhere to the Codec interface, Reader,
412 Writer must be factory functions or classes providing the
413 StreamReader, StreamWriter interface resp.
415 encode and decode are needed for the frontend translation,
416 Reader and Writer for the backend translation. Unicode is
417 used as intermediate encoding.
419 Error handling is done in the same way as defined for the
420 StreamWriter/Readers.
426 self
.reader
= Reader(stream
, errors
)
427 self
.writer
= Writer(stream
, errors
)
430 def read(self
, size
=-1):
432 data
= self
.reader
.read(size
)
433 data
, bytesencoded
= self
.encode(data
, self
.errors
)
436 def readline(self
, size
=None):
439 data
= self
.reader
.readline()
441 data
= self
.reader
.readline(size
)
442 data
, bytesencoded
= self
.encode(data
, self
.errors
)
445 def readlines(self
, sizehint
=None):
448 data
= self
.reader
.read()
450 data
= self
.reader
.read(sizehint
)
451 data
, bytesencoded
= self
.encode(data
, self
.errors
)
452 return data
.splitlines(1)
454 def write(self
, data
):
456 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
457 return self
.writer
.write(data
)
459 def writelines(self
, list):
462 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
463 return self
.writer
.write(data
)
470 def __getattr__(self
, name
,
473 """ Inherit all other methods from the underlying stream.
475 return getattr(self
.stream
, name
)
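
# Illustrative sketch (not part of this module): recoding an in-memory
# Latin-1 "file" so that the caller reads UTF-8.  The EncodedFile() helper
# below packages exactly this wiring; StringIO is used here only to keep the
# example self-contained, and the helper name is hypothetical.
def _example_recode_in_memory(latin1_bytes):
    import StringIO
    encode, decode = lookup('utf-8')[:2]     # frontend: what the caller sees
    Reader, Writer = lookup('latin-1')[2:]   # backend: what is on the stream
    stream = StringIO.StringIO(latin1_bytes)
    recoder = StreamRecoder(stream, encode, decode, Reader, Writer)
    return recoder.read()                    # UTF-8 encoded byte string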

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb', meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    (e, d, sr, sw) = lookup(encoding)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
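
# Illustrative sketch (not part of this module): a round trip through the
# open() helper above.  The file name and text are only examples; the
# returned wrapper accepts and produces Unicode objects.
def _example_open_roundtrip(path='example.txt'):
    f = open(path, 'wb', encoding='utf-8')
    f.write(u'Marc-Andr\xe9')      # written to disk as UTF-8 bytes
    f.close()
    f = open(path, 'rb', encoding='utf-8')
    text = f.read()                # read back as a Unicode object
    f.close()
    return text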

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as strings using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as strings using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]
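
# Illustrative sketch (not part of this module): the lookup helpers above in
# action.  Encoder and decoder functions return the usual (output, length
# consumed) tuple; the helper name and sample text are only examples.
def _example_lookup_helpers():
    encode = getencoder('utf-8')
    data, consumed = encode(u'Marc-Andr\xe9')   # UTF-8 bytes, chars consumed
    decode = getdecoder('utf-8')
    text, consumed = decode(data)               # back to a Unicode object
    return text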

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if not m.has_key(v):
            m[v] = k
        else:
            m[v] = None
    return m
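
# Illustrative sketch (not part of this module): a tiny decoding map in which
# two byte values decode to the same character, so the reverse mapping marks
# that character as undefined (None), exactly as described above.
def _example_encoding_map():
    decoding_map = make_identity_dict(range(3))
    decoding_map[3] = u'\u001a'
    decoding_map[4] = u'\u001a'     # duplicate target
    encoding_map = make_encoding_map(decoding_map)
    # encoding_map[u'\u001a'] is now None; 0, 1 and 2 map to themselves.
    return encoding_map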

strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')