Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import __builtin__, sys
  11
  12 ### Registry and builtin stateless codec functions
  13
  14 try:
  15     from _codecs import *
  16 except ImportError, why:
  17     raise SystemError,\
  18           'Failed to load the builtin codecs: %s' % why
  19
  20 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
  21            "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
  22            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
  23            "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
  24            "strict_errors", "ignore_errors", "replace_errors",
  25            "xmlcharrefreplace_errors",
  26            "register_error", "lookup_error"]
  27
  28 ### Constants
  29
  30 #
  31 # Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
  32 # and its possible byte string values
  33 # for UTF8/UTF16/UTF32 output and little/big endian machines
  34 #
  35
  36 # UTF-8
  37 BOM_UTF8 = '\xef\xbb\xbf'
  38
  39 # UTF-16, little endian
  40 BOM_LE = BOM_UTF16_LE = '\xff\xfe'
  41
  42 # UTF-16, big endian
  43 BOM_BE = BOM_UTF16_BE = '\xfe\xff'
  44
  45 # UTF-32, little endian
  46 BOM_UTF32_LE = '\xff\xfe\x00\x00'
  47
  48 # UTF-32, big endian
  49 BOM_UTF32_BE = '\x00\x00\xfe\xff'
  50
  51 if sys.byteorder == 'little':
  52
  53     # UTF-16, native endianness
  54     BOM = BOM_UTF16 = BOM_UTF16_LE
  55
  56     # UTF-32, native endianness
  57     BOM_UTF32 = BOM_UTF32_LE
  58
  59 else:
  60
  61     # UTF-16, native endianness
  62     BOM = BOM_UTF16 = BOM_UTF16_BE
  63
  64     # UTF-32, native endianness
  65     BOM_UTF32 = BOM_UTF32_BE
  66
  67 # Old broken names (don't use in new code)
  68 BOM32_LE = BOM_UTF16_LE
  69 BOM32_BE = BOM_UTF16_BE
  70 BOM64_LE = BOM_UTF32_LE
  71 BOM64_BE = BOM_UTF32_BE
  72
  73
  74 ### Codec base classes (defining the API)
  75
  76 class Codec:
  77
  78     """ Defines the interface for stateless encoders/decoders.
  79
  80         The .encode()/.decode() methods may use different error
  81         handling schemes by providing the errors argument. These
  82         string values are predefined:
  83
  84          'strict' - raise a ValueError error (or a subclass)
  85          'ignore' - ignore the character and continue with the next
  86          'replace' - replace with a suitable replacement character;
  87                     Python will use the official U+FFFD REPLACEMENT
  88                     CHARACTER for the builtin Unicode codecs on
  89                     decoding and '?' on encoding.
  90          'xmlcharrefreplace' - Replace with the appropriate XML
  91                                character reference (only for encoding).
  92          'backslashreplace'  - Replace with backslashed escape sequences
  93                                (only for encoding).
  94
  95         The set of allowed values can be extended via register_error.
  96
  97     """
  98     def encode(self, input, errors='strict'):
  99
 100         """ Encodes the object input and returns a tuple (output
 101             object, length consumed).
 102
 103             errors defines the error handling to apply. It defaults to
 104             'strict' handling.
 105
 106             The method may not store state in the Codec instance. Use
 107             StreamCodec for codecs which have to keep state in order to
 108             make encoding/decoding efficient.
 109
 110             The encoder must be able to handle zero length input and
 111             return an empty object of the output object type in this
 112             situation.
 113
 114         """
 115         raise NotImplementedError
 116
 117     def decode(self, input, errors='strict'):
 118
 119         """ Decodes the object input and returns a tuple (output
 120             object, length consumed).
 121
 122             input must be an object which provides the bf_getreadbuf
 123             buffer slot. Python strings, buffer objects and memory
 124             mapped files are examples of objects providing this slot.
 125
 126             errors defines the error handling to apply. It defaults to
 127             'strict' handling.
 128
 129             The method may not store state in the Codec instance. Use
 130             StreamCodec for codecs which have to keep state in order to
 131             make encoding/decoding efficient.
 132
 133             The decoder must be able to handle zero length input and
 134             return an empty object of the output object type in this
 135             situation.
 136
 137         """
 138         raise NotImplementedError
 139
 140 #
# The StreamWriter and StreamReader classes provide generic working
 142 # interfaces which can be used to implement new encoding submodules
 143 # very easily. See encodings/utf_8.py for an example on how this is
 144 # done.
 145 #
 146
 147 class StreamWriter(Codec):
 148
 149     def __init__(self, stream, errors='strict'):
 150
 151         """ Creates a StreamWriter instance.
 152
 153             stream must be a file-like object open for writing
 154             (binary) data.
 155
 156             The StreamWriter may use different error handling
 157             schemes by providing the errors keyword argument. These
 158             parameters are predefined:
 159
 160              'strict' - raise a ValueError (or a subclass)
 161              'ignore' - ignore the character and continue with the next
 162              'replace'- replace with a suitable replacement character
 163              'xmlcharrefreplace' - Replace with the appropriate XML
 164                                    character reference.
 165              'backslashreplace'  - Replace with backslashed escape
 166                                    sequences (only for encoding).
 167
 168             The set of allowed parameter values can be extended via
 169             register_error.
 170         """
 171         self.stream = stream
 172         self.errors = errors
 173
 174     def write(self, object):
 175
 176         """ Writes the object's contents encoded to self.stream.
 177         """
 178         data, consumed = self.encode(object, self.errors)
 179         self.stream.write(data)
 180
 181     def writelines(self, list):
 182
 183         """ Writes the concatenated list of strings to the stream
 184             using .write().
 185         """
 186         self.write(''.join(list))
 187
 188     def reset(self):
 189
 190         """ Flushes and resets the codec buffers used for keeping state.
 191
 192             Calling this method should ensure that the data on the
 193             output is put into a clean state, that allows appending
 194             of new fresh data without having to rescan the whole
 195             stream to recover state.
 196
 197         """
 198         pass
 199
 200     def __getattr__(self, name,
 201                     getattr=getattr):
 202
 203         """ Inherit all other methods from the underlying stream.
 204         """
 205         return getattr(self.stream, name)
 206
 207 ###
 208
 209 class StreamReader(Codec):
 210
 211     def __init__(self, stream, errors='strict'):
 212
 213         """ Creates a StreamReader instance.
 214
 215             stream must be a file-like object open for reading
 216             (binary) data.
 217
 218             The StreamReader may use different error handling
 219             schemes by providing the errors keyword argument. These
 220             parameters are predefined:
 221
 222              'strict' - raise a ValueError (or a subclass)
 223              'ignore' - ignore the character and continue with the next
 224              'replace'- replace with a suitable replacement character;
 225
 226             The set of allowed parameter values can be extended via
 227             register_error.
 228         """
 229         self.stream = stream
 230         self.errors = errors
 231
 232     def read(self, size=-1):
 233
 234         """ Decodes data from the stream self.stream and returns the
 235             resulting object.
 236
 237             size indicates the approximate maximum number of bytes to
 238             read from the stream for decoding purposes. The decoder
 239             can modify this setting as appropriate. The default value
 240             -1 indicates to read and decode as much as possible.  size
 241             is intended to prevent having to decode huge files in one
 242             step.
 243
 244             The method should use a greedy read strategy meaning that
 245             it should read as much data as is allowed within the
 246             definition of the encoding and the given size, e.g.  if
 247             optional encoding endings or state markers are available
 248             on the stream, these should be read too.
 249
 250         """
 251         # Unsliced reading:
 252         if size < 0:
 253             return self.decode(self.stream.read(), self.errors)[0]
 254
 255         # Sliced reading:
 256         read = self.stream.read
 257         decode = self.decode
 258         data = read(size)
 259         i = 0
 260         while 1:
 261             try:
 262                 object, decodedbytes = decode(data, self.errors)
 263             except ValueError, why:
 264                 # This method is slow but should work under pretty much
 265                 # all conditions; at most 10 tries are made
 266                 i = i + 1
 267                 newdata = read(1)
 268                 if not newdata or i > 10:
 269                     raise
 270                 data = data + newdata
 271             else:
 272                 return object
 273
 274     def readline(self, size=None):
 275
 276         """ Read one line from the input stream and return the
 277             decoded data.
 278
 279             Note: Unlike the .readlines() method, this method inherits
 280             the line breaking knowledge from the underlying stream's
 281             .readline() method -- there is currently no support for
 282             line breaking using the codec decoder due to lack of line
 283             buffering. Subclasses should however, if possible, try to
 284             implement this method using their own knowledge of line
 285             breaking.
 286
 287             size, if given, is passed as size argument to the stream's
 288             .readline() method.
 289
 290         """
 291         if size is None:
 292             line = self.stream.readline()
 293         else:
 294             line = self.stream.readline(size)
 295         return self.decode(line, self.errors)[0]
 296
 297
 298     def readlines(self, sizehint=None):
 299
 300         """ Read all lines available on the input stream
 301             and return them as list of lines.
 302
 303             Line breaks are implemented using the codec's decoder
 304             method and are included in the list entries.
 305
 306             sizehint, if given, is passed as size argument to the
 307             stream's .read() method.
 308
 309         """
 310         if sizehint is None:
 311             data = self.stream.read()
 312         else:
 313             data = self.stream.read(sizehint)
 314         return self.decode(data, self.errors)[0].splitlines(1)
 315
 316     def reset(self):
 317
 318         """ Resets the codec buffers used for keeping state.
 319
 320             Note that no stream repositioning should take place.
 321             This method is primarily intended to be able to recover
 322             from decoding errors.
 323
 324         """
 325         pass
 326
 327     def next(self):
 328
 329         """ Return the next decoded line from the input stream."""
 330         line = self.readline()
 331         if line:
 332             return line
 333         raise StopIteration
 334
 335     def __iter__(self):
 336         return self
 337
 338     def __getattr__(self, name,
 339                     getattr=getattr):
 340
 341         """ Inherit all other methods from the underlying stream.
 342         """
 343         return getattr(self.stream, name)
 344
 345 ###
 346
 347 class StreamReaderWriter:
 348
 349     """ StreamReaderWriter instances allow wrapping streams which
 350         work in both read and write modes.
 351
 352         The design is such that one can use the factory functions
 353         returned by the codec.lookup() function to construct the
 354         instance.
 355
 356     """
 357     # Optional attributes set by the file wrappers below
 358     encoding = 'unknown'
 359
 360     def __init__(self, stream, Reader, Writer, errors='strict'):
 361
 362         """ Creates a StreamReaderWriter instance.
 363
 364             stream must be a Stream-like object.
 365
 366             Reader, Writer must be factory functions or classes
 367             providing the StreamReader, StreamWriter interface resp.
 368
 369             Error handling is done in the same way as defined for the
 370             StreamWriter/Readers.
 371
 372         """
 373         self.stream = stream
 374         self.reader = Reader(stream, errors)
 375         self.writer = Writer(stream, errors)
 376         self.errors = errors
 377
 378     def read(self, size=-1):
 379
 380         return self.reader.read(size)
 381
 382     def readline(self, size=None):
 383
 384         return self.reader.readline(size)
 385
 386     def readlines(self, sizehint=None):
 387
 388         return self.reader.readlines(sizehint)
 389
 390     def next(self):
 391
 392         """ Return the next decoded line from the input stream."""
 393         return self.reader.next()
 394
 395     def __iter__(self):
 396         return self
 397
 398     def write(self, data):
 399
 400         return self.writer.write(data)
 401
 402     def writelines(self, list):
 403
 404         return self.writer.writelines(list)
 405
 406     def reset(self):
 407
 408         self.reader.reset()
 409         self.writer.reset()
 410
 411     def __getattr__(self, name,
 412                     getattr=getattr):
 413
 414         """ Inherit all other methods from the underlying stream.
 415         """
 416         return getattr(self.stream, name)
 417
 418 ###
 419
 420 class StreamRecoder:
 421
 422     """ StreamRecoder instances provide a frontend - backend
 423         view of encoding data.
 424
 425         They use the complete set of APIs returned by the
 426         codecs.lookup() function to implement their task.
 427
 428         Data written to the stream is first decoded into an
 429         intermediate format (which is dependent on the given codec
 430         combination) and then written to the stream using an instance
 431         of the provided Writer class.
 432
 433         In the other direction, data is read from the stream using a
 434         Reader instance and then return encoded data to the caller.
 435
 436     """
 437     # Optional attributes set by the file wrappers below
 438     data_encoding = 'unknown'
 439     file_encoding = 'unknown'
 440
 441     def __init__(self, stream, encode, decode, Reader, Writer,
 442                  errors='strict'):
 443
 444         """ Creates a StreamRecoder instance which implements a two-way
 445             conversion: encode and decode work on the frontend (the
 446             input to .read() and output of .write()) while
 447             Reader and Writer work on the backend (reading and
 448             writing to the stream).
 449
 450             You can use these objects to do transparent direct
 451             recodings from e.g. latin-1 to utf-8 and back.
 452
 453             stream must be a file-like object.
 454
 455             encode, decode must adhere to the Codec interface, Reader,
 456             Writer must be factory functions or classes providing the
 457             StreamReader, StreamWriter interface resp.
 458
 459             encode and decode are needed for the frontend translation,
 460             Reader and Writer for the backend translation. Unicode is
 461             used as intermediate encoding.
 462
 463             Error handling is done in the same way as defined for the
 464             StreamWriter/Readers.
 465
 466         """
 467         self.stream = stream
 468         self.encode = encode
 469         self.decode = decode
 470         self.reader = Reader(stream, errors)
 471         self.writer = Writer(stream, errors)
 472         self.errors = errors
 473
 474     def read(self, size=-1):
 475
 476         data = self.reader.read(size)
 477         data, bytesencoded = self.encode(data, self.errors)
 478         return data
 479
 480     def readline(self, size=None):
 481
 482         if size is None:
 483             data = self.reader.readline()
 484         else:
 485             data = self.reader.readline(size)
 486         data, bytesencoded = self.encode(data, self.errors)
 487         return data
 488
 489     def readlines(self, sizehint=None):
 490
 491         if sizehint is None:
 492             data = self.reader.read()
 493         else:
 494             data = self.reader.read(sizehint)
 495         data, bytesencoded = self.encode(data, self.errors)
 496         return data.splitlines(1)
 497
 498     def next(self):
 499
 500         """ Return the next decoded line from the input stream."""
 501         return self.reader.next()
 502
 503     def __iter__(self):
 504         return self
 505
 506     def write(self, data):
 507
 508         data, bytesdecoded = self.decode(data, self.errors)
 509         return self.writer.write(data)
 510
 511     def writelines(self, list):
 512
 513         data = ''.join(list)
 514         data, bytesdecoded = self.decode(data, self.errors)
 515         return self.writer.write(data)
 516
 517     def reset(self):
 518
 519         self.reader.reset()
 520         self.writer.reset()
 521
 522     def __getattr__(self, name,
 523                     getattr=getattr):
 524
 525         """ Inherit all other methods from the underlying stream.
 526         """
 527         return getattr(self.stream, name)
 528
 529 ### Shortcuts
 530
 531 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 532
 533     """ Open an encoded file using the given mode and return
 534         a wrapped version providing transparent encoding/decoding.
 535
 536         Note: The wrapped version will only accept the object format
 537         defined by the codecs, i.e. Unicode objects for most builtin
 538         codecs. Output is also codec dependent and will usually by
 539         Unicode as well.
 540
 541         Files are always opened in binary mode, even if no binary mode
 542         was specified. This is done to avoid data loss due to encodings
 543         using 8-bit values. The default file mode is 'rb' meaning to
 544         open the file in binary read mode.
 545
 546         encoding specifies the encoding which is to be used for the
 547         file.
 548
 549         errors may be given to define the error handling. It defaults
 550         to 'strict' which causes ValueErrors to be raised in case an
 551         encoding error occurs.
 552
 553         buffering has the same meaning as for the builtin open() API.
 554         It defaults to line buffered.
 555
 556         The returned wrapped file object provides an extra attribute
 557         .encoding which allows querying the used encoding. This
 558         attribute is only available if an encoding was specified as
 559         parameter.
 560
 561     """
 562     if encoding is not None and \
 563        'b' not in mode:
 564         # Force opening of the file in binary mode
 565         mode = mode + 'b'
 566     file = __builtin__.open(filename, mode, buffering)
 567     if encoding is None:
 568         return file
 569     (e, d, sr, sw) = lookup(encoding)
 570     srw = StreamReaderWriter(file, sr, sw, errors)
 571     # Add attributes to simplify introspection
 572     srw.encoding = encoding
 573     return srw
 574
 575 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 576
 577     """ Return a wrapped version of file which provides transparent
 578         encoding translation.
 579
 580         Strings written to the wrapped file are interpreted according
 581         to the given data_encoding and then written to the original
 582         file as string using file_encoding. The intermediate encoding
 583         will usually be Unicode but depends on the specified codecs.
 584
 585         Strings are read from the file using file_encoding and then
 586         passed back to the caller as string using data_encoding.
 587
 588         If file_encoding is not given, it defaults to data_encoding.
 589
 590         errors may be given to define the error handling. It defaults
 591         to 'strict' which causes ValueErrors to be raised in case an
 592         encoding error occurs.
 593
 594         The returned wrapped file object provides two extra attributes
 595         .data_encoding and .file_encoding which reflect the given
 596         parameters of the same name. The attributes can be used for
 597         introspection by Python programs.
 598
 599     """
 600     if file_encoding is None:
 601         file_encoding = data_encoding
 602     encode, decode = lookup(data_encoding)[:2]
 603     Reader, Writer = lookup(file_encoding)[2:]
 604     sr = StreamRecoder(file,
 605                        encode, decode, Reader, Writer,
 606                        errors)
 607     # Add attributes to simplify introspection
 608     sr.data_encoding = data_encoding
 609     sr.file_encoding = file_encoding
 610     return sr
 611
 612 ### Helpers for codec lookup
 613
 614 def getencoder(encoding):
 615
 616     """ Lookup up the codec for the given encoding and return
 617         its encoder function.
 618
 619         Raises a LookupError in case the encoding cannot be found.
 620
 621     """
 622     return lookup(encoding)[0]
 623
 624 def getdecoder(encoding):
 625
 626     """ Lookup up the codec for the given encoding and return
 627         its decoder function.
 628
 629         Raises a LookupError in case the encoding cannot be found.
 630
 631     """
 632     return lookup(encoding)[1]
 633
 634 def getreader(encoding):
 635
 636     """ Lookup up the codec for the given encoding and return
 637         its StreamReader class or factory function.
 638
 639         Raises a LookupError in case the encoding cannot be found.
 640
 641     """
 642     return lookup(encoding)[2]
 643
 644 def getwriter(encoding):
 645
 646     """ Lookup up the codec for the given encoding and return
 647         its StreamWriter class or factory function.
 648
 649         Raises a LookupError in case the encoding cannot be found.
 650
 651     """
 652     return lookup(encoding)[3]
 653
 654 ### Helpers for charmap-based codecs
 655
 656 def make_identity_dict(rng):
 657
 658     """ make_identity_dict(rng) -> dict
 659
 660         Return a dictionary where elements of the rng sequence are
 661         mapped to themselves.
 662
 663     """
 664     res = {}
 665     for i in rng:
 666         res[i]=i
 667     return res
 668
 669 def make_encoding_map(decoding_map):
 670
 671     """ Creates an encoding map from a decoding map.
 672
 673         If a target mapping in the decoding map occurs multiple
 674         times, then that target is mapped to None (undefined mapping),
 675         causing an exception when encountered by the charmap codec
 676         during translation.
 677
 678         One example where this happens is cp875.py which decodes
 679         multiple character to \u001a.
 680
 681     """
 682     m = {}
 683     for k,v in decoding_map.items():
 684         if not v in m:
 685             m[v] = k
 686         else:
 687             m[v] = None
 688     return m
 689
 690 ### error handlers
 691
# Look up the error handlers registered under the predefined scheme
# names and bind them to module level names
strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")
 697
 698 # Tell modulefinder that using codecs probably needs the encodings
 699 # package
 700 _false = 0
 701 if _false:
 702     import encodings
 703
 704 ### Tests
 705
if __name__ == '__main__':

    # Wrap stdout: strings written to it are interpreted as Latin-1
    # and written to the original stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Wrap stdin: strings are read from the original stdin as Latin-1
    # and passed back to the caller as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')