Lib/codecs.py

   1 """ codecs -- Python Codec Registry, API and helpers.
   2
   3
   4 Written by Marc-Andre Lemburg (mal@lemburg.com).
   5
   6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
   7
   8 """#"
   9
  10 import struct, __builtin__
  11
  12 ### Registry and builtin stateless codec functions
  13
  14 try:
  15     from _codecs import *
  16 except ImportError, why:
  17     raise SystemError,\
  18           'Failed to load the builtin codecs: %s' % why
  19
# Names exported by "from codecs import *". Only the registry functions,
# the two file helpers and the BOM constants are listed here.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE"]
  22
  23 ### Constants
  24
  25 #
  26 # Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
  27 #
  28 BOM = struct.pack('=H', 0xFEFF)
  29 #
  30 BOM_BE = BOM32_BE = '\376\377'
  31 #       corresponds to Unicode U+FEFF in UTF-16 on big endian
  32 #       platforms == ZERO WIDTH NO-BREAK SPACE
  33 BOM_LE = BOM32_LE = '\377\376'
  34 #       corresponds to Unicode U+FFFE in UTF-16 on little endian
  35 #       platforms == defined as being an illegal Unicode character
  36
  37 #
  38 # 64-bit Byte Order Marks
  39 #
  40 BOM64_BE = '\000\000\376\377'
  41 #       corresponds to Unicode U+0000FEFF in UCS-4
  42 BOM64_LE = '\377\376\000\000'
  43 #       corresponds to Unicode U+0000FFFE in UCS-4
  44
  45
  46 ### Codec base classes (defining the API)
  47
  48 class Codec:
  49
  50     """ Defines the interface for stateless encoders/decoders.
  51
  52         The .encode()/.decode() methods may implement different error
  53         handling schemes by providing the errors argument. These
  54         string values are defined:
  55
  56          'strict' - raise a ValueError error (or a subclass)
  57          'ignore' - ignore the character and continue with the next
  58          'replace' - replace with a suitable replacement character;
  59                     Python will use the official U+FFFD REPLACEMENT
  60                     CHARACTER for the builtin Unicode codecs.
  61
  62     """
  63     def encode(self, input, errors='strict'):
  64
  65         """ Encodes the object input and returns a tuple (output
  66             object, length consumed).
  67
  68             errors defines the error handling to apply. It defaults to
  69             'strict' handling.
  70
  71             The method may not store state in the Codec instance. Use
  72             StreamCodec for codecs which have to keep state in order to
  73             make encoding/decoding efficient.
  74
  75             The encoder must be able to handle zero length input and
  76             return an empty object of the output object type in this
  77             situation.
  78
  79         """
  80         raise NotImplementedError
  81
  82     def decode(self, input, errors='strict'):
  83
  84         """ Decodes the object input and returns a tuple (output
  85             object, length consumed).
  86
  87             input must be an object which provides the bf_getreadbuf
  88             buffer slot. Python strings, buffer objects and memory
  89             mapped files are examples of objects providing this slot.
  90
  91             errors defines the error handling to apply. It defaults to
  92             'strict' handling.
  93
  94             The method may not store state in the Codec instance. Use
  95             StreamCodec for codecs which have to keep state in order to
  96             make encoding/decoding efficient.
  97
  98             The decoder must be able to handle zero length input and
  99             return an empty object of the output object type in this
 100             situation.
 101
 102         """
 103         raise NotImplementedError
 104
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
 111
 112 class StreamWriter(Codec):
 113
 114     def __init__(self, stream, errors='strict'):
 115
 116         """ Creates a StreamWriter instance.
 117
 118             stream must be a file-like object open for writing
 119             (binary) data.
 120
 121             The StreamWriter may implement different error handling
 122             schemes by providing the errors keyword argument. These
 123             parameters are defined:
 124
 125              'strict' - raise a ValueError (or a subclass)
 126              'ignore' - ignore the character and continue with the next
 127              'replace'- replace with a suitable replacement character
 128
 129         """
 130         self.stream = stream
 131         self.errors = errors
 132
 133     def write(self, object):
 134
 135         """ Writes the object's contents encoded to self.stream.
 136         """
 137         data, consumed = self.encode(object, self.errors)
 138         self.stream.write(data)
 139
 140     def writelines(self, list):
 141
 142         """ Writes the concatenated list of strings to the stream
 143             using .write().
 144         """
 145         self.write(''.join(list))
 146
 147     def reset(self):
 148
 149         """ Flushes and resets the codec buffers used for keeping state.
 150
 151             Calling this method should ensure that the data on the
 152             output is put into a clean state, that allows appending
 153             of new fresh data without having to rescan the whole
 154             stream to recover state.
 155
 156         """
 157         pass
 158
 159     def __getattr__(self, name,
 160                     getattr=getattr):
 161
 162         """ Inherit all other methods from the underlying stream.
 163         """
 164         return getattr(self.stream, name)
 165
 166 ###
 167
 168 class StreamReader(Codec):
 169
 170     def __init__(self, stream, errors='strict'):
 171
 172         """ Creates a StreamReader instance.
 173
 174             stream must be a file-like object open for reading
 175             (binary) data.
 176
 177             The StreamReader may implement different error handling
 178             schemes by providing the errors keyword argument. These
 179             parameters are defined:
 180
 181              'strict' - raise a ValueError (or a subclass)
 182              'ignore' - ignore the character and continue with the next
 183              'replace'- replace with a suitable replacement character;
 184
 185         """
 186         self.stream = stream
 187         self.errors = errors
 188
 189     def read(self, size=-1):
 190
 191         """ Decodes data from the stream self.stream and returns the
 192             resulting object.
 193
 194             size indicates the approximate maximum number of bytes to
 195             read from the stream for decoding purposes. The decoder
 196             can modify this setting as appropriate. The default value
 197             -1 indicates to read and decode as much as possible.  size
 198             is intended to prevent having to decode huge files in one
 199             step.
 200
 201             The method should use a greedy read strategy meaning that
 202             it should read as much data as is allowed within the
 203             definition of the encoding and the given size, e.g.  if
 204             optional encoding endings or state markers are available
 205             on the stream, these should be read too.
 206
 207         """
 208         # Unsliced reading:
 209         if size < 0:
 210             return self.decode(self.stream.read(), self.errors)[0]
 211
 212         # Sliced reading:
 213         read = self.stream.read
 214         decode = self.decode
 215         data = read(size)
 216         i = 0
 217         while 1:
 218             try:
 219                 object, decodedbytes = decode(data, self.errors)
 220             except ValueError, why:
 221                 # This method is slow but should work under pretty much
 222                 # all conditions; at most 10 tries are made
 223                 i = i + 1
 224                 newdata = read(1)
 225                 if not newdata or i > 10:
 226                     raise
 227                 data = data + newdata
 228             else:
 229                 return object
 230
 231     def readline(self, size=None):
 232
 233         """ Read one line from the input stream and return the
 234             decoded data.
 235
 236             Note: Unlike the .readlines() method, this method inherits
 237             the line breaking knowledge from the underlying stream's
 238             .readline() method -- there is currently no support for
 239             line breaking using the codec decoder due to lack of line
 240             buffering. Sublcasses should however, if possible, try to
 241             implement this method using their own knowledge of line
 242             breaking.
 243
 244             size, if given, is passed as size argument to the stream's
 245             .readline() method.
 246
 247         """
 248         if size is None:
 249             line = self.stream.readline()
 250         else:
 251             line = self.stream.readline(size)
 252         return self.decode(line, self.errors)[0]
 253
 254
 255     def readlines(self, sizehint=0):
 256
 257         """ Read all lines available on the input stream
 258             and return them as list of lines.
 259
 260             Line breaks are implemented using the codec's decoder
 261             method and are included in the list entries.
 262
 263             sizehint, if given, is passed as size argument to the
 264             stream's .read() method.
 265
 266         """
 267         if sizehint is None:
 268             data = self.stream.read()
 269         else:
 270             data = self.stream.read(sizehint)
 271         return self.decode(data, self.errors)[0].splitlines(1)
 272
 273     def reset(self):
 274
 275         """ Resets the codec buffers used for keeping state.
 276
 277             Note that no stream repositioning should take place.
 278             This method is primarily intended to be able to recover
 279             from decoding errors.
 280
 281         """
 282         pass
 283
 284     def __getattr__(self, name,
 285                     getattr=getattr):
 286
 287         """ Inherit all other methods from the underlying stream.
 288         """
 289         return getattr(self.stream, name)
 290
 291 ###
 292
 293 class StreamReaderWriter:
 294
 295     """ StreamReaderWriter instances allow wrapping streams which
 296         work in both read and write modes.
 297
 298         The design is such that one can use the factory functions
 299         returned by the codec.lookup() function to construct the
 300         instance.
 301
 302     """
 303     # Optional attributes set by the file wrappers below
 304     encoding = 'unknown'
 305
 306     def __init__(self, stream, Reader, Writer, errors='strict'):
 307
 308         """ Creates a StreamReaderWriter instance.
 309
 310             stream must be a Stream-like object.
 311
 312             Reader, Writer must be factory functions or classes
 313             providing the StreamReader, StreamWriter interface resp.
 314
 315             Error handling is done in the same way as defined for the
 316             StreamWriter/Readers.
 317
 318         """
 319         self.stream = stream
 320         self.reader = Reader(stream, errors)
 321         self.writer = Writer(stream, errors)
 322         self.errors = errors
 323
 324     def read(self, size=-1):
 325
 326         return self.reader.read(size)
 327
 328     def readline(self, size=None):
 329
 330         return self.reader.readline(size)
 331
 332     def readlines(self, sizehint=None):
 333
 334         return self.reader.readlines(sizehint)
 335
 336     def write(self, data):
 337
 338         return self.writer.write(data)
 339
 340     def writelines(self, list):
 341
 342         return self.writer.writelines(list)
 343
 344     def reset(self):
 345
 346         self.reader.reset()
 347         self.writer.reset()
 348
 349     def __getattr__(self, name,
 350                     getattr=getattr):
 351
 352         """ Inherit all other methods from the underlying stream.
 353         """
 354         return getattr(self.stream, name)
 355
 356 ###
 357
 358 class StreamRecoder:
 359
 360     """ StreamRecoder instances provide a frontend - backend
 361         view of encoding data.
 362
 363         They use the complete set of APIs returned by the
 364         codecs.lookup() function to implement their task.
 365
 366         Data written to the stream is first decoded into an
 367         intermediate format (which is dependent on the given codec
 368         combination) and then written to the stream using an instance
 369         of the provided Writer class.
 370
 371         In the other direction, data is read from the stream using a
 372         Reader instance and then return encoded data to the caller.
 373
 374     """
 375     # Optional attributes set by the file wrappers below
 376     data_encoding = 'unknown'
 377     file_encoding = 'unknown'
 378
 379     def __init__(self, stream, encode, decode, Reader, Writer,
 380                  errors='strict'):
 381
 382         """ Creates a StreamRecoder instance which implements a two-way
 383             conversion: encode and decode work on the frontend (the
 384             input to .read() and output of .write()) while
 385             Reader and Writer work on the backend (reading and
 386             writing to the stream).
 387
 388             You can use these objects to do transparent direct
 389             recodings from e.g. latin-1 to utf-8 and back.
 390
 391             stream must be a file-like object.
 392
 393             encode, decode must adhere to the Codec interface, Reader,
 394             Writer must be factory functions or classes providing the
 395             StreamReader, StreamWriter interface resp.
 396
 397             encode and decode are needed for the frontend translation,
 398             Reader and Writer for the backend translation. Unicode is
 399             used as intermediate encoding.
 400
 401             Error handling is done in the same way as defined for the
 402             StreamWriter/Readers.
 403
 404         """
 405         self.stream = stream
 406         self.encode = encode
 407         self.decode = decode
 408         self.reader = Reader(stream, errors)
 409         self.writer = Writer(stream, errors)
 410         self.errors = errors
 411
 412     def read(self, size=-1):
 413
 414         data = self.reader.read(size)
 415         data, bytesencoded = self.encode(data, self.errors)
 416         return data
 417
 418     def readline(self, size=None):
 419
 420         if size is None:
 421             data = self.reader.readline()
 422         else:
 423             data = self.reader.readline(size)
 424         data, bytesencoded = self.encode(data, self.errors)
 425         return data
 426
 427     def readlines(self, sizehint=None):
 428
 429         if sizehint is None:
 430             data = self.reader.read()
 431         else:
 432             data = self.reader.read(sizehint)
 433         data, bytesencoded = self.encode(data, self.errors)
 434         return data.splitlines(1)
 435
 436     def write(self, data):
 437
 438         data, bytesdecoded = self.decode(data, self.errors)
 439         return self.writer.write(data)
 440
 441     def writelines(self, list):
 442
 443         data = ''.join(list)
 444         data, bytesdecoded = self.decode(data, self.errors)
 445         return self.writer.write(data)
 446
 447     def reset(self):
 448
 449         self.reader.reset()
 450         self.writer.reset()
 451
 452     def __getattr__(self, name,
 453                     getattr=getattr):
 454
 455         """ Inherit all other methods from the underlying stream.
 456         """
 457         return getattr(self.stream, name)
 458
 459 ### Shortcuts
 460
 461 def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):
 462
 463     """ Open an encoded file using the given mode and return
 464         a wrapped version providing transparent encoding/decoding.
 465
 466         Note: The wrapped version will only accept the object format
 467         defined by the codecs, i.e. Unicode objects for most builtin
 468         codecs. Output is also codec dependent and will usually by
 469         Unicode as well.
 470
 471         Files are always opened in binary mode, even if no binary mode
 472         was specified. Thisis done to avoid data loss due to encodings
 473         using 8-bit values. The default file mode is 'rb' meaning to
 474         open the file in binary read mode.
 475
 476         encoding specifies the encoding which is to be used for the
 477         the file.
 478
 479         errors may be given to define the error handling. It defaults
 480         to 'strict' which causes ValueErrors to be raised in case an
 481         encoding error occurs.
 482
 483         buffering has the same meaning as for the builtin open() API.
 484         It defaults to line buffered.
 485
 486         The returned wrapped file object provides an extra attribute
 487         .encoding which allows querying the used encoding. This
 488         attribute is only available if an encoding was specified as
 489         parameter.
 490
 491     """
 492     if encoding is not None and \
 493        'b' not in mode:
 494         # Force opening of the file in binary mode
 495         mode = mode + 'b'
 496     file = __builtin__.open(filename, mode, buffering)
 497     if encoding is None:
 498         return file
 499     (e, d, sr, sw) = lookup(encoding)
 500     srw = StreamReaderWriter(file, sr, sw, errors)
 501     # Add attributes to simplify introspection
 502     srw.encoding = encoding
 503     return srw
 504
 505 def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):
 506
 507     """ Return a wrapped version of file which provides transparent
 508         encoding translation.
 509
 510         Strings written to the wrapped file are interpreted according
 511         to the given data_encoding and then written to the original
 512         file as string using file_encoding. The intermediate encoding
 513         will usually be Unicode but depends on the specified codecs.
 514
 515         Strings are read from the file using file_encoding and then
 516         passed back to the caller as string using data_encoding.
 517
 518         If file_encoding is not given, it defaults to data_encoding.
 519
 520         errors may be given to define the error handling. It defaults
 521         to 'strict' which causes ValueErrors to be raised in case an
 522         encoding error occurs.
 523
 524         The returned wrapped file object provides two extra attributes
 525         .data_encoding and .file_encoding which reflect the given
 526         parameters of the same name. The attributes can be used for
 527         introspection by Python programs.
 528
 529     """
 530     if file_encoding is None:
 531         file_encoding = data_encoding
 532     encode, decode = lookup(data_encoding)[:2]
 533     Reader, Writer = lookup(file_encoding)[2:]
 534     sr = StreamRecoder(file,
 535                        encode, decode, Reader, Writer,
 536                        errors)
 537     # Add attributes to simplify introspection
 538     sr.data_encoding = data_encoding
 539     sr.file_encoding = file_encoding
 540     return sr
 541
 542 ### Helpers for charmap-based codecs
 543
 544 def make_identity_dict(rng):
 545
 546     """ make_identity_dict(rng) -> dict
 547
 548         Return a dictionary where elements of the rng sequence are
 549         mapped to themselves.
 550
 551     """
 552     res = {}
 553     for i in rng:
 554         res[i]=i
 555     return res
 556
 557 def make_encoding_map(decoding_map):
 558
 559     """ Creates an encoding map from a decoding map.
 560
 561         If a target mapping in the decoding map occurrs multiple
 562         times, then that target is mapped to None (undefined mapping),
 563         causing an exception when encountered by the charmap codec
 564         during translation.
 565
 566         One example where this happens is cp875.py which decodes
 567         multiple character to \u001a.
 568
 569     """
 570     m = {}
 571     for k,v in decoding_map.items():
 572         if not m.has_key(v):
 573             m[v] = k
 574         else:
 575             m[v] = None
 576     return m
 577
 578 # Tell modulefinder that using codecs probably needs the encodings
 579 # package
 580 _false = 0
 581 if _false:
 582     import encodings
 583
 584 ### Tests
 585
# When run as a script, replace the standard streams by recoding
# wrappers created with EncodedFile().
if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output: strings
    # written are taken as Latin-1 and reach the real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input: data on the
    # real stdin is taken as Latin-1 and handed to the reader as UTF-8
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')