1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
10 import __builtin__
, sys
12 ### Registry and builtin stateless codec functions
16 except ImportError, why
:
17 raise SystemError('Failed to load the builtin codecs: %s' % why
)
# Public API of this module for "from codecs import *".
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors",
           "register_error", "lookup_error"]
#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Pick the BOM matching this machine's byte order so that "native"
# constants can be used when writing platform-endian UTF-16/32 data.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
### Codec base classes (defining the API)

class CodecInfo(tuple):
    """Codec details as returned by the codec registry lookup.

    Behaves as the legacy 4-tuple (encode, decode, streamreader,
    streamwriter) while also exposing the incremental codec factories
    and the codec name as attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # Build the tuple part first (keeps backward compatibility with
        # code that unpacks the lookup() result as a plain 4-tuple) ...
        self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        # ... then attach everything as attributes for modern callers.
        self.name = name
        self.encode = encode
        self.decode = decode
        self.incrementalencoder = incrementalencoder
        self.incrementaldecoder = incrementaldecoder
        self.streamwriter = streamwriter
        self.streamreader = streamreader
        return self

    def __repr__(self):
        return "<%s.%s object for encoding %s at 0x%x>" % \
                (self.__class__.__module__, self.__class__.__name__,
                 self.name, id(self))
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder remembers
    the state of the Encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        # A stateless encoder always reports state 0.
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    This subclass of IncrementalEncoder can be used as the baseclass for an
    incremental encoder if the encoder must keep some of the output in a
    buffer between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        self.buffer = "" # unencoded input that is kept between calls to encode()

    def _buffer_encode(self, input, errors, final):
        # Overwrite this method in subclasses: It must encode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def encode(self, input, final=False):
        # encode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_encode(data, self.errors, final)
        # keep unencoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # non-empty buffer is the state; 0 means "initial state"
        return self.buffer or 0

    def setstate(self, state):
        # state of 0 (from getstate()) maps back to an empty buffer
        self.buffer = state or ""
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates a IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    This subclass of IncrementalDecoder can be used as the baseclass for an
    incremental decoder if the decoder must be able to handle incomplete byte
    sequences.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        self.buffer = "" # undecoded input that is kept between calls to decode()

    def _buffer_decode(self, input, errors, final):
        # Overwrite this method in subclasses: It must decode input
        # and return an (output, length consumed) tuple
        raise NotImplementedError

    def decode(self, input, final=False):
        # decode input (taking the buffer into account)
        data = self.buffer + input
        (result, consumed) = self._buffer_decode(data, self.errors, final)
        # keep undecoded input until the next call
        self.buffer = data[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # ignore additional state info
        self.buffer = state[0]
# The StreamWriter and StreamReader class provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # A rewind to the very beginning discards any pending codec state.
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
393 class StreamReader(Codec
):
395 def __init__(self
, stream
, errors
='strict'):
397 """ Creates a StreamReader instance.
399 stream must be a file-like object open for reading
402 The StreamReader may use different error handling
403 schemes by providing the errors keyword argument. These
404 parameters are predefined:
406 'strict' - raise a ValueError (or a subclass)
407 'ignore' - ignore the character and continue with the next
408 'replace'- replace with a suitable replacement character;
410 The set of allowed parameter values can be extended via
416 # For str->str decoding this will stay a str
417 # For str->unicode decoding the first read will promote it to unicode
419 self
.linebuffer
= None
421 def decode(self
, input, errors
='strict'):
422 raise NotImplementedError
424 def read(self
, size
=-1, chars
=-1, firstline
=False):
426 """ Decodes data from the stream self.stream and returns the
429 chars indicates the number of characters to read from the
430 stream. read() will never return more than chars
431 characters, but it might return less, if there are not enough
432 characters available.
434 size indicates the approximate maximum number of bytes to
435 read from the stream for decoding purposes. The decoder
436 can modify this setting as appropriate. The default value
437 -1 indicates to read and decode as much as possible. size
438 is intended to prevent having to decode huge files in one
441 If firstline is true, and a UnicodeDecodeError happens
442 after the first line terminator in the input only the first line
443 will be returned, the rest of the input will be kept until the
446 The method should use a greedy read strategy meaning that
447 it should read as much data as is allowed within the
448 definition of the encoding and the given size, e.g. if
449 optional encoding endings or state markers are available
450 on the stream, these should be read too.
452 # If we have lines cached, first merge them back into characters
454 self
.charbuffer
= "".join(self
.linebuffer
)
455 self
.linebuffer
= None
457 # read until we get the required number of characters (if available)
459 # can the request can be satisfied from the character buffer?
464 elif len(self
.charbuffer
) >= size
:
467 if len(self
.charbuffer
) >= chars
:
471 newdata
= self
.stream
.read()
473 newdata
= self
.stream
.read(size
)
474 # decode bytes (those remaining from the last call included)
475 data
= self
.bytebuffer
+ newdata
477 newchars
, decodedbytes
= self
.decode(data
, self
.errors
)
478 except UnicodeDecodeError, exc
:
480 newchars
, decodedbytes
= self
.decode(data
[:exc
.start
], self
.errors
)
481 lines
= newchars
.splitlines(True)
486 # keep undecoded bytes until the next call
487 self
.bytebuffer
= data
[decodedbytes
:]
488 # put new characters in the character buffer
489 self
.charbuffer
+= newchars
490 # there was no data available
494 # Return everything we've got
495 result
= self
.charbuffer
498 # Return the first chars characters
499 result
= self
.charbuffer
[:chars
]
500 self
.charbuffer
= self
.charbuffer
[chars
:]
503 def readline(self
, size
=None, keepends
=True):
505 """ Read one line from the input stream and return the
508 size, if given, is passed as size argument to the
512 # If we have lines cached from an earlier read, return
513 # them unconditionally
515 line
= self
.linebuffer
[0]
516 del self
.linebuffer
[0]
517 if len(self
.linebuffer
) == 1:
518 # revert to charbuffer mode; we might need more data
520 self
.charbuffer
= self
.linebuffer
[0]
521 self
.linebuffer
= None
523 line
= line
.splitlines(False)[0]
526 readsize
= size
or 72
528 # If size is given, we call read() only once
530 data
= self
.read(readsize
, firstline
=True)
532 # If we're at a "\r" read one extra character (which might
533 # be a "\n") to get a proper line ending. If the stream is
534 # temporarily exhausted we return the wrong line ending.
535 if data
.endswith("\r"):
536 data
+= self
.read(size
=1, chars
=1)
539 lines
= line
.splitlines(True)
542 # More than one line result; the first line is a full line
547 # cache the remaining lines
548 lines
[-1] += self
.charbuffer
549 self
.linebuffer
= lines
550 self
.charbuffer
= None
552 # only one remaining line, put it back into charbuffer
553 self
.charbuffer
= lines
[0] + self
.charbuffer
555 line
= line
.splitlines(False)[0]
557 line0withend
= lines
[0]
558 line0withoutend
= lines
[0].splitlines(False)[0]
559 if line0withend
!= line0withoutend
: # We really have a line end
560 # Put the rest back together and keep it until the next call
561 self
.charbuffer
= "".join(lines
[1:]) + self
.charbuffer
565 line
= line0withoutend
567 # we didn't get anything or this was our only try
568 if not data
or size
is not None:
569 if line
and not keepends
:
570 line
= line
.splitlines(False)[0]
576 def readlines(self
, sizehint
=None, keepends
=True):
578 """ Read all lines available on the input stream
579 and return them as list of lines.
581 Line breaks are implemented using the codec's decoder
582 method and are included in the list entries.
584 sizehint, if given, is ignored since there is no efficient
585 way to finding the true end-of-line.
589 return data
.splitlines(keepends
)
593 """ Resets the codec buffers used for keeping state.
595 Note that no stream repositioning should take place.
596 This method is primarily intended to be able to recover
597 from decoding errors.
601 self
.charbuffer
= u
""
602 self
.linebuffer
= None
604 def seek(self
, offset
, whence
=0):
605 """ Set the input stream's current position.
607 Resets the codec buffers used for keeping state.
609 self
.stream
.seek(offset
, whence
)
614 """ Return the next decoded line from the input stream."""
615 line
= self
.readline()
623 def __getattr__(self
, name
,
626 """ Inherit all other methods from the underlying stream.
628 return getattr(self
.stream
, name
)
633 def __exit__(self
, type, value
, tb
):
638 class StreamReaderWriter
:
640 """ StreamReaderWriter instances allow wrapping streams which
641 work in both read and write modes.
643 The design is such that one can use the factory functions
644 returned by the codec.lookup() function to construct the
648 # Optional attributes set by the file wrappers below
651 def __init__(self
, stream
, Reader
, Writer
, errors
='strict'):
653 """ Creates a StreamReaderWriter instance.
655 stream must be a Stream-like object.
657 Reader, Writer must be factory functions or classes
658 providing the StreamReader, StreamWriter interface resp.
660 Error handling is done in the same way as defined for the
661 StreamWriter/Readers.
665 self
.reader
= Reader(stream
, errors
)
666 self
.writer
= Writer(stream
, errors
)
669 def read(self
, size
=-1):
671 return self
.reader
.read(size
)
673 def readline(self
, size
=None):
675 return self
.reader
.readline(size
)
677 def readlines(self
, sizehint
=None):
679 return self
.reader
.readlines(sizehint
)
683 """ Return the next decoded line from the input stream."""
684 return self
.reader
.next()
689 def write(self
, data
):
691 return self
.writer
.write(data
)
693 def writelines(self
, list):
695 return self
.writer
.writelines(list)
702 def seek(self
, offset
, whence
=0):
703 self
.stream
.seek(offset
, whence
)
705 if whence
== 0 and offset
== 0:
708 def __getattr__(self
, name
,
711 """ Inherit all other methods from the underlying stream.
713 return getattr(self
.stream
, name
)
715 # these are needed to make "with codecs.open(...)" work properly
720 def __exit__(self
, type, value
, tb
):
727 """ StreamRecoder instances provide a frontend - backend
728 view of encoding data.
730 They use the complete set of APIs returned by the
731 codecs.lookup() function to implement their task.
733 Data written to the stream is first decoded into an
734 intermediate format (which is dependent on the given codec
735 combination) and then written to the stream using an instance
736 of the provided Writer class.
738 In the other direction, data is read from the stream using a
739 Reader instance and then return encoded data to the caller.
742 # Optional attributes set by the file wrappers below
743 data_encoding
= 'unknown'
744 file_encoding
= 'unknown'
746 def __init__(self
, stream
, encode
, decode
, Reader
, Writer
,
749 """ Creates a StreamRecoder instance which implements a two-way
750 conversion: encode and decode work on the frontend (the
751 input to .read() and output of .write()) while
752 Reader and Writer work on the backend (reading and
753 writing to the stream).
755 You can use these objects to do transparent direct
756 recodings from e.g. latin-1 to utf-8 and back.
758 stream must be a file-like object.
760 encode, decode must adhere to the Codec interface, Reader,
761 Writer must be factory functions or classes providing the
762 StreamReader, StreamWriter interface resp.
764 encode and decode are needed for the frontend translation,
765 Reader and Writer for the backend translation. Unicode is
766 used as intermediate encoding.
768 Error handling is done in the same way as defined for the
769 StreamWriter/Readers.
775 self
.reader
= Reader(stream
, errors
)
776 self
.writer
= Writer(stream
, errors
)
779 def read(self
, size
=-1):
781 data
= self
.reader
.read(size
)
782 data
, bytesencoded
= self
.encode(data
, self
.errors
)
785 def readline(self
, size
=None):
788 data
= self
.reader
.readline()
790 data
= self
.reader
.readline(size
)
791 data
, bytesencoded
= self
.encode(data
, self
.errors
)
794 def readlines(self
, sizehint
=None):
796 data
= self
.reader
.read()
797 data
, bytesencoded
= self
.encode(data
, self
.errors
)
798 return data
.splitlines(1)
802 """ Return the next decoded line from the input stream."""
803 data
= self
.reader
.next()
804 data
, bytesencoded
= self
.encode(data
, self
.errors
)
810 def write(self
, data
):
812 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
813 return self
.writer
.write(data
)
815 def writelines(self
, list):
818 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
819 return self
.writer
.write(data
)
826 def __getattr__(self
, name
,
829 """ Inherit all other methods from the underlying stream.
831 return getattr(self
.stream
, name
)
836 def __exit__(self
, type, value
, tb
):
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
### Helpers for codec lookup

def getencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).encode
def getdecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).decode
def getincrementalencoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental encoder.

    """
    encoder = lookup(encoding).incrementalencoder
    if encoder is None:
        # the codec exists but has no incremental support
        raise LookupError(encoding)
    return encoder
def getincrementaldecoder(encoding):

    """ Lookup up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codecs doesn't provide an incremental decoder.

    """
    decoder = lookup(encoding).incrementaldecoder
    if decoder is None:
        # the codec exists but has no incremental support
        raise LookupError(encoding)
    return decoder
def getreader(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader
def getwriter(encoding):

    """ Lookup up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Encodes the input strings from the iterator using a IncrementalEncoder.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = encoder.encode(input)
        if output:
            yield output
    # flush whatever the encoder still buffers (final=True)
    output = encoder.encode("", True)
    if output:
        yield output
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Decodes the input strings from the iterator using a IncrementalDecoder.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for input in iterator:
        output = decoder.decode(input)
        if output:
            yield output
    # flush whatever the decoder still buffers (final=True)
    output = decoder.decode("", True)
    if output:
        yield output
### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if v not in m:
            m[v] = k
        else:
            # ambiguous reverse mapping -> mark as undefined
            m[v] = None
    return m
### error handlers

# Pre-fetch the registered error handler callables for convenience.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handler are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')