1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
10 import struct
, __builtin__
12 ### Registry and builtin stateless codec functions
16 except ImportError, why
:
18 'Failed to load the builtin codecs: %s' % why
20 __all__
= ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24 "strict_errors", "ignore_errors", "replace_errors",
25 "xmlcharrefreplace_errors",
26 "register_error", "lookup_error"]

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF)

# UTF-32, native endianness
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
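
# Illustrative sketch (not part of this module's API): a hypothetical helper
# showing how the BOM constants above can be used to guess the UTF flavour of
# raw byte data.  The 4-byte UTF-32 marks are tested first, since
# BOM_UTF32_LE starts with the same two bytes as BOM_UTF16_LE.
def _example_sniff_bom(data):
    if data[:4] == BOM_UTF32_LE:
        return 'utf-32-le'
    elif data[:4] == BOM_UTF32_BE:
        return 'utf-32-be'
    elif data[:2] == BOM_UTF16_LE:
        return 'utf-16-le'
    elif data[:2] == BOM_UTF16_BE:
        return 'utf-16-be'
    elif data[:3] == BOM_UTF8:
        return 'utf-8'
    else:
        return None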

### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
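
# Illustrative sketch (not part of this module's API): the smallest possible
# Codec subclass, an identity transformation for byte strings.  It shows the
# (output object, length consumed) tuple contract and satisfies the zero
# length input requirement described above.  The class name is hypothetical.
class _ExampleIdentityCodec(Codec):

    def encode(self, input, errors='strict'):
        # Nothing to transform; report that all input was consumed.
        return (input, len(input))

    def decode(self, input, errors='strict'):
        return (input, len(input))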

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy, meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                object, decodedbytes = decode(data, self.errors)
            except ValueError, why:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return object

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
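
# Illustrative sketch (not part of this module): encoding submodules such as
# encodings/utf_8.py typically combine a concrete Codec with the stream
# classes above through multiple inheritance, so the stream wrappers pick up
# .encode()/.decode() automatically.  utf_8_encode/utf_8_decode come from the
# builtin _codecs module imported above; the class names are hypothetical.
class _ExampleUTF8Codec(Codec):

    def encode(self, input, errors='strict'):
        return utf_8_encode(input, errors)

    def decode(self, input, errors='strict'):
        return utf_8_decode(input, errors)

class _ExampleUTF8StreamWriter(_ExampleUTF8Codec, StreamWriter):
    pass

class _ExampleUTF8StreamReader(_ExampleUTF8Codec, StreamReader):
    pass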

class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
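
# Illustrative sketch (not part of this module): wiring a raw byte stream
# into a StreamReaderWriter by hand, using the factory functions returned by
# lookup().  This is essentially what the open() helper further below does;
# the helper name and default arguments are only examples.
def _example_wrap_stream(stream, encoding='utf-8', errors='strict'):
    (e, d, sr, sw) = lookup(encoding)
    return StreamReaderWriter(stream, sr, sw, errors)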
378 """ StreamRecoder instances provide a frontend - backend
379 view of encoding data.
381 They use the complete set of APIs returned by the
382 codecs.lookup() function to implement their task.
384 Data written to the stream is first decoded into an
385 intermediate format (which is dependent on the given codec
386 combination) and then written to the stream using an instance
387 of the provided Writer class.
389 In the other direction, data is read from the stream using a
390 Reader instance and then return encoded data to the caller.
393 # Optional attributes set by the file wrappers below
394 data_encoding
= 'unknown'
395 file_encoding
= 'unknown'
397 def __init__(self
, stream
, encode
, decode
, Reader
, Writer
,
400 """ Creates a StreamRecoder instance which implements a two-way
401 conversion: encode and decode work on the frontend (the
402 input to .read() and output of .write()) while
403 Reader and Writer work on the backend (reading and
404 writing to the stream).
406 You can use these objects to do transparent direct
407 recodings from e.g. latin-1 to utf-8 and back.
409 stream must be a file-like object.
411 encode, decode must adhere to the Codec interface, Reader,
412 Writer must be factory functions or classes providing the
413 StreamReader, StreamWriter interface resp.
415 encode and decode are needed for the frontend translation,
416 Reader and Writer for the backend translation. Unicode is
417 used as intermediate encoding.
419 Error handling is done in the same way as defined for the
420 StreamWriter/Readers.
426 self
.reader
= Reader(stream
, errors
)
427 self
.writer
= Writer(stream
, errors
)
430 def read(self
, size
=-1):
432 data
= self
.reader
.read(size
)
433 data
, bytesencoded
= self
.encode(data
, self
.errors
)
436 def readline(self
, size
=None):
439 data
= self
.reader
.readline()
441 data
= self
.reader
.readline(size
)
442 data
, bytesencoded
= self
.encode(data
, self
.errors
)
445 def readlines(self
, sizehint
=None):
448 data
= self
.reader
.read()
450 data
= self
.reader
.read(sizehint
)
451 data
, bytesencoded
= self
.encode(data
, self
.errors
)
452 return data
.splitlines(1)
454 def write(self
, data
):
456 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
457 return self
.writer
.write(data
)
459 def writelines(self
, list):
462 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
463 return self
.writer
.write(data
)
470 def __getattr__(self
, name
,
473 """ Inherit all other methods from the underlying stream.
475 return getattr(self
.stream
, name
)
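
# Illustrative sketch (not part of this module): recoding an in-memory
# Latin-1 "file" so that the caller reads UTF-8.  The EncodedFile() helper
# below packages exactly this wiring; StringIO is used here only to keep the
# example self-contained, and the helper name is hypothetical.
def _example_recode_in_memory(latin1_bytes):
    import StringIO
    encode, decode = lookup('utf-8')[:2]     # frontend: what the caller sees
    Reader, Writer = lookup('latin-1')[2:]   # backend: what is on the stream
    stream = StringIO.StringIO(latin1_bytes)
    recoder = StreamRecoder(stream, encode, decode, Reader, Writer)
    return recoder.read()                    # UTF-8 encoded byte string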

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb', meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    (e, d, sr, sw) = lookup(encoding)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
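
# Illustrative sketch (not part of this module): a round trip through the
# open() helper above.  The file name and text are only examples; the
# returned wrapper accepts and produces Unicode objects.
def _example_open_roundtrip(path='example.txt'):
    f = open(path, 'wb', encoding='utf-8')
    f.write(u'Marc-Andr\xe9')      # written to disk as UTF-8 bytes
    f.close()
    f = open(path, 'rb', encoding='utf-8')
    text = f.read()                # read back as a Unicode object
    f.close()
    return text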

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as strings using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as strings using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]
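
# Illustrative sketch (not part of this module): the lookup helpers above in
# action.  Encoder and decoder functions return the usual (output, length
# consumed) tuple; the helper name and sample text are only examples.
def _example_lookup_helpers():
    encode = getencoder('utf-8')
    data, consumed = encode(u'Marc-Andr\xe9')   # UTF-8 bytes, chars consumed
    decode = getdecoder('utf-8')
    text, consumed = decode(data)               # back to a Unicode object
    return text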

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if not m.has_key(v):
            m[v] = k
        else:
            m[v] = None
    return m
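
# Illustrative sketch (not part of this module): a tiny decoding map in which
# two byte values decode to the same character, so the reverse mapping marks
# that character as undefined (None), exactly as described above.
def _example_encoding_map():
    decoding_map = make_identity_dict(range(3))
    decoding_map[3] = u'\u001a'
    decoding_map[4] = u'\u001a'     # duplicate target
    encoding_map = make_encoding_map(decoding_map)
    # encoding_map[u'\u001a'] is now None; 0, 1 and 2 map to themselves.
    return encoding_map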

strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')