1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
10 import struct
,types
,__builtin__
12 ### Registry and builtin stateless codec functions
16 except ImportError,why
:
18 'Failed to load the builtin codecs: %s' % why
23 # Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
# U+FEFF packed as an unsigned 16-bit integer; the '=' prefix makes
# struct use the byte order of the platform running this module.
BOM = struct.pack('=H', 0xFEFF)

# Two-byte mark with the high byte first.
# Corresponds to Unicode U+FEFF in UTF-16 on big endian platforms,
# i.e. ZERO WIDTH NO-BREAK SPACE.
BOM32_BE = '\376\377'
BOM_BE = BOM32_BE

# Two-byte mark with the low byte first.
# Corresponds to Unicode U+FFFE in UTF-16 on little endian platforms,
# which is defined as being an illegal Unicode character.
BOM32_LE = '\377\376'
BOM_LE = BOM32_LE

# Byte order marks for UCS-4.  NOTE: the BOM64_* names and the original
# "64-bit" heading notwithstanding, each of these values is 4 bytes long.

# Corresponds to Unicode U+0000FEFF in UCS-4.
BOM64_BE = '\000\000\376\377'

# Corresponds to Unicode U+0000FFFE in UCS-4.
BOM64_LE = '\377\376\000\000'
43 ### Codec base classes (defining the API)
47 """ Defines the interface for stateless encoders/decoders.
49 The .encode()/.decode() methods may implement different error
50 handling schemes by providing the errors argument. These
51 string values are defined:
53 'strict' - raise a ValueError (or a subclass)
54 'ignore' - ignore the character and continue with the next
55 'replace' - replace with a suitable replacement character;
56 Python will use the official U+FFFD REPLACEMENT
57 CHARACTER for the builtin Unicode codecs.
def encode(self, input, errors='strict'):

    """ Encode the object input and return a tuple
        (output object, length consumed).

        errors selects the error handling scheme to apply; the
        default is 'strict'.

        Implementations may not store state in the Codec instance.
        Codecs which have to keep state in order to encode/decode
        efficiently should use a stream codec instead.

        An encoder must accept zero length input and, in that case,
        return an empty object of the output object type.

        This base implementation always raises NotImplementedError.

    """
    # Interface placeholder: concrete codecs supply the real encoder.
    raise NotImplementedError()
def decode(self, input, errors='strict'):

    """ Decode the object input and return a tuple
        (output object, length consumed).

        input must be an object providing the bf_getreadbuf buffer
        slot; Python strings, buffer objects and memory mapped files
        are examples of such objects.

        errors selects the error handling scheme to apply; the
        default is 'strict'.

        Implementations may not store state in the Codec instance.
        Codecs which have to keep state in order to encode/decode
        efficiently should use a stream codec instead.

        A decoder must accept zero length input and, in that case,
        return an empty object of the output object type.

        This base implementation always raises NotImplementedError.

    """
    # Interface placeholder: concrete codecs supply the real decoder.
    raise NotImplementedError()
103 # The StreamWriter and StreamReader classes provide generic working
104 # interfaces which can be used to implement new encodings submodules
105 # very easily. See encodings/utf_8.py for an example on how this is
109 class StreamWriter(Codec
):
111 def __init__(self
,stream
,errors
='strict'):
113 """ Creates a StreamWriter instance.
115 stream must be a file-like object open for writing
118 The StreamWriter may implement different error handling
119 schemes by providing the errors keyword argument. These
120 parameters are defined:
122 'strict' - raise a ValueError (or a subclass)
123 'ignore' - ignore the character and continue with the next
124 'replace'- replace with a suitable replacement character
def write(self, object):

    """ Write the encoded contents of object to self.stream.

        The object is passed through self.encode() together with
        self.errors.  Only the encoded output is written; the
        "length consumed" part of the encoder result is not used.
        Nothing is returned.

    """
    encoded, _consumed = self.encode(object, self.errors)
    target = self.stream
    target.write(encoded)
def writelines(self, list):

    """ Concatenate the strings in list and write the result to the
        stream with a single call to self.write().

    """
    joined = ''.join(list)
    self.write(joined)
146 """ Flushes and resets the codec buffers used for keeping state.
148 Calling this method should ensure that the data on the
149 output is put into a clean state, that allows appending
150 of new fresh data without having to rescan the whole
151 stream to recover state.
156 def __getattr__(self
,name
,
160 """ Inherit all other methods from the underlying stream.
162 return getattr(self
.stream
,name
)
166 class StreamReader(Codec
):
168 def __init__(self
,stream
,errors
='strict'):
170 """ Creates a StreamReader instance.
172 stream must be a file-like object open for reading
175 The StreamReader may implement different error handling
176 schemes by providing the errors keyword argument. These
177 parameters are defined:
179 'strict' - raise a ValueError (or a subclass)
180 'ignore' - ignore the character and continue with the next
181 'replace'- replace with a suitable replacement character;
187 def read(self
, size
=-1):
189 """ Decodes data from the stream self.stream and returns the
192 size indicates the approximate maximum number of bytes to
193 read from the stream for decoding purposes. The decoder
194 can modify this setting as appropriate. The default value
195 -1 indicates to read and decode as much as possible. size
196 is intended to prevent having to decode huge files in one
199 The method should use a greedy read strategy meaning that
200 it should read as much data as is allowed within the
201 definition of the encoding and the given size, e.g. if
202 optional encoding endings or state markers are available
203 on the stream, these should be read too.
208 return self
.decode(self
.stream
.read())[0]
211 read
= self
.stream
.read
217 object, decodedbytes
= decode(data
)
218 except ValueError,why
:
219 # This method is slow but should work under pretty much
220 # all conditions; at most 10 tries are made
223 if not newdata
or i
> 10:
225 data
= data
+ newdata
229 def readline(self
, size
=None):
231 """ Read one line from the input stream and return the
234 Note: Unlike the .readlines() method, this method inherits
235 the line breaking knowledge from the underlying stream's
236 .readline() method -- there is currently no support for
237 line breaking using the codec decoder due to lack of line
238 buffering. Subclasses should however, if possible, try to
239 implement this method using their own knowledge of line
242 size, if given, is passed as size argument to the stream's
247 line
= self
.stream
.readline()
249 line
= self
.stream
.readline(size
)
250 return self
.decode(line
)[0]
253 def readlines(self
, sizehint
=0):
255 """ Read all lines available on the input stream
256 and return them as list of lines.
258 Line breaks are implemented using the codec's decoder
259 method and are included in the list entries.
261 sizehint, if given, is passed as size argument to the
262 stream's .read() method.
266 data
= self
.stream
.read()
268 data
= self
.stream
.read(sizehint
)
269 return self
.decode(data
)[0].splitlines(1)
273 """ Resets the codec buffers used for keeping state.
275 Note that no stream repositioning should take place.
276 This method is primarily intended to be able to recover
277 from decoding errors.
282 def __getattr__(self
,name
,
286 """ Inherit all other methods from the underlying stream.
288 return getattr(self
.stream
,name
)
292 class StreamReaderWriter
:
294 """ StreamReaderWriter instances allow wrapping streams which
295 work in both read and write modes.
297 The design is such that one can use the factory functions
298 returned by the codec.lookup() function to construct the
302 # Optional attributes set by the file wrappers below
305 def __init__(self
,stream
,Reader
,Writer
,errors
='strict'):
307 """ Creates a StreamReaderWriter instance.
309 stream must be a Stream-like object.
311 Reader, Writer must be factory functions or classes
312 providing the StreamReader, StreamWriter interface resp.
314 Error handling is done in the same way as defined for the
315 StreamWriter/Readers.
319 self
.reader
= Reader(stream
, errors
)
320 self
.writer
= Writer(stream
, errors
)
def read(self, size=-1):

    """ Delegate to self.reader.read(size) and return its result.

        size defaults to -1 and is handed to the reader unchanged.

    """
    reader = self.reader
    return reader.read(size)
def readline(self, size=None):

    """ Delegate to self.reader.readline(size) and return its result.

        size defaults to None and is handed to the reader unchanged.

    """
    reader = self.reader
    return reader.readline(size)
def readlines(self, sizehint=None):

    """ Delegate to self.reader.readlines(sizehint) and return its
        result.

        sizehint defaults to None and is handed to the reader
        unchanged.

    """
    reader = self.reader
    return reader.readlines(sizehint)
def write(self, data):

    """ Delegate to self.writer.write(data) and return its result.

    """
    writer = self.writer
    return writer.write(data)
def writelines(self, list):

    """ Delegate to self.writer.writelines(list) and return its
        result.

    """
    writer = self.writer
    return writer.writelines(list)
348 def __getattr__(self
,name
,
352 """ Inherit all other methods from the underlying stream.
354 return getattr(self
.stream
,name
)
360 """ StreamRecoder instances provide a frontend - backend
361 view of encoding data.
363 They use the complete set of APIs returned by the
364 codecs.lookup() function to implement their task.
366 Data written to the stream is first decoded into an
367 intermediate format (which is dependent on the given codec
368 combination) and then written to the stream using an instance
369 of the provided Writer class.
371 In the other direction, data is read from the stream using a
372 Reader instance and then returned as encoded data to the caller.
# Optional attributes; both start out as the placeholder 'unknown' and
# are set by the file wrappers below (EncodedFile assigns them on the
# object it builds).
data_encoding = file_encoding = 'unknown'
379 def __init__(self
,stream
,encode
,decode
,Reader
,Writer
,errors
='strict'):
381 """ Creates a StreamRecoder instance which implements a two-way
382 conversion: encode and decode work on the frontend (the
383 input to .read() and output of .write()) while
384 Reader and Writer work on the backend (reading and
385 writing to the stream).
387 You can use these objects to do transparent direct
388 recodings from e.g. latin-1 to utf-8 and back.
390 stream must be a file-like object.
392 encode, decode must adhere to the Codec interface, Reader,
393 Writer must be factory functions or classes providing the
394 StreamReader, StreamWriter interface resp.
396 encode and decode are needed for the frontend translation,
397 Reader and Writer for the backend translation. Unicode is
398 used as intermediate encoding.
400 Error handling is done in the same way as defined for the
401 StreamWriter/Readers.
407 self
.reader
= Reader(stream
, errors
)
408 self
.writer
= Writer(stream
, errors
)
411 def read(self
,size
=-1):
413 data
= self
.reader
.read(size
)
414 data
, bytesencoded
= self
.encode(data
, self
.errors
)
417 def readline(self
,size
=None):
420 data
= self
.reader
.readline()
422 data
= self
.reader
.readline(size
)
423 data
, bytesencoded
= self
.encode(data
, self
.errors
)
426 def readlines(self
,sizehint
=None):
429 data
= self
.reader
.read()
431 data
= self
.reader
.read(sizehint
)
432 data
, bytesencoded
= self
.encode(data
, self
.errors
)
433 return data
.splitlines(1)
def write(self, data):

    """ Decode data and write the result through self.writer.

        data is run through self.decode() together with self.errors;
        the decoded output (the "length consumed" part is not used)
        is passed to self.writer.write(), whose return value is
        returned to the caller.

    """
    intermediate, _consumed = self.decode(data, self.errors)
    writer = self.writer
    return writer.write(intermediate)
440 def writelines(self
,list):
443 data
, bytesdecoded
= self
.decode(data
, self
.errors
)
444 return self
.writer
.write(data
)
451 def __getattr__(self
,name
,
455 """ Inherit all other methods from the underlying stream.
457 return getattr(self
.stream
,name
)
461 def open(filename
, mode
='rb', encoding
=None, errors
='strict', buffering
=1):
463 """ Open an encoded file using the given mode and return
464 a wrapped version providing transparent encoding/decoding.
466 Note: The wrapped version will only accept the object format
467 defined by the codecs, i.e. Unicode objects for most builtin
468 codecs. Output is also codec dependent and will usually be
471 Files are always opened in binary mode, even if no binary mode
472 was specified. This is done to avoid data loss due to encodings
473 using 8-bit values. The default file mode is 'rb' meaning to
474 open the file in binary read mode.
476 encoding specifies the encoding which is to be used for the
479 errors may be given to define the error handling. It defaults
480 to 'strict' which causes ValueErrors to be raised in case an
481 encoding error occurs.
483 buffering has the same meaning as for the builtin open() API.
484 It defaults to line buffered.
486 The returned wrapped file object provides an extra attribute
487 .encoding which allows querying the used encoding. This
488 attribute is only available if an encoding was specified as
492 if encoding
is not None and \
494 # Force opening of the file in binary mode
496 file = __builtin__
.open(filename
, mode
, buffering
)
499 (e
,d
,sr
,sw
) = lookup(encoding
)
500 srw
= StreamReaderWriter(file, sr
, sw
, errors
)
501 # Add attributes to simplify introspection
502 srw
.encoding
= encoding
505 def EncodedFile(file, data_encoding
, file_encoding
=None, errors
='strict'):
507 """ Return a wrapped version of file which provides transparent
508 encoding translation.
510 Strings written to the wrapped file are interpreted according
511 to the given data_encoding and then written to the original
512 file as string using file_encoding. The intermediate encoding
513 will usually be Unicode but depends on the specified codecs.
515 Strings are read from the file using file_encoding and then
516 passed back to the caller as string using data_encoding.
518 If file_encoding is not given, it defaults to data_encoding.
520 errors may be given to define the error handling. It defaults
521 to 'strict' which causes ValueErrors to be raised in case an
522 encoding error occurs.
524 data_encoding and file_encoding are added to the wrapped file
525 object as attributes .data_encoding and .file_encoding resp.
527 The returned wrapped file object provides two extra attributes
528 .data_encoding and .file_encoding which reflect the given
529 parameters of the same name. The attributes can be used for
530 introspection by Python programs.
533 if file_encoding
is None:
534 file_encoding
= data_encoding
535 encode
, decode
= lookup(data_encoding
)[:2]
536 Reader
, Writer
= lookup(file_encoding
)[2:]
537 sr
= StreamRecoder(file,
538 encode
,decode
,Reader
,Writer
,
540 # Add attributes to simplify introspection
541 sr
.data_encoding
= data_encoding
542 sr
.file_encoding
= file_encoding
547 if __name__
== '__main__':
551 # Make stdout translate Latin-1 output into UTF-8 output
552 sys
.stdout
= EncodedFile(sys
.stdout
, 'latin-1', 'utf-8')
554 # Have stdin translate Latin-1 input into UTF-8 input
555 sys
.stdin
= EncodedFile(sys
.stdin
, 'utf-8', 'latin-1')