1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
10 import struct
,types
,__builtin__
12 ### Registry and builtin stateless codec functions
16 except ImportError,why
:
18 'Failed to load the builtin codecs: %s' % why
# Byte Order Mark (BOM) constants.
#
# BOM is U+FEFF packed as an unsigned 16-bit value in the byte order of
# the machine running the interpreter ('=' selects native byte order).
BOM = struct.pack('=H', 0xFEFF)

# Two-byte marks with an explicit byte order.
#
# FE FF: U+FEFF (ZERO WIDTH NO-BREAK SPACE) serialized big endian.
BOM32_BE = '\376\377'
BOM_BE = BOM32_BE
# FF FE: the same character serialized little endian.  Read with the
# opposite byte order the pair yields U+FFFE, which is defined as being
# an illegal Unicode character.
BOM32_LE = '\377\376'
BOM_LE = BOM32_LE

# Four-byte marks for UCS-4 (the constant names carry a "64" suffix).
#
# 00 00 FE FF: U+0000FEFF serialized big endian.
BOM64_BE = '\000\000\376\377'
# FF FE 00 00: U+0000FEFF serialized little endian.
BOM64_LE = '\377\376\000\000'
43 ### Codec base classes (defining the API)
47 """ Defines the interface for stateless encoders/decoders.
49 The .encode()/.decode() methods may implement different error
50 handling schemes by providing the errors argument. These
51 string values are defined:
53 'strict' - raise a ValueError (or a subclass)
54 'ignore' - ignore the character and continue with the next
55 'replace' - replace with a suitable replacement character;
56 Python will use the official U+FFFD REPLACEMENT
57 CHARACTER for the builtin Unicode codecs.
def encode(self, input, errors='strict'):

    """ Encode the object input.

        Returns a tuple (output object, length consumed).

        errors selects the error handling scheme to apply; it
        defaults to 'strict'.

        Implementations may not store state in the Codec instance.
        Codecs which have to keep state in order to make
        encoding/decoding efficient should be written as stream
        codecs instead (see StreamWriter/StreamReader).

        The encoder must be able to handle zero length input and
        return an empty object of the output object type in that
        case.

        This base implementation only defines the interface and
        always raises NotImplementedError.

    """
    raise NotImplementedError
def decode(self, input, errors='strict'):

    """ Decode the object input.

        Returns a tuple (output object, length consumed).

        input must be an object which provides the bf_getreadbuf
        buffer slot. Python strings, buffer objects and memory
        mapped files are examples of objects providing this slot.

        errors selects the error handling scheme to apply; it
        defaults to 'strict'.

        Implementations may not store state in the Codec instance.
        Codecs which have to keep state in order to make
        encoding/decoding efficient should be written as stream
        codecs instead (see StreamWriter/StreamReader).

        The decoder must be able to handle zero length input and
        return an empty object of the output object type in that
        case.

        This base implementation only defines the interface and
        always raises NotImplementedError.

    """
    raise NotImplementedError
103 # The StreamWriter and StreamReader classes provide generic working
104 # interfaces which can be used to implement new encoding submodules
105 # very easily. See encodings/utf_8.py for an example of how this is
class StreamWriter(Codec):

    """ Codec which encodes objects and writes the result to a stream.

        Attributes not defined here are looked up on the underlying
        stream (see __getattr__).

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        # Both attributes are read by write(); stream is also what
        # __getattr__ delegates to, so it must always be set -- an
        # instance without it would recurse endlessly in __getattr__.
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is passed to self.encode() together with
            self.errors; only the encoded data is written, the
            consumed count is discarded.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    # XXX .writelines() ?

    # NOTE(review): the def line and body of this method are not
    # visible in the reviewed text, only its docstring; the name is
    # taken from the documented codecs API and the body is assumed to
    # be a no-op in this base class -- confirm against the full file.
    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    # NOTE(review): the visible header ends after "name," -- the
    # trailing parameter is assumed to be the builtin bound as a
    # default (a local-lookup shortcut); confirm.
    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
class StreamReader(Codec):

    """ Codec which reads data from a stream and decodes it.

        Attributes not defined here are looked up on the underlying
        stream (see __getattr__).

    """
    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        # stream is what read() and __getattr__ operate on; errors is
        # handed to self.decode() by read().
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            The error handling scheme given to the constructor
            (self.errors) is passed on to self.decode().

        """
        # Unsliced reading: decode everything the stream has left.
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading.
        # NOTE(review): only fragments of this loop are visible in the
        # reviewed text (the decode call, the ValueError handler, the
        # "not newdata or i > 10" test and the data concatenation);
        # the surrounding skeleton is restored from those fragments --
        # confirm against the full file.
        read = self.stream.read
        decode = self.decode
        errors = self.errors
        data = read(size)
        i = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, errors)
            except ValueError:
                # The slice may have ended inside a multi-byte
                # sequence.  This method is slow but should work under
                # pretty much all conditions; at most 10 tries are made.
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    # NOTE(review): the def line and body of this method are not
    # visible in the reviewed text, only its docstring; the name is
    # taken from the documented codecs API and the body is assumed to
    # be a no-op in this base class -- confirm against the full file.
    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    # NOTE(review): the visible header ends after "name," -- the
    # trailing parameter is assumed to be the builtin bound as a
    # default (a local-lookup shortcut); confirm.
    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
class StreamReaderWriter:

    """ Wraps a stream with a reader and a writer object so that one
        object can be used for both decoding reads and encoding
        writes.

        Attributes not defined here are looked up on the underlying
        stream (see __getattr__).

    """
    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        # stream must be stored: __getattr__ delegates to it, and
        # without the attribute that lookup would recurse endlessly.
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read from the wrapped reader; size is passed through
            unchanged and the reader's result is returned.
        """
        return self.reader.read(size)

    def write(self, data):

        """ Write data through the wrapped writer and return whatever
            the writer's write() returns.
        """
        return self.writer.write(data)

    def reset(self):

        """ Reset both the reader and the writer.
        """
        self.reader.reset()
        self.writer.reset()

    # NOTE(review): the visible header ends after "name," -- the
    # trailing parameter is assumed to be the builtin bound as a
    # default (a local-lookup shortcut); confirm.
    def __getattr__(self, name, getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
def __init__(self, stream, encode, decode, Reader, Writer, errors='strict'):

    """ Creates a StreamRecoder instance which implements a two-way
        conversion: encode and decode work on the frontend (the
        input to .read() and output of .write()) while
        Reader and Writer work on the backend (reading and
        writing to the stream).

        You can use these objects to do transparent direct
        recodings from e.g. latin-1 to utf-8 and back.

        stream must be a file-like object.

        encode, decode must adhere to the Codec interface, Reader,
        Writer must be factory functions or classes providing the
        StreamReader, StreamWriter interface resp. Each factory is
        called as factory(stream, errors).

        encode and decode are needed for the frontend translation,
        Reader and Writer for the backend translation. Unicode is
        used as intermediate encoding.

        Error handling is done in the same way as defined for the
        StreamWriter/Readers.

    """
    # read() and write() call self.encode/self.decode with
    # self.errors, and __getattr__ delegates to self.stream, so all
    # of them have to be stored on the instance.
    self.stream = stream
    self.encode = encode
    self.decode = decode
    self.reader = Reader(stream, errors)
    self.writer = Writer(stream, errors)
    self.errors = errors
def read(self, size=-1):

    """ Read from the backend reader and return the data re-encoded
        for the frontend.

        size is passed through unchanged to self.reader.read(). The
        object it returns is handed to self.encode() together with
        self.errors; the encoded data is returned and the consumed
        count is discarded.
    """
    data = self.reader.read(size)
    data, bytesencoded = self.encode(data, self.errors)
    # Hand the recoded data back to the caller; without this the
    # method would always return None.
    return data
def write(self, data):

    """ Decode frontend data and write it through the backend writer.

        data is handed to self.decode() together with self.errors;
        the decoded object is passed to self.writer.write() and that
        call's result is returned. The consumed count is discarded.
    """
    decoded, bytesdecoded = self.decode(data, self.errors)
    writer = self.writer
    return writer.write(decoded)
336 # .writelines(), .readline() and .readlines() ... see notes
# NOTE(review): the visible header ends after "name," -- the trailing
# parameter is assumed to be the builtin bound as a default (a
# local-lookup shortcut); confirm against the full file.
def __getattr__(self, name, getattr=getattr):

    """ Inherit all other methods from the underlying stream.

        Called for attributes not found on the instance itself;
        the lookup is forwarded to self.stream.
    """
    stream = self.stream
    return getattr(stream, name)
def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the file object returned by the builtin
        open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        If looking up the codec or building the wrapper fails, the
        file opened here is closed again before the exception is
        propagated.

    """
    # NOTE(review): the second half of this condition and the branch
    # body are not visible in the reviewed text; restored from the
    # comment below -- confirm against the full file.
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        return StreamReaderWriter(file, sr, sw, errors)
    except:
        # Do not leak the open file when the encoding is unknown or
        # the wrapper cannot be created; re-raise unchanged.
        file.close()
        raise
def EncodedFile(file, input, output=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given input encoding and then written to the original
        file as string using the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        If output is not given, it defaults to input.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    # Apply the documented default before the lookup; lookup() would
    # otherwise be handed None.
    if output is None:
        output = input
    # Frontend translation uses the stateless functions of the input
    # codec, backend translation the stream classes of the output codec.
    encode, decode = lookup(input)[:2]
    Reader, Writer = lookup(output)[2:]
    return StreamRecoder(file,
                         encode, decode, Reader, Writer,
                         errors)
if __name__ == '__main__':

    # sys is only needed for this self-test, so it is imported here
    # rather than at module level.
    import sys

    # Make stdout translate Latin-1 into Unicode-Escape
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')