1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
10 import struct
,types
,__builtin__
12 ### Registry and builtin stateless codec functions
16 except ImportError,why
:
18 'Failed to load the builtin codecs: %s' % why
# Names exported by "from codecs import *".
__all__ = [
    "register", "lookup", "open", "EncodedFile",
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
]
#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#

# U+FEFF packed as an unsigned 16-bit value; the '=' prefix selects
# the byte order of the machine running this code.
BOM = struct.pack('=H', 0xFEFF)

# U+FEFF (ZERO WIDTH NO-BREAK SPACE) with the most significant byte
# first, i.e. UTF-16 as written on big endian platforms.
BOM_BE = BOM32_BE = '\376\377'

# U+FEFF with the least significant byte first, i.e. UTF-16 as written
# on little endian platforms.  Read in the wrong (big endian) order the
# same two bytes give U+FFFE, which is defined as being an illegal
# Unicode character.
BOM_LE = BOM32_LE = '\377\376'

#
# Byte Order Marks for UCS-4.  Note that, despite the "64" in the
# names, each value is four bytes (32 bits) long.
#

# U+0000FEFF in UCS-4, most significant byte first.
BOM64_BE = '\000\000\376\377'

# U+0000FEFF in UCS-4, least significant byte first.
BOM64_LE = '\377\376\000\000'
46 ### Codec base classes (defining the API)
50 """ Defines the interface for stateless encoders/decoders.
52 The .encode()/.decode() methods may implement different error
53 handling schemes by providing the errors argument. These
54 string values are defined:
56 'strict' - raise a ValueError error (or a subclass)
57 'ignore' - ignore the character and continue with the next
58 'replace' - replace with a suitable replacement character;
59 Python will use the official U+FFFD REPLACEMENT
60 CHARACTER for the builtin Unicode codecs.
def encode(self, input, errors='strict'):

    """ Encodes the object input and returns a tuple (output
        object, length consumed).

        errors defines the error handling to apply. It defaults to
        'strict'.

        The method may not store state in the Codec instance. Use
        the stream classes (StreamWriter / StreamReader) for codecs
        which have to keep state in order to make encoding/decoding
        efficient.

        The encoder must be able to handle zero length input and
        return an empty object of the output object type in this
        case.

        This base implementation always raises NotImplementedError.

    """
    raise NotImplementedError
def decode(self, input, errors='strict'):

    """ Decodes the object input and returns a tuple (output
        object, length consumed).

        input must be an object which provides the bf_getreadbuf
        buffer slot. Python strings, buffer objects and memory
        mapped files are examples of objects providing this slot.

        errors defines the error handling to apply. It defaults to
        'strict'.

        The method may not store state in the Codec instance. Use
        the stream classes (StreamWriter / StreamReader) for codecs
        which have to keep state in order to make encoding/decoding
        efficient.

        The decoder must be able to handle zero length input and
        return an empty object of the output object type in this
        case.

        This base implementation always raises NotImplementedError.

    """
    raise NotImplementedError
106 # The StreamWriter and StreamReader class provide generic working
107 # interfaces which can be used to implement new encodings submodules
108 # very easily. See encodings/utf_8.py for an example on how this is
112 class StreamWriter(Codec
):
114 def __init__(self
,stream
,errors
='strict'):
116 """ Creates a StreamWriter instance.
118 stream must be a file-like object open for writing
121 The StreamWriter may implement different error handling
122 schemes by providing the errors keyword argument. These
123 parameters are defined:
125 'strict' - raise a ValueError (or a subclass)
126 'ignore' - ignore the character and continue with the next
127 'replace'- replace with a suitable replacement character
def write(self, object):

    """ Writes the object's contents encoded to self.stream.

        object is passed to self.encode() together with
        self.errors; the encoded data is then handed to
        self.stream.write(). Nothing is returned.

    """
    # encode() returns (output object, length consumed); only the
    # output object is written.
    data, consumed = self.encode(object, self.errors)
    self.stream.write(data)
def writelines(self, list):

    """ Writes the concatenated list of strings to the stream
        using a single call to self.write().

    """
    self.write(''.join(list))
149 """ Flushes and resets the codec buffers used for keeping state.
151 Calling this method should ensure that the data on the
152 output is put into a clean state, that allows appending
153 of new fresh data without having to rescan the whole
154 stream to recover state.
159 def __getattr__(self
,name
,
163 """ Inherit all other methods from the underlying stream.
165 return getattr(self
.stream
,name
)
169 class StreamReader(Codec
):
171 def __init__(self
,stream
,errors
='strict'):
173 """ Creates a StreamReader instance.
175 stream must be a file-like object open for reading
178 The StreamReader may implement different error handling
179 schemes by providing the errors keyword argument. These
180 parameters are defined:
182 'strict' - raise a ValueError (or a subclass)
183 'ignore' - ignore the character and continue with the next
184 'replace'- replace with a suitable replacement character;
190 def read(self
, size
=-1):
192 """ Decodes data from the stream self.stream and returns the
195 size indicates the approximate maximum number of bytes to
196 read from the stream for decoding purposes. The decoder
197 can modify this setting as appropriate. The default value
198 -1 indicates to read and decode as much as possible. size
199 is intended to prevent having to decode huge files in one
202 The method should use a greedy read strategy meaning that
203 it should read as much data as is allowed within the
204 definition of the encoding and the given size, e.g. if
205 optional encoding endings or state markers are available
206 on the stream, these should be read too.
211 return self
.decode(self
.stream
.read(), self
.errors
)[0]
214 read
= self
.stream
.read
220 object, decodedbytes
= decode(data
, self
.errors
)
221 except ValueError,why
:
222 # This method is slow but should work under pretty much
223 # all conditions; at most 10 tries are made
226 if not newdata
or i
> 10:
228 data
= data
+ newdata
def readline(self, size=None):

    """ Read one line from the input stream and return the
        decoded data.

        Note: Unlike the .readlines() method, this method inherits
        the line breaking knowledge from the underlying stream's
        .readline() method -- there is currently no support for
        line breaking using the codec decoder due to lack of line
        buffering. Subclasses should however, if possible, try to
        implement this method using their own knowledge of line
        breaking.

        size, if given, is passed as size argument to the stream's
        .readline() method.

    """
    if size is None:
        line = self.stream.readline()
    else:
        line = self.stream.readline(size)
    # decode() returns (output object, length consumed)
    return self.decode(line, self.errors)[0]
def readlines(self, sizehint=0):

    """ Read all lines available on the input stream
        and return them as list of lines.

        Line breaks are implemented using the codec's decoder
        method and are included in the list entries.

        sizehint, if given and non-zero, is passed as size argument
        to the stream's .read() method. A sizehint of 0 (the
        default) or None means "no hint": everything available on
        the stream is read.

    """
    # Both 0 (this method's default) and None (what
    # StreamReaderWriter.readlines() forwards by default) must read
    # the whole stream; passing them through would ask the stream
    # for zero bytes or for a None-sized read.
    if not sizehint:
        data = self.stream.read()
    else:
        data = self.stream.read(sizehint)
    # decode() returns (output object, length consumed); the 1 tells
    # splitlines() to keep the line breaks.
    return self.decode(data, self.errors)[0].splitlines(1)
276 """ Resets the codec buffers used for keeping state.
278 Note that no stream repositioning should take place.
279 This method is primarily intended to be able to recover
280 from decoding errors.
285 def __getattr__(self
,name
,
289 """ Inherit all other methods from the underlying stream.
291 return getattr(self
.stream
,name
)
295 class StreamReaderWriter
:
297 """ StreamReaderWriter instances allow wrapping streams which
298 work in both read and write modes.
300 The design is such that one can use the factory functions
301 returned by the codec.lookup() function to construct the
305 # Optional attributes set by the file wrappers below
308 def __init__(self
,stream
,Reader
,Writer
,errors
='strict'):
310 """ Creates a StreamReaderWriter instance.
312 stream must be a Stream-like object.
314 Reader, Writer must be factory functions or classes
315 providing the StreamReader, StreamWriter interface resp.
317 Error handling is done in the same way as defined for the
318 StreamWriter/Readers.
322 self
.reader
= Reader(stream
, errors
)
323 self
.writer
= Writer(stream
, errors
)
def read(self, size=-1):

    """ Delegates to self.reader.read(size) and returns its result.
    """
    return self.reader.read(size)
def readline(self, size=None):

    """ Delegates to self.reader.readline(size) and returns its
        result.
    """
    return self.reader.readline(size)
def readlines(self, sizehint=None):

    """ Delegates to self.reader.readlines(sizehint) and returns
        its result.
    """
    return self.reader.readlines(sizehint)
def write(self, data):

    """ Delegates to self.writer.write(data) and returns its
        result.
    """
    return self.writer.write(data)
def writelines(self, list):

    """ Delegates to self.writer.writelines(list) and returns its
        result.
    """
    return self.writer.writelines(list)
351 def __getattr__(self
,name
,
355 """ Inherit all other methods from the underlying stream.
357 return getattr(self
.stream
,name
)
363 """ StreamRecoder instances provide a frontend - backend
364 view of encoding data.
366 They use the complete set of APIs returned by the
367 codecs.lookup() function to implement their task.
369 Data written to the stream is first decoded into an
370 intermediate format (which is dependent on the given codec
371 combination) and then written to the stream using an instance
372 of the provided Writer class.
374 In the other direction, data is read from the stream using a
375 Reader instance and then returned to the caller as encoded data.
378 # Optional attributes set by the file wrappers below
379 data_encoding
= 'unknown'
380 file_encoding
= 'unknown'
382 def __init__(self
,stream
,encode
,decode
,Reader
,Writer
,errors
='strict'):
384 """ Creates a StreamRecoder instance which implements a two-way
385 conversion: encode and decode work on the frontend (the
386 input to .read() and output of .write()) while
387 Reader and Writer work on the backend (reading and
388 writing to the stream).
390 You can use these objects to do transparent direct
391 recodings from e.g. latin-1 to utf-8 and back.
393 stream must be a file-like object.
395 encode, decode must adhere to the Codec interface, Reader,
396 Writer must be factory functions or classes providing the
397 StreamReader, StreamWriter interface resp.
399 encode and decode are needed for the frontend translation,
400 Reader and Writer for the backend translation. Unicode is
401 used as intermediate encoding.
403 Error handling is done in the same way as defined for the
404 StreamWriter/Readers.
410 self
.reader
= Reader(stream
, errors
)
411 self
.writer
= Writer(stream
, errors
)
def read(self, size=-1):

    """ Reads data via self.reader.read(size), passes it through
        self.encode() using self.errors and returns the encoded
        data.
    """
    data = self.reader.read(size)
    # encode() returns (output object, length consumed)
    data, bytesencoded = self.encode(data, self.errors)
    return data
def readline(self, size=None):

    """ Reads one line via self.reader.readline(), passes it
        through self.encode() using self.errors and returns the
        encoded data.

        size, if given, is passed on to self.reader.readline().
    """
    if size is None:
        data = self.reader.readline()
    else:
        data = self.reader.readline(size)
    # encode() returns (output object, length consumed)
    data, bytesencoded = self.encode(data, self.errors)
    return data
def readlines(self, sizehint=None):

    """ Reads data via self.reader.read(), passes it through
        self.encode() using self.errors and returns the encoded
        data split into a list of lines. Line breaks are kept in
        the list entries.

        sizehint, if given, is passed as size argument to
        self.reader.read().
    """
    if sizehint is None:
        data = self.reader.read()
    else:
        data = self.reader.read(sizehint)
    # encode() returns (output object, length consumed)
    data, bytesencoded = self.encode(data, self.errors)
    # the 1 tells splitlines() to keep the line breaks
    return data.splitlines(1)
def write(self, data):

    """ Passes data through self.decode() using self.errors,
        writes the decoded data via self.writer.write() and
        returns that call's result.
    """
    # decode() returns (output object, length consumed)
    data, bytesdecoded = self.decode(data, self.errors)
    return self.writer.write(data)
def writelines(self, list):

    """ Concatenates the list of strings, passes the result
        through self.decode() using self.errors, writes the
        decoded data via self.writer.write() and returns that
        call's result.
    """
    data = ''.join(list)
    # decode() returns (output object, length consumed)
    data, bytesdecoded = self.decode(data, self.errors)
    return self.writer.write(data)
454 def __getattr__(self
,name
,
458 """ Inherit all other methods from the underlying stream.
460 return getattr(self
.stream
,name
)
464 def open(filename
, mode
='rb', encoding
=None, errors
='strict', buffering
=1):
466 """ Open an encoded file using the given mode and return
467 a wrapped version providing transparent encoding/decoding.
469 Note: The wrapped version will only accept the object format
470 defined by the codecs, i.e. Unicode objects for most builtin
471 codecs. Output is also codec dependent and will usually by
474 Files are always opened in binary mode, even if no binary mode
475 was specified. This is done to avoid data loss due to encodings
476 using 8-bit values. The default file mode is 'rb' meaning to
477 open the file in binary read mode.
479 encoding specifies the encoding which is to be used for the
482 errors may be given to define the error handling. It defaults
483 to 'strict' which causes ValueErrors to be raised in case an
484 encoding error occurs.
486 buffering has the same meaning as for the builtin open() API.
487 It defaults to line buffered.
489 The returned wrapped file object provides an extra attribute
490 .encoding which allows querying the used encoding. This
491 attribute is only available if an encoding was specified as
495 if encoding
is not None and \
497 # Force opening of the file in binary mode
499 file = __builtin__
.open(filename
, mode
, buffering
)
502 (e
,d
,sr
,sw
) = lookup(encoding
)
503 srw
= StreamReaderWriter(file, sr
, sw
, errors
)
504 # Add attributes to simplify introspection
505 srw
.encoding
= encoding
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # lookup() results are sliced as [:2] for the encode/decode pair
    # and [2:] for the Reader/Writer pair.
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
545 ### Helpers for charmap-based codecs
547 def make_identity_dict(rng
):
549 """ make_identity_dict(rng) -> dict
551 Return a dictionary where elements of the rng sequence are
552 mapped to themselves.
if __name__ == '__main__':

    # Imported here so the block does not depend on a module-level
    # import of sys.
    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')