Updated for 2.1a3
[python/dscho.git] / Lib / codecs.py
blob21652b6146cb2f2417395c03f9417dfc53ed8038
1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import struct,types,__builtin__
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError,why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
# Names exported by "from codecs import *".
__all__ = [
    "register",
    "lookup",
    "open",
    "EncodedFile",
    "BOM",
    "BOM_BE",
    "BOM_LE",
    "BOM32_BE",
    "BOM32_LE",
    "BOM64_BE",
    "BOM64_LE",
]
23 ### Constants
# Byte Order Mark (BOM): U+FEFF packed as an unsigned 16-bit value in
# the byte order of the running platform ('=' selects native order).
BOM = struct.pack('=H', 0xFEFF)

# Two-byte marks with an explicit byte order.
#
# U+FEFF is ZERO WIDTH NO-BREAK SPACE. A reader that assumes the wrong
# byte order sees U+FFFE instead, which is defined as being an illegal
# Unicode character.
BOM32_BE = '\xfe\xff'           # most significant byte first
BOM32_LE = '\xff\xfe'           # least significant byte first
BOM_BE = BOM32_BE
BOM_LE = BOM32_LE

# Four-byte marks: U+0000FEFF as stored in UCS-4. Despite the "64" in
# their names, each value is four bytes long.
BOM64_BE = '\x00\x00\xfe\xff'   # most significant byte first
BOM64_LE = '\xff\xfe\x00\x00'   # least significant byte first
46 ### Codec base classes (defining the API)
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. These string values are
        defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and carry on
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER.

        This base class implements nothing itself: both methods raise
        NotImplementedError and are meant to be overridden.

    """

    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling; the default is
            'strict'.

            Implementations must not keep state on the Codec instance.
            Codecs which need state to encode efficiently should be
            written as StreamWriter subclasses instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot; Python strings, buffer objects and memory
            mapped files are examples of such objects.

            errors selects the error handling; the default is
            'strict'.

            Implementations must not keep state on the Codec instance.
            Codecs which need state to decode efficiently should be
            written as StreamReader subclasses instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
class StreamWriter(Codec):

    """ Encodes objects with the codec's .encode() method and writes
        the result to an underlying stream.

        Attributes which are not found on the writer itself are looked
        up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.

            The object is encoded with self.encode() using the error
            handling given to the constructor. Returns None.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the stream attribute itself is
            missing (e.g. on an instance created without running
            __init__). Looking it up via self.stream would otherwise
            re-enter this method without end.
        """
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
class StreamReader(Codec):

    """ Reads data from an underlying stream and decodes it with the
        codec's .decode() method.

        Attributes which are not found on the reader itself are looked
        up on the wrapped stream.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If decoding a sized read fails with a ValueError, further
            data is fetched one byte at a time and decoding is retried.
            The error is re-raised once the stream is exhausted or
            after more than 10 such retries.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        tries = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, self.errors)
            except ValueError:
                # The chunk may have ended in the middle of a multi-byte
                # sequence. This method is slow but should work under
                # pretty much all conditions; at most 10 tries are made
                tries = tries + 1
                newdata = read(1)
                if not newdata or tries > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method. When it is omitted (None) the
            whole stream is read.

        """
        # The default must be None rather than 0: a default of 0 would
        # take the else branch and request zero bytes from the stream.
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the stream attribute itself is
            missing (e.g. on an instance created without running
            __init__). Looking it up via self.stream would otherwise
            re-enter this method without end.
        """
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

        Reading is delegated to a Reader instance and writing to a
        Writer instance, both created on the same stream. Attributes
        which are not found on the wrapper are looked up on the stream.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each is called as factory(stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's .read(size). """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size). """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint). """
        return self.reader.readlines(sizehint)

    def write(self, data):

        """ Pass data to the writer's .write() and return its result. """
        return self.writer.write(data)

    def writelines(self, list):

        """ Pass list to the writer's .writelines() and return its
            result.
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the reader first, then the writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the stream attribute itself is
            missing (e.g. on an instance created without running
            __init__). Looking it up via self.stream would otherwise
            re-enter this method without end.
        """
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller in encoded
        form.

        Attributes which are not found on the recoder are looked up
        on the wrapped stream.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer, errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the reader's .read(size) and return the data
            encoded with the frontend encoder.
        """
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        """ Read one line via the reader's .readline() and return it
            encoded with the frontend encoder.

            size is only passed on to the reader when it is not None.
        """
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        """ Read via the reader's .read(), encode the data with the
            frontend encoder and return it split into a list of lines
            with the line breaks kept.

            sizehint is only passed on to the reader's .read() when it
            is not None.
        """
        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def write(self, data):

        """ Decode data with the frontend decoder and pass the result
            to the writer's .write(); returns that call's result.
        """
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        """ Concatenate the strings in list, decode the result with
            the frontend decoder and pass it to the writer's .write();
            returns that call's result.
        """
        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        """ Reset the reader first, then the writer. """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.

            Raises AttributeError when the stream attribute itself is
            missing (e.g. on an instance created without running
            __init__). Looking it up via self.stream would otherwise
            re-enter this method without end.
        """
        if name == 'stream':
            raise AttributeError(name)
        return getattr(self.stream, name)
462 ### Shortcuts
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the codec or building the wrapper fails, the
        file which was just opened is closed again before the
        exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        (e, d, sr, sw) = lookup(encoding)
        srw = StreamReaderWriter(file, sr, sw, errors)
    except:
        # Don't leak the open file handle when the codec cannot be
        # found or the wrapper cannot be built; the error is re-raised
        # unchanged.
        file.close()
        raise
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings on the fly and return the wrapper.

        Writing: strings handed to the wrapper are interpreted
        according to data_encoding and end up in the original file as
        strings in file_encoding. The intermediate encoding will
        usually be Unicode but depends on the specified codecs.

        Reading: strings are read from the file using file_encoding
        and handed back to the caller as strings in data_encoding.

        file_encoding defaults to data_encoding when it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The wrapper carries two extra attributes, .data_encoding and
        .file_encoding, which reflect the parameters of the same name
        and can be used for introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding

    # The frontend uses the stateless functions of the data codec,
    # the backend the stream classes of the file codec.
    data_codec = lookup(data_encoding)
    encode, decode = data_codec[:2]
    file_codec = lookup(file_encoding)
    Reader, Writer = file_codec[2:]

    recoder = StreamRecoder(file, encode, decode, Reader, Writer, errors)

    # Record the encodings on the wrapper to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
545 ### Helpers for charmap-based codecs
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a new dictionary in which every element of
        the sequence rng is a key mapped to itself.

    """
    identity = {}
    for item in rng:
        identity[item] = item
    return identity
560 ### Tests
if __name__ == '__main__':

    import sys

    # Strings written to stdout are taken to be Latin-1 and reach the
    # real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Latin-1 data arriving on the real stdin is handed to the
    # program as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')