Quick update to the README file. For intros and books we now point to
[python/dscho.git] / Lib / codecs.py
blob5c669c07a5d472cf0b20ccc987120d769c3bf825
""" codecs -- Python Codec Registry, API and helpers.


Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""#"
10 import struct,types,__builtin__
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError,why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
### Constants

#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#

# U+FEFF packed as an unsigned 16-bit value in the byte order of the
# machine running this code ('=' selects native order in struct).
BOM = struct.pack('=H',0xFEFF)

# Byte 0xFE followed by byte 0xFF: U+FEFF (ZERO WIDTH NO-BREAK SPACE)
# as UTF-16 stored in big endian order.
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# Byte 0xFF followed by byte 0xFE: U+FEFF as UTF-16 stored in little
# endian order. Read with the wrong (big endian) order the same two
# bytes give U+FFFE, which is defined as being an illegal Unicode
# character.
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# Byte Order Marks for UCS-4. Each value is four bytes long, the
# "64" in the names notwithstanding.
#

# U+0000FEFF stored in big endian order
BOM64_BE = '\000\000\376\377'

# U+0000FEFF stored in little endian order
BOM64_LE = '\377\376\000\000'
### Codec base classes (defining the API)

class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme. These string values are
        defined:

         'strict'  - raise a ValueError error (or a subclass)
         'ignore'  - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class implements neither method; both raise
        NotImplementedError until a subclass overrides them.

    """
    def encode(self,input,errors='strict'):

        """ Encode the object input.

            Returns a tuple (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs that need state to encode efficiently should be
            built on the stream classes (StreamWriter/StreamReader)
            instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self,input,errors='strict'):

        """ Decode the object input.

            Returns a tuple (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state on the Codec instance.
            Codecs that need state to decode efficiently should be
            built on the stream classes (StreamWriter/StreamReader)
            instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encodings submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self,stream,errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme that .write()
            hands on to .encode(). These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self,object):

        """ Encode object with self.encode() using self.errors and
            write the encoded data to self.stream.

            The consumed length reported by the encoder is not used.
        """
        encoded, _consumed = self.encode(object,self.errors)
        self.stream.write(encoded)

    # XXX .writelines() ?

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the output should be in a clean state
            which allows fresh data to be appended without rescanning
            the whole stream to recover state.

            This base implementation keeps no such buffers and does
            nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            that is not found on the writer itself.
        """
        return getattr(self.stream,name)
class StreamReader(Codec):

    def __init__(self,stream,errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            errors selects the error handling scheme that .read()
            hands on to .decode(). These values are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self,size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            Decoding uses the error handling scheme stored in
            self.errors. If the data still fails to decode once the
            stream is exhausted, or after ten additional single bytes
            have been tried, the decoder's ValueError is propagated.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                # The consumed length reported by the decoder is not
                # used.
                decoded, _consumed = decode(data, self.errors)
            except ValueError:
                # The slice may have ended inside a multi-byte
                # sequence: extend it one byte at a time and retry.
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                #
                # NOTE: a retry only happens when the decoder raises,
                # so with a non-strict errors setting a decoder that
                # does not raise is never retried.
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    # XXX .readline() and .readlines() (these are hard to implement
    #     without using buffers for keeping read-ahead data)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            This base implementation keeps no such buffers and does
            nothing.

        """
        pass

    def __getattr__(self,name,

                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream,name)
class StreamReaderWriter:

    def __init__(self,stream,Reader,Writer,errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.
            Each one is called as factory(stream, errors); the reader
            is created first.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self,size=-1):

        """ Hand the call on to the reader and return its result.
        """
        reader = self.reader
        return reader.read(size)

    def write(self,data):

        """ Hand the call on to the writer and return its result.
        """
        writer = self.writer
        return writer.write(data)

    def reset(self):

        """ Reset the reader first, then the writer.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self,name,

                    getattr=getattr):

        """ Fall back to the underlying stream for every attribute
            that is not found on this object itself.
        """
        return getattr(self.stream,name)
291 class StreamRecoder:
293 def __init__(self,stream,encode,decode,Reader,Writer,errors='strict'):
295 """ Creates a StreamRecoder instance which implements a two-way
296 conversion: encode and decode work on the frontend (the
297 input to .read() and output of .write()) while
298 Reader and Writer work on the backend (reading and
299 writing to the stream).
301 You can use these objects to do transparent direct
302 recodings from e.g. latin-1 to utf-8 and back.
304 stream must be a file-like object.
306 encode, decode must adhere to the Codec interface, Reader,
307 Writer must be factory functions or classes providing the
308 StreamReader, StreamWriter interface resp.
310 encode and decode are needed for the frontend translation,
311 Reader and Writer for the backend translation. Unicode is
312 used as intermediate encoding.
314 Error handling is done in the same way as defined for the
315 StreamWriter/Readers.
318 self.stream = stream
319 self.encode = encode
320 self.decode = decode
321 self.reader = Reader(stream, errors)
322 self.writer = Writer(stream, errors)
323 self.errors = errors
325 def read(self,size=-1):
327 data = self.reader.read(size)
328 data, bytesencoded = self.encode(data, self.errors)
329 return data
331 def write(self,data):
333 data, bytesdecoded = self.decode(data, self.errors)
334 return self.writer.write(data)
336 # .writelines(), .readline() and .readlines() ... see notes
337 # above.
339 def reset(self):
341 self.reader.reset()
342 self.writer.reset()
344 def __getattr__(self,name,
346 getattr=getattr):
348 """ Inherit all other methods from the underlying stream.
350 return getattr(self.stream,name)
### Shortcuts

def open(filename, mode, encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped and mode is left
        untouched. Otherwise the file is always opened in binary
        mode ('b' is appended to mode when missing).

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        If the codec lookup or the construction of the wrapper fails,
        the file that was just opened is closed again before the
        exception is propagated.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        # Only the stream reader/writer factories are needed here.
        (e,d,sr,sw) = lookup(encoding)
        return StreamReaderWriter(file, sr, sw, errors)
    except:
        # Do not leak the open file when the encoding is unknown or
        # the wrapper cannot be built; the original error is re-raised.
        file.close()
        raise
def EncodedFile(file, input, output=None, errors='strict'):

    """ Wrap file in a StreamRecoder which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given input encoding and then written to the original
        file as string using the output encoding. The intermediate
        encoding will usually be Unicode but depends on the specified
        codecs.

        The encode/decode functions are taken from the codec looked
        up for input, the stream reader/writer factories from the
        codec looked up for output.

        If output is not given, it defaults to input.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

    """
    if output is None:
        output = input
    # First two entries of the lookup result: encoder and decoder.
    frontend = lookup(input)
    encode, decode = frontend[:2]
    # Remaining entries: stream reader and stream writer factories.
    backend = lookup(output)
    Reader, Writer = backend[2:]
    recoder = StreamRecoder(file,
                            encode,decode,Reader,Writer,
                            errors)
    return recoder
### Tests

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 into Unicode-Escape: from here on
    # sys.stdout is the recoding wrapper around the original stdout.
    recoded_stdout = EncodedFile(sys.stdout, 'latin-1', 'unicode-escape')
    sys.stdout = recoded_stdout