Merged release21-maint changes.
[python/dscho.git] / Lib / codecs.py
blob711d67a2610ac21e936aeabc234cced380c870ae
1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import struct, __builtin__
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError, why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
20 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE"]
23 ### Constants
#
# Byte Order Mark (BOM) and its possible values (BOM_BE, BOM_LE)
#

# U+FEFF packed as an unsigned 16-bit integer using the byte order of
# the machine running this code; equals either BOM_BE or BOM_LE
BOM = struct.pack('=H', 0xFEFF)

# 2-byte marks: U+FEFF (ZERO WIDTH NO-BREAK SPACE) written with the
# most significant byte first (big endian) ...
BOM_BE = '\376\377'
BOM32_BE = BOM_BE

# ... and with the least significant byte first (little endian).
# Reading a mark with the wrong byte order yields U+FFFE, which is
# defined as being an illegal Unicode character.
BOM_LE = '\377\376'
BOM32_LE = BOM_LE

#
# 4-byte marks: the same code point U+0000FEFF as a UCS-4 value
#

# big endian byte order
BOM64_BE = '\000\000\376\377'

# little endian byte order
BOM64_LE = '\377\376\000\000'
46 ### Codec base classes (defining the API)
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument which
        selects the error handling scheme. The following string
        values are defined:

         'strict'  - raise a ValueError (or a subclass)
         'ignore'  - skip the offending character and continue with
                     the next one
         'replace' - substitute a suitable replacement character;
                     Python uses the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

        This base class only defines the API: both methods raise
        NotImplementedError and have to be overridden by subclasses.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return a tuple
            (output object, length consumed).

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs which need state to work efficiently should be
            built on the StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return a tuple
            (output object, length consumed).

            input must be an object providing the bf_getreadbuf
            buffer slot, e.g. Python strings, buffer objects and
            memory mapped files.

            errors selects the error handling scheme and defaults to
            'strict'.

            Implementations may not keep state in the Codec instance;
            codecs which need state to work efficiently should be
            built on the StreamWriter/StreamReader classes instead.

            Zero length input must be accepted; the result is then an
            empty object of the output object type.

        """
        raise NotImplementedError()
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
class StreamWriter(Codec):

    """ Codec which encodes objects and writes the result to a
        wrapped stream.

        Subclasses supply the actual .encode() implementation.

    """

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme which is handed
            to .encode() on every write:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.errors as error handling scheme
            and write the encoded data to self.stream.

        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Concatenate the strings in list and pass the result to
            .write() in one go.

        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            After calling this method the output should be in a clean
            state which allows appending fresh data without having to
            rescan the whole stream to recover state.

            The base class keeps no state, so this does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all attributes not found on the writer itself on
            the underlying stream.

        """
        return getattr(self.stream, name)
class StreamReader(Codec):

    """ Codec which reads data from a wrapped stream and returns it
        decoded.

        Subclasses supply the actual .decode() implementation.

    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible.  size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g.  if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

            If the chunk read for a non-negative size cannot be
            decoded, single bytes are appended and decoding is tried
            again. The ValueError raised by the decoder is passed on
            to the caller once the stream is exhausted or after more
            than 10 failed attempts.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        failures = 0
        while 1:
            try:
                decoded, decodedbytes = decode(data, self.errors)
            except ValueError:
                # The chunk may end in the middle of an encoded
                # character. This method is slow but should work
                # under pretty much all conditions.
                failures = failures + 1
                newdata = read(1)
                if not newdata or failures > 10:
                    raise
                data = data + newdata
            else:
                return decoded

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given and non-zero, is passed as size
            argument to the stream's .read() method. Without a hint
            (None or 0) all remaining data is read.

        """
        # A false hint must not be handed to .read(): .read(0) returns
        # no data at all, which made a plain .readlines() call return
        # an empty list.
        if not sizehint:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

            The base class keeps no state, so this does nothing.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
class StreamReaderWriter:

    """ Wrapper for streams which are used in both read and write
        mode.

        All reading is delegated to a reader object and all writing
        to a writer object, both created on the same stream. The
        design is such that the factory functions returned by the
        codec.lookup() function can be used to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Set up a StreamReaderWriter on top of stream.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp. Each one is called with (stream, errors).

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Return the result of the reader's .read(size).
        """
        return self.reader.read(size)

    def readline(self, size=None):

        """ Return the result of the reader's .readline(size).
        """
        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        """ Return the result of the reader's .readlines(sizehint).
        """
        return self.reader.readlines(sizehint)

    def write(self, data):

        """ Return the result of the writer's .write(data).
        """
        return self.writer.write(data)

    def writelines(self, list):

        """ Return the result of the writer's .writelines(list).
        """
        return self.writer.writelines(list)

    def reset(self):

        """ Reset the reader first and the writer second.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all attributes not found on the wrapper itself on
            the underlying stream.

        """
        return getattr(self.stream, name)
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend view of
        encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the recoder is first run through the decode
        function, giving an intermediate format (which depends on the
        given codec combination), and is then written to the stream
        by an instance of the provided Writer class.

        In the other direction, data is read from the stream by an
        instance of the provided Reader class, run through the encode
        function and the result is returned to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Set up a StreamRecoder which implements a two-way
            conversion: encode and decode work on the frontend
            (encode produces the output of .read(), decode consumes
            the input of .write()) while Reader and Writer work on
            the backend (reading from and writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader and StreamWriter interface
            resp. Each of the two is called with (stream, errors).

            Unicode is used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        """ Read via the backend reader's .read(size) and return the
            data after running it through the frontend encode
            function.

        """
        chunk = self.reader.read(size)
        recoded, bytesencoded = self.encode(chunk, self.errors)
        return recoded

    def readline(self, size=None):

        """ Read one line via the backend reader and return it after
            running it through the frontend encode function.

            size is only passed on to the reader's .readline() if it
            is not None.

        """
        if size is not None:
            line = self.reader.readline(size)
        else:
            line = self.reader.readline()
        recoded, bytesencoded = self.encode(line, self.errors)
        return recoded

    def readlines(self, sizehint=None):

        """ Read data via the backend reader's .read(), run it
            through the frontend encode function and return the
            result split into a list of lines which keep their line
            breaks.

            sizehint is only passed on to the reader's .read() if it
            is not None.

        """
        if sizehint is not None:
            chunk = self.reader.read(sizehint)
        else:
            chunk = self.reader.read()
        recoded, bytesencoded = self.encode(chunk, self.errors)
        return recoded.splitlines(1)

    def write(self, data):

        """ Run data through the frontend decode function and write
            the result with the backend writer; returns whatever the
            writer's .write() returns.

        """
        intermediate, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(intermediate)

    def writelines(self, list):

        """ Concatenate the strings in list, run the result through
            the frontend decode function and write it with the
            backend writer's .write(); returns whatever that call
            returns.

        """
        joined = ''.join(list)
        intermediate, bytesdecoded = self.decode(joined, self.errors)
        return self.writer.write(intermediate)

    def reset(self):

        """ Reset the backend reader first and the writer second.
        """
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up all attributes not found on the recoder itself on
            the underlying stream.

        """
        return getattr(self.stream, name)
459 ### Shortcuts
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped and mode is used
        unchanged.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        The encoding is looked up before the file is opened, so an
        unknown encoding raises the lookup error without creating or
        truncating the file and without leaving an open file object
        behind.

    """
    if encoding is None:
        return __builtin__.open(filename, mode, buffering)

    # Resolve the codec first: opening the file (possibly for
    # writing) and failing on the lookup afterwards would leak the
    # file object and could already have clobbered existing data.
    sr, sw = lookup(encoding)[2:]

    if 'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which provides transparent
        encoding translation and return the wrapper.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        file_encoding defaults to data_encoding if it is not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapper carries the two extra attributes
        .data_encoding and .file_encoding which reflect the
        parameters of the same name and can be used for introspection
        by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # Frontend: stateless encoder/decoder of the data encoding
    frontend_encode, frontend_decode = lookup(data_encoding)[:2]
    # Backend: stream reader/writer factories of the file encoding
    backend_reader, backend_writer = lookup(file_encoding)[2:]
    recoder = StreamRecoder(file,
                            frontend_encode, frontend_decode,
                            backend_reader, backend_writer,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
542 ### Helpers for charmap-based codecs
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build and return a dictionary which maps every element of
        the rng sequence to itself.

    """
    identity = {}
    for element in rng:
        identity[element] = element
    return identity
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        The result maps every value of decoding_map back to its key.
        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        try:
            encoding_map[target]
        except KeyError:
            # first occurrence of this target
            encoding_map[target] = source
        else:
            # target was seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
    return encoding_map
578 # Tell modulefinder that using codecs probably needs the encodings
579 # package
580 _false = 0
581 if _false:
582 import encodings
584 ### Tests
if __name__ == '__main__':

    import sys

    # Data written to stdout is interpreted as Latin-1 and reaches the
    # real stdout as UTF-8
    sys.stdout = EncodedFile(sys.stdout,
                             data_encoding='latin-1',
                             file_encoding='utf-8')

    # Data on the real stdin is interpreted as Latin-1 and is handed
    # to readers of sys.stdin as UTF-8
    sys.stdin = EncodedFile(sys.stdin,
                            data_encoding='utf-8',
                            file_encoding='latin-1')