1 """ codecs -- Python Codec Registry, API and helpers.
4 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8 """#"
10 import struct, __builtin__
12 ### Registry and builtin stateless codec functions
14 try:
15 from _codecs import *
16 except ImportError, why:
17 raise SystemError,\
18 'Failed to load the builtin codecs: %s' % why
20 __all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
21 "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
22 "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
23 "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
24 "strict_errors", "ignore_errors", "replace_errors",
25 "xmlcharrefreplace_errors",
26 "register_error", "lookup_error"]

### Constants

# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# UTF-16, native endianness
BOM = BOM_UTF16 = struct.pack('=H', 0xFEFF)

# UTF-32, native endianness
BOM_UTF32 = struct.pack('=L', 0x0000FEFF)

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
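
# Usage sketch (illustrative): the BOM constants above can be used to
# sniff the byte order of UTF-16 data before decoding; 'data' stands for
# a byte string read from some file.
#
#   if data.startswith(BOM_UTF16_LE):
#       text = data[len(BOM_UTF16_LE):].decode('utf-16-le')
#   elif data.startswith(BOM_UTF16_BE):
#       text = data[len(BOM_UTF16_BE):].decode('utf-16-be')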


### Codec base classes (defining the API)

class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may implement different error
        handling schemes by providing the errors argument. These
        string values are defined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        raise NotImplementedError
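
# A minimal Codec subclass sketch (illustrative): it simply delegates to
# the latin_1_encode()/latin_1_decode() functions exported by _codecs.
# Real codecs normally live in the encodings package and register the
# full (encoder, decoder, StreamReader, StreamWriter) tuple.
#
#   class Latin1Codec(Codec):
#       def encode(self, input, errors='strict'):
#           return latin_1_encode(input, errors)
#       def decode(self, input, errors='strict'):
#           return latin_1_decode(input, errors)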

#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example on how this is
# done.
#

class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        data, consumed = self.encode(object, self.errors)
        self.stream.write(data)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)


class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may implement different error handling
            schemes by providing the errors keyword argument. These
            parameters are defined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace' - replace with a suitable replacement character

        """
        self.stream = stream
        self.errors = errors

    def read(self, size=-1):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.

        """
        # Unsliced reading:
        if size < 0:
            return self.decode(self.stream.read(), self.errors)[0]

        # Sliced reading:
        read = self.stream.read
        decode = self.decode
        data = read(size)
        i = 0
        while 1:
            try:
                object, decodedbytes = decode(data, self.errors)
            except ValueError, why:
                # This method is slow but should work under pretty much
                # all conditions; at most 10 tries are made
                i = i + 1
                newdata = read(1)
                if not newdata or i > 10:
                    raise
                data = data + newdata
            else:
                return object

    def readline(self, size=None):

        """ Read one line from the input stream and return the
            decoded data.

            Note: Unlike the .readlines() method, this method inherits
            the line breaking knowledge from the underlying stream's
            .readline() method -- there is currently no support for
            line breaking using the codec decoder due to lack of line
            buffering. Subclasses should however, if possible, try to
            implement this method using their own knowledge of line
            breaking.

            size, if given, is passed as size argument to the stream's
            .readline() method.

        """
        if size is None:
            line = self.stream.readline()
        else:
            line = self.stream.readline(size)
        return self.decode(line, self.errors)[0]

    def readlines(self, sizehint=None):

        """ Read all lines available on the input stream
            and return them as a list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is passed as size argument to the
            stream's .read() method.

        """
        if sizehint is None:
            data = self.stream.read()
        else:
            data = self.stream.read(sizehint)
        return self.decode(data, self.errors)[0].splitlines(1)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        pass

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)
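
# Usage sketch (illustrative): wrap an already open binary file object
# with the StreamReader factory returned by lookup(); 'f' stands for a
# file opened with __builtin__.open(..., 'rb').
#
#   (e, d, Reader, Writer) = lookup('utf-8')
#   reader = Reader(f)
#   line = reader.readline()      # decoded to a unicode object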


class StreamReaderWriter:

    """ StreamReaderWriter instances allow wrapping streams which
        work in both read and write modes.

        The design is such that one can use the factory functions
        returned by the codecs.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader, Writer must be factory functions or classes
            providing the StreamReader, StreamWriter interface resp.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        return self.reader.read(size)

    def readline(self, size=None):

        return self.reader.readline(size)

    def readlines(self, sizehint=None):

        return self.reader.readlines(sizehint)

    def write(self, data):

        return self.writer.write(data)

    def writelines(self, list):

        return self.writer.writelines(list)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)


class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then returned to the caller as encoded data.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface; Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as the intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        if sizehint is None:
            data = self.reader.read()
        else:
            data = self.reader.read(sizehint)
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def write(self, data):

        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)


### Shortcuts

def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None and \
       'b' not in mode:
        # Force opening of the file in binary mode
        mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    (e, d, sr, sw) = lookup(encoding)
    srw = StreamReaderWriter(file, sr, sw, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
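
# Usage sketch (illustrative): read a UTF-8 encoded file as Unicode and
# write it back out Latin-1 encoded; both file names are hypothetical.
#
#   infile = open('input-utf8.txt', 'rb', encoding='utf-8')
#   outfile = open('output-latin1.txt', 'wb', encoding='latin-1')
#   outfile.write(infile.read())   # may raise UnicodeError for chars
#                                  # outside Latin-1
#   infile.close()
#   outfile.close()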

def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as strings using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as strings using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    encode, decode = lookup(data_encoding)[:2]
    Reader, Writer = lookup(file_encoding)[2:]
    sr = StreamRecoder(file,
                       encode, decode, Reader, Writer,
                       errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
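
# Usage sketch (illustrative): wrap a Latin-1 encoded file object so the
# program reads and writes UTF-8 byte strings; 'f' stands for a file
# opened with __builtin__.open(..., 'r+b').
#
#   ef = EncodedFile(f, 'utf-8', 'latin-1')
#   utf8_line = ef.readline()      # Latin-1 bytes recoded to UTF-8
#   ef.write('caf\xc3\xa9\n')      # UTF-8 input, stored as Latin-1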

### Helpers for codec lookup

def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[0]

def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[1]

def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[2]

def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding)[3]
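
# Usage sketch (illustrative): the helpers simply index into the tuple
# returned by lookup().
#
#   utf8_encode = getencoder('utf-8')
#   utf8_decode = getdecoder('utf-8')
#   data, consumed = utf8_encode(u'\u20ac')       # ('\xe2\x82\xac', 1)
#   text, consumed = utf8_decode('\xe2\x82\xac')  # (u'\u20ac', 3)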

### Helpers for charmap-based codecs

def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    res = {}
    for i in rng:
        res[i] = i
    return res
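
# Usage sketch (illustrative):
#
#   make_identity_dict(range(3))   # -> {0: 0, 1: 1, 2: 2}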

def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to \u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        if not v in m:
            m[v] = k
        else:
            m[v] = None
    return m
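
# Usage sketch (illustrative): code points that decode to the same
# target collapse to None in the encoding direction. The mapping below
# is made up for demonstration; real decoding maps are generated by the
# modules in the encodings package.
#
#   decoding_map = {0x41: 0x0041, 0x42: 0x0042, 0x1a: 0x001a, 0x7f: 0x001a}
#   make_encoding_map(decoding_map)
#   # -> {0x0041: 0x41, 0x0042: 0x42, 0x001a: None}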

### error handlers

strict_errors = lookup_error("strict")
ignore_errors = lookup_error("ignore")
replace_errors = lookup_error("replace")
xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
backslashreplace_errors = lookup_error("backslashreplace")

# Tell modulefinder that using codecs probably needs the encodings
# package
_false = 0
if _false:
    import encodings

### Tests

if __name__ == '__main__':

    import sys

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')