1 /* ------------------------------------------------------------------------
3 _codecs -- Provides access to the codec registry and the builtin
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
9 The codec registry is accessible via:
11 register(search_function) -> None
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
15 The builtin Unicode codecs use the following interface:
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, bytes consumed)
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
23 <encoding>_encode() interfaces also accept non-Unicode objects as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
34 Copyright (c) Corporation for National Research Initiatives.
36 ------------------------------------------------------------------------ */
40 /* --- Registry ----------------------------------------------------------- */
42 PyDoc_STRVAR(register__doc__
,
43 "register(search_function)\n\
45 Register a codec search function. Search functions are expected to take\n\
46 one argument, the encoding name in all lower case letters, and return\n\
47 a tuple of functions (encoder, decoder, stream_reader, stream_writer).");
50 PyObject
*codec_register(PyObject
*self
, PyObject
*args
)
52 PyObject
*search_function
;
54 if (!PyArg_ParseTuple(args
, "O:register", &search_function
))
57 if (PyCodec_Register(search_function
))
67 PyDoc_STRVAR(lookup__doc__
,
68 "lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)\n\
70 Looks up a codec tuple in the Python codec registry and returns\n\
71 a tuple of functions.");
74 PyObject
*codec_lookup(PyObject
*self
, PyObject
*args
)
78 if (!PyArg_ParseTuple(args
, "s:lookup", &encoding
))
81 return _PyCodec_Lookup(encoding
);
87 PyDoc_STRVAR(encode__doc__
,
88 "encode(obj, [encoding[,errors]]) -> object\n\
90 Encodes obj using the codec registered for encoding. encoding defaults\n\
91 to the default encoding. errors may be given to set a different error\n\
92 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
93 a ValueError. Other possible values are 'ignore', 'replace' and\n\
94 'xmlcharrefreplace' as well as any other name registered with\n\
95 codecs.register_error that can handle ValueErrors.");
98 codec_encode(PyObject
*self
, PyObject
*args
)
100 const char *encoding
= NULL
;
101 const char *errors
= NULL
;
104 if (!PyArg_ParseTuple(args
, "O|ss:encode", &v
, &encoding
, &errors
))
107 #ifdef Py_USING_UNICODE
108 if (encoding
== NULL
)
109 encoding
= PyUnicode_GetDefaultEncoding();
111 if (encoding
== NULL
) {
112 PyErr_SetString(PyExc_ValueError
, "no encoding specified");
117 /* Encode via the codec registry */
118 v
= PyCodec_Encode(v
, encoding
, errors
);
127 PyDoc_STRVAR(decode__doc__
,
128 "decode(obj, [encoding[,errors]]) -> object\n\
130 Decodes obj using the codec registered for encoding. encoding defaults\n\
131 to the default encoding. errors may be given to set a different error\n\
132 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
133 a ValueError. Other possible values are 'ignore' and 'replace'\n\
134 as well as any other name registered with codecs.register_error that is\n\
135 able to handle ValueErrors.");
138 codec_decode(PyObject
*self
, PyObject
*args
)
140 const char *encoding
= NULL
;
141 const char *errors
= NULL
;
144 if (!PyArg_ParseTuple(args
, "O|ss:decode", &v
, &encoding
, &errors
))
147 #ifdef Py_USING_UNICODE
148 if (encoding
== NULL
)
149 encoding
= PyUnicode_GetDefaultEncoding();
151 if (encoding
== NULL
) {
152 PyErr_SetString(PyExc_ValueError
, "no encoding specified");
157 /* Decode via the codec registry */
158 v
= PyCodec_Decode(v
, encoding
, errors
);
167 /* --- Helpers ------------------------------------------------------------ */
170 PyObject
*codec_tuple(PyObject
*unicode
,
182 PyTuple_SET_ITEM(v
,0,unicode
);
183 w
= PyInt_FromLong(len
);
188 PyTuple_SET_ITEM(v
,1,w
);
192 /* --- String codecs ------------------------------------------------------ */
194 escape_decode(PyObject
*self
,
197 const char *errors
= NULL
;
201 if (!PyArg_ParseTuple(args
, "s#|z:escape_decode",
202 &data
, &size
, &errors
))
204 return codec_tuple(PyString_DecodeEscape(data
, size
, errors
, 0, NULL
),
209 escape_encode(PyObject
*self
,
213 const char *errors
= NULL
;
217 if (!PyArg_ParseTuple(args
, "O!|z:escape_encode",
218 &PyString_Type
, &str
, &errors
))
221 str
= PyString_Repr(str
, 0);
225 /* The string will be quoted. Unquote, similar to unicode-escape. */
226 buf
= PyString_AS_STRING (str
);
227 len
= PyString_GET_SIZE (str
);
228 memmove(buf
, buf
+1, len
-2);
229 _PyString_Resize(&str
, len
-2);
231 return codec_tuple(str
, PyString_Size(str
));
234 #ifdef Py_USING_UNICODE
235 /* --- Decoder ------------------------------------------------------------ */
238 unicode_internal_decode(PyObject
*self
,
242 const char *errors
= NULL
;
246 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_decode",
250 if (PyUnicode_Check(obj
)) {
252 return codec_tuple(obj
, PyUnicode_GET_SIZE(obj
));
255 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
257 return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE
*)data
,
258 size
/ sizeof(Py_UNICODE
)),
264 utf_7_decode(PyObject
*self
,
269 const char *errors
= NULL
;
271 if (!PyArg_ParseTuple(args
, "t#|z:utf_7_decode",
272 &data
, &size
, &errors
))
275 return codec_tuple(PyUnicode_DecodeUTF7(data
, size
, errors
),
280 utf_8_decode(PyObject
*self
,
285 const char *errors
= NULL
;
288 PyObject
*decoded
= NULL
;
290 if (!PyArg_ParseTuple(args
, "t#|zi:utf_8_decode",
291 &data
, &size
, &errors
, &final
))
295 decoded
= PyUnicode_DecodeUTF8Stateful(data
, size
, errors
,
296 final
? NULL
: &consumed
);
299 return codec_tuple(decoded
, consumed
);
303 utf_16_decode(PyObject
*self
,
308 const char *errors
= NULL
;
314 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_decode",
315 &data
, &size
, &errors
, &final
))
318 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
, &byteorder
,
319 final
? NULL
: &consumed
);
322 return codec_tuple(decoded
, consumed
);
326 utf_16_le_decode(PyObject
*self
,
331 const char *errors
= NULL
;
335 PyObject
*decoded
= NULL
;
337 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_le_decode",
338 &data
, &size
, &errors
, &final
))
341 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
,
342 &byteorder
, final
? NULL
: &consumed
);
345 return codec_tuple(decoded
, consumed
);
350 utf_16_be_decode(PyObject
*self
,
355 const char *errors
= NULL
;
359 PyObject
*decoded
= NULL
;
361 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_be_decode",
362 &data
, &size
, &errors
, &final
))
365 decoded
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
,
366 &byteorder
, final
? NULL
: &consumed
);
369 return codec_tuple(decoded
, consumed
);
372 /* This non-standard version also provides access to the byteorder
373 parameter of the builtin UTF-16 codec.
375 It returns a tuple (unicode, bytesread, byteorder) with byteorder
376 being the value in effect at the end of data.
381 utf_16_ex_decode(PyObject
*self
,
386 const char *errors
= NULL
;
388 PyObject
*unicode
, *tuple
;
392 if (!PyArg_ParseTuple(args
, "t#|zii:utf_16_ex_decode",
393 &data
, &size
, &errors
, &byteorder
, &final
))
397 unicode
= PyUnicode_DecodeUTF16Stateful(data
, size
, errors
, &byteorder
,
398 final
? NULL
: &consumed
);
401 tuple
= Py_BuildValue("Oii", unicode
, consumed
, byteorder
);
407 unicode_escape_decode(PyObject
*self
,
412 const char *errors
= NULL
;
414 if (!PyArg_ParseTuple(args
, "t#|z:unicode_escape_decode",
415 &data
, &size
, &errors
))
418 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data
, size
, errors
),
423 raw_unicode_escape_decode(PyObject
*self
,
428 const char *errors
= NULL
;
430 if (!PyArg_ParseTuple(args
, "t#|z:raw_unicode_escape_decode",
431 &data
, &size
, &errors
))
434 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data
, size
, errors
),
439 latin_1_decode(PyObject
*self
,
444 const char *errors
= NULL
;
446 if (!PyArg_ParseTuple(args
, "t#|z:latin_1_decode",
447 &data
, &size
, &errors
))
450 return codec_tuple(PyUnicode_DecodeLatin1(data
, size
, errors
),
455 ascii_decode(PyObject
*self
,
460 const char *errors
= NULL
;
462 if (!PyArg_ParseTuple(args
, "t#|z:ascii_decode",
463 &data
, &size
, &errors
))
466 return codec_tuple(PyUnicode_DecodeASCII(data
, size
, errors
),
471 charmap_decode(PyObject
*self
,
476 const char *errors
= NULL
;
477 PyObject
*mapping
= NULL
;
479 if (!PyArg_ParseTuple(args
, "t#|zO:charmap_decode",
480 &data
, &size
, &errors
, &mapping
))
482 if (mapping
== Py_None
)
485 return codec_tuple(PyUnicode_DecodeCharmap(data
, size
, mapping
, errors
),
489 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
492 mbcs_decode(PyObject
*self
,
497 const char *errors
= NULL
;
499 if (!PyArg_ParseTuple(args
, "t#|z:mbcs_decode",
500 &data
, &size
, &errors
))
503 return codec_tuple(PyUnicode_DecodeMBCS(data
, size
, errors
),
507 #endif /* MS_WINDOWS && HAVE_USABLE_WCHAR_T */
509 /* --- Encoder ------------------------------------------------------------ */
512 readbuffer_encode(PyObject
*self
,
517 const char *errors
= NULL
;
519 if (!PyArg_ParseTuple(args
, "s#|z:readbuffer_encode",
520 &data
, &size
, &errors
))
523 return codec_tuple(PyString_FromStringAndSize(data
, size
),
528 charbuffer_encode(PyObject
*self
,
533 const char *errors
= NULL
;
535 if (!PyArg_ParseTuple(args
, "t#|z:charbuffer_encode",
536 &data
, &size
, &errors
))
539 return codec_tuple(PyString_FromStringAndSize(data
, size
),
544 unicode_internal_encode(PyObject
*self
,
548 const char *errors
= NULL
;
552 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_encode",
556 if (PyUnicode_Check(obj
)) {
557 data
= PyUnicode_AS_DATA(obj
);
558 size
= PyUnicode_GET_DATA_SIZE(obj
);
559 return codec_tuple(PyString_FromStringAndSize(data
, size
),
563 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
565 return codec_tuple(PyString_FromStringAndSize(data
, size
),
571 utf_7_encode(PyObject
*self
,
575 const char *errors
= NULL
;
577 if (!PyArg_ParseTuple(args
, "O|z:utf_7_encode",
581 str
= PyUnicode_FromObject(str
);
584 v
= codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str
),
585 PyUnicode_GET_SIZE(str
),
589 PyUnicode_GET_SIZE(str
));
595 utf_8_encode(PyObject
*self
,
599 const char *errors
= NULL
;
601 if (!PyArg_ParseTuple(args
, "O|z:utf_8_encode",
605 str
= PyUnicode_FromObject(str
);
608 v
= codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str
),
609 PyUnicode_GET_SIZE(str
),
611 PyUnicode_GET_SIZE(str
));
616 /* This version provides access to the byteorder parameter of the
617 builtin UTF-16 codecs as optional third argument. It defaults to 0
618 which means: use the native byte order and prepend the data with a
624 utf_16_encode(PyObject
*self
,
628 const char *errors
= NULL
;
631 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_encode",
632 &str
, &errors
, &byteorder
))
635 str
= PyUnicode_FromObject(str
);
638 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
639 PyUnicode_GET_SIZE(str
),
642 PyUnicode_GET_SIZE(str
));
648 utf_16_le_encode(PyObject
*self
,
652 const char *errors
= NULL
;
654 if (!PyArg_ParseTuple(args
, "O|z:utf_16_le_encode",
658 str
= PyUnicode_FromObject(str
);
661 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
662 PyUnicode_GET_SIZE(str
),
665 PyUnicode_GET_SIZE(str
));
671 utf_16_be_encode(PyObject
*self
,
675 const char *errors
= NULL
;
677 if (!PyArg_ParseTuple(args
, "O|z:utf_16_be_encode",
681 str
= PyUnicode_FromObject(str
);
684 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
685 PyUnicode_GET_SIZE(str
),
688 PyUnicode_GET_SIZE(str
));
694 unicode_escape_encode(PyObject
*self
,
698 const char *errors
= NULL
;
700 if (!PyArg_ParseTuple(args
, "O|z:unicode_escape_encode",
704 str
= PyUnicode_FromObject(str
);
707 v
= codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str
),
708 PyUnicode_GET_SIZE(str
)),
709 PyUnicode_GET_SIZE(str
));
715 raw_unicode_escape_encode(PyObject
*self
,
719 const char *errors
= NULL
;
721 if (!PyArg_ParseTuple(args
, "O|z:raw_unicode_escape_encode",
725 str
= PyUnicode_FromObject(str
);
728 v
= codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
729 PyUnicode_AS_UNICODE(str
),
730 PyUnicode_GET_SIZE(str
)),
731 PyUnicode_GET_SIZE(str
));
737 latin_1_encode(PyObject
*self
,
741 const char *errors
= NULL
;
743 if (!PyArg_ParseTuple(args
, "O|z:latin_1_encode",
747 str
= PyUnicode_FromObject(str
);
750 v
= codec_tuple(PyUnicode_EncodeLatin1(
751 PyUnicode_AS_UNICODE(str
),
752 PyUnicode_GET_SIZE(str
),
754 PyUnicode_GET_SIZE(str
));
760 ascii_encode(PyObject
*self
,
764 const char *errors
= NULL
;
766 if (!PyArg_ParseTuple(args
, "O|z:ascii_encode",
770 str
= PyUnicode_FromObject(str
);
773 v
= codec_tuple(PyUnicode_EncodeASCII(
774 PyUnicode_AS_UNICODE(str
),
775 PyUnicode_GET_SIZE(str
),
777 PyUnicode_GET_SIZE(str
));
783 charmap_encode(PyObject
*self
,
787 const char *errors
= NULL
;
788 PyObject
*mapping
= NULL
;
790 if (!PyArg_ParseTuple(args
, "O|zO:charmap_encode",
791 &str
, &errors
, &mapping
))
793 if (mapping
== Py_None
)
796 str
= PyUnicode_FromObject(str
);
799 v
= codec_tuple(PyUnicode_EncodeCharmap(
800 PyUnicode_AS_UNICODE(str
),
801 PyUnicode_GET_SIZE(str
),
804 PyUnicode_GET_SIZE(str
));
809 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
812 mbcs_encode(PyObject
*self
,
816 const char *errors
= NULL
;
818 if (!PyArg_ParseTuple(args
, "O|z:mbcs_encode",
822 str
= PyUnicode_FromObject(str
);
825 v
= codec_tuple(PyUnicode_EncodeMBCS(
826 PyUnicode_AS_UNICODE(str
),
827 PyUnicode_GET_SIZE(str
),
829 PyUnicode_GET_SIZE(str
));
834 #endif /* MS_WINDOWS && HAVE_USABLE_WCHAR_T */
835 #endif /* Py_USING_UNICODE */
837 /* --- Error handler registry --------------------------------------------- */
839 PyDoc_STRVAR(register_error__doc__
,
840 "register_error(errors, handler)\n\
842 Register the specified error handler under the name\n\
843 errors. handler must be a callable object, that\n\
844 will be called with an exception instance containing\n\
845 information about the location of the encoding/decoding\n\
846 error and must return a (replacement, new position) tuple.");
848 static PyObject
*register_error(PyObject
*self
, PyObject
*args
)
853 if (!PyArg_ParseTuple(args
, "sO:register_error",
856 if (PyCodec_RegisterError(name
, handler
))
862 PyDoc_STRVAR(lookup_error__doc__
,
863 "lookup_error(errors) -> handler\n\
865 Return the error handler for the specified error handling name\n\
866 or raise a LookupError, if no handler exists under this name.");
868 static PyObject
*lookup_error(PyObject
*self
, PyObject
*args
)
872 if (!PyArg_ParseTuple(args
, "s:lookup_error",
875 return PyCodec_LookupError(name
);
878 /* --- Module API --------------------------------------------------------- */
880 static PyMethodDef _codecs_functions
[] = {
881 {"register", codec_register
, METH_VARARGS
,
883 {"lookup", codec_lookup
, METH_VARARGS
,
885 {"encode", codec_encode
, METH_VARARGS
,
887 {"decode", codec_decode
, METH_VARARGS
,
889 {"escape_encode", escape_encode
, METH_VARARGS
},
890 {"escape_decode", escape_decode
, METH_VARARGS
},
891 #ifdef Py_USING_UNICODE
892 {"utf_8_encode", utf_8_encode
, METH_VARARGS
},
893 {"utf_8_decode", utf_8_decode
, METH_VARARGS
},
894 {"utf_7_encode", utf_7_encode
, METH_VARARGS
},
895 {"utf_7_decode", utf_7_decode
, METH_VARARGS
},
896 {"utf_16_encode", utf_16_encode
, METH_VARARGS
},
897 {"utf_16_le_encode", utf_16_le_encode
, METH_VARARGS
},
898 {"utf_16_be_encode", utf_16_be_encode
, METH_VARARGS
},
899 {"utf_16_decode", utf_16_decode
, METH_VARARGS
},
900 {"utf_16_le_decode", utf_16_le_decode
, METH_VARARGS
},
901 {"utf_16_be_decode", utf_16_be_decode
, METH_VARARGS
},
902 {"utf_16_ex_decode", utf_16_ex_decode
, METH_VARARGS
},
903 {"unicode_escape_encode", unicode_escape_encode
, METH_VARARGS
},
904 {"unicode_escape_decode", unicode_escape_decode
, METH_VARARGS
},
905 {"unicode_internal_encode", unicode_internal_encode
, METH_VARARGS
},
906 {"unicode_internal_decode", unicode_internal_decode
, METH_VARARGS
},
907 {"raw_unicode_escape_encode", raw_unicode_escape_encode
, METH_VARARGS
},
908 {"raw_unicode_escape_decode", raw_unicode_escape_decode
, METH_VARARGS
},
909 {"latin_1_encode", latin_1_encode
, METH_VARARGS
},
910 {"latin_1_decode", latin_1_decode
, METH_VARARGS
},
911 {"ascii_encode", ascii_encode
, METH_VARARGS
},
912 {"ascii_decode", ascii_decode
, METH_VARARGS
},
913 {"charmap_encode", charmap_encode
, METH_VARARGS
},
914 {"charmap_decode", charmap_decode
, METH_VARARGS
},
915 {"readbuffer_encode", readbuffer_encode
, METH_VARARGS
},
916 {"charbuffer_encode", charbuffer_encode
, METH_VARARGS
},
917 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
918 {"mbcs_encode", mbcs_encode
, METH_VARARGS
},
919 {"mbcs_decode", mbcs_decode
, METH_VARARGS
},
921 #endif /* Py_USING_UNICODE */
922 {"register_error", register_error
, METH_VARARGS
,
923 register_error__doc__
},
924 {"lookup_error", lookup_error
, METH_VARARGS
,
925 lookup_error__doc__
},
926 {NULL
, NULL
} /* sentinel */
932 Py_InitModule("_codecs", _codecs_functions
);