1 /* ------------------------------------------------------------------------
3 _codecs -- Provides access to the codec registry and the builtin
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
9 The codec registry is accessible via:
11 register(search_function) -> None
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
15 The builtin Unicode codecs use the following interface:
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, bytes consumed)
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
23 <encoding>_encode() interfaces also accept non-Unicode object as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
34 Copyright (c) Corporation for National Research Initiatives.
36 ------------------------------------------------------------------------ */
40 /* --- Registry ----------------------------------------------------------- */
42 PyDoc_STRVAR(register__doc__
,
43 "register(search_function)\n\
45 Register a codec search function. Search functions are expected to take\n\
46 one argument, the encoding name in all lower case letters, and return\n\
47 a tuple of functions (encoder, decoder, stream_reader, stream_writer).");
50 PyObject
*codecregister(PyObject
*self
, PyObject
*args
)
52 PyObject
*search_function
;
54 if (!PyArg_ParseTuple(args
, "O:register", &search_function
))
57 if (PyCodec_Register(search_function
))
67 PyDoc_STRVAR(lookup__doc__
,
68 "lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)\n\
70 Looks up a codec tuple in the Python codec registry and returns\n\
71 a tuple of functions.");
74 PyObject
*codeclookup(PyObject
*self
, PyObject
*args
)
78 if (!PyArg_ParseTuple(args
, "s:lookup", &encoding
))
81 return _PyCodec_Lookup(encoding
);
87 /* --- Helpers ------------------------------------------------------------ */
90 PyObject
*codec_tuple(PyObject
*unicode
,
102 PyTuple_SET_ITEM(v
,0,unicode
);
103 w
= PyInt_FromLong(len
);
108 PyTuple_SET_ITEM(v
,1,w
);
112 /* --- String codecs ------------------------------------------------------ */
114 escape_decode(PyObject
*self
,
117 const char *errors
= NULL
;
121 if (!PyArg_ParseTuple(args
, "s#|z:escape_decode",
122 &data
, &size
, &errors
))
124 return codec_tuple(PyString_DecodeEscape(data
, size
, errors
, 0, NULL
),
129 escape_encode(PyObject
*self
,
133 const char *errors
= NULL
;
137 if (!PyArg_ParseTuple(args
, "O!|z:escape_encode",
138 &PyString_Type
, &str
, &errors
))
141 str
= PyString_Repr(str
, 0);
145 /* The string will be quoted. Unquote, similar to unicode-escape. */
146 buf
= PyString_AS_STRING (str
);
147 len
= PyString_GET_SIZE (str
);
148 memmove(buf
, buf
+1, len
-2);
149 _PyString_Resize(&str
, len
-2);
151 return codec_tuple(str
, PyString_Size(str
));
154 #ifdef Py_USING_UNICODE
155 /* --- Decoder ------------------------------------------------------------ */
158 unicode_internal_decode(PyObject
*self
,
162 const char *errors
= NULL
;
166 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_decode",
170 if (PyUnicode_Check(obj
)) {
172 return codec_tuple(obj
, PyUnicode_GET_SIZE(obj
));
175 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
177 return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE
*)data
,
178 size
/ sizeof(Py_UNICODE
)),
184 utf_7_decode(PyObject
*self
,
189 const char *errors
= NULL
;
191 if (!PyArg_ParseTuple(args
, "t#|z:utf_7_decode",
192 &data
, &size
, &errors
))
195 return codec_tuple(PyUnicode_DecodeUTF7(data
, size
, errors
),
200 utf_8_decode(PyObject
*self
,
205 const char *errors
= NULL
;
207 if (!PyArg_ParseTuple(args
, "t#|z:utf_8_decode",
208 &data
, &size
, &errors
))
211 return codec_tuple(PyUnicode_DecodeUTF8(data
, size
, errors
),
216 utf_16_decode(PyObject
*self
,
221 const char *errors
= NULL
;
224 if (!PyArg_ParseTuple(args
, "t#|z:utf_16_decode",
225 &data
, &size
, &errors
))
227 return codec_tuple(PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
),
232 utf_16_le_decode(PyObject
*self
,
237 const char *errors
= NULL
;
240 if (!PyArg_ParseTuple(args
, "t#|z:utf_16_le_decode",
241 &data
, &size
, &errors
))
243 return codec_tuple(PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
),
248 utf_16_be_decode(PyObject
*self
,
253 const char *errors
= NULL
;
256 if (!PyArg_ParseTuple(args
, "t#|z:utf_16_be_decode",
257 &data
, &size
, &errors
))
259 return codec_tuple(PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
),
263 /* This non-standard version also provides access to the byteorder
264 parameter of the builtin UTF-16 codec.
266 It returns a tuple (unicode, bytesread, byteorder) with byteorder
267 being the value in effect at the end of data.
272 utf_16_ex_decode(PyObject
*self
,
277 const char *errors
= NULL
;
279 PyObject
*unicode
, *tuple
;
281 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_ex_decode",
282 &data
, &size
, &errors
, &byteorder
))
285 unicode
= PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
);
288 tuple
= Py_BuildValue("Oii", unicode
, size
, byteorder
);
294 unicode_escape_decode(PyObject
*self
,
299 const char *errors
= NULL
;
301 if (!PyArg_ParseTuple(args
, "t#|z:unicode_escape_decode",
302 &data
, &size
, &errors
))
305 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data
, size
, errors
),
310 raw_unicode_escape_decode(PyObject
*self
,
315 const char *errors
= NULL
;
317 if (!PyArg_ParseTuple(args
, "t#|z:raw_unicode_escape_decode",
318 &data
, &size
, &errors
))
321 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data
, size
, errors
),
326 latin_1_decode(PyObject
*self
,
331 const char *errors
= NULL
;
333 if (!PyArg_ParseTuple(args
, "t#|z:latin_1_decode",
334 &data
, &size
, &errors
))
337 return codec_tuple(PyUnicode_DecodeLatin1(data
, size
, errors
),
342 ascii_decode(PyObject
*self
,
347 const char *errors
= NULL
;
349 if (!PyArg_ParseTuple(args
, "t#|z:ascii_decode",
350 &data
, &size
, &errors
))
353 return codec_tuple(PyUnicode_DecodeASCII(data
, size
, errors
),
358 charmap_decode(PyObject
*self
,
363 const char *errors
= NULL
;
364 PyObject
*mapping
= NULL
;
366 if (!PyArg_ParseTuple(args
, "t#|zO:charmap_decode",
367 &data
, &size
, &errors
, &mapping
))
369 if (mapping
== Py_None
)
372 return codec_tuple(PyUnicode_DecodeCharmap(data
, size
, mapping
, errors
),
376 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
379 mbcs_decode(PyObject
*self
,
384 const char *errors
= NULL
;
386 if (!PyArg_ParseTuple(args
, "t#|z:mbcs_decode",
387 &data
, &size
, &errors
))
390 return codec_tuple(PyUnicode_DecodeMBCS(data
, size
, errors
),
394 #endif /* MS_WINDOWS */
396 /* --- Encoder ------------------------------------------------------------ */
399 readbuffer_encode(PyObject
*self
,
404 const char *errors
= NULL
;
406 if (!PyArg_ParseTuple(args
, "s#|z:readbuffer_encode",
407 &data
, &size
, &errors
))
410 return codec_tuple(PyString_FromStringAndSize(data
, size
),
415 charbuffer_encode(PyObject
*self
,
420 const char *errors
= NULL
;
422 if (!PyArg_ParseTuple(args
, "t#|z:charbuffer_encode",
423 &data
, &size
, &errors
))
426 return codec_tuple(PyString_FromStringAndSize(data
, size
),
431 unicode_internal_encode(PyObject
*self
,
435 const char *errors
= NULL
;
439 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_encode",
443 if (PyUnicode_Check(obj
)) {
444 data
= PyUnicode_AS_DATA(obj
);
445 size
= PyUnicode_GET_DATA_SIZE(obj
);
446 return codec_tuple(PyString_FromStringAndSize(data
, size
),
450 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
452 return codec_tuple(PyString_FromStringAndSize(data
, size
),
458 utf_7_encode(PyObject
*self
,
462 const char *errors
= NULL
;
464 if (!PyArg_ParseTuple(args
, "O|z:utf_7_encode",
468 str
= PyUnicode_FromObject(str
);
471 v
= codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str
),
472 PyUnicode_GET_SIZE(str
),
476 PyUnicode_GET_SIZE(str
));
482 utf_8_encode(PyObject
*self
,
486 const char *errors
= NULL
;
488 if (!PyArg_ParseTuple(args
, "O|z:utf_8_encode",
492 str
= PyUnicode_FromObject(str
);
495 v
= codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str
),
496 PyUnicode_GET_SIZE(str
),
498 PyUnicode_GET_SIZE(str
));
503 /* This version provides access to the byteorder parameter of the
504 builtin UTF-16 codecs as optional third argument. It defaults to 0
505 which means: use the native byte order and prepend the data with a
511 utf_16_encode(PyObject
*self
,
515 const char *errors
= NULL
;
518 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_encode",
519 &str
, &errors
, &byteorder
))
522 str
= PyUnicode_FromObject(str
);
525 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
526 PyUnicode_GET_SIZE(str
),
529 PyUnicode_GET_SIZE(str
));
535 utf_16_le_encode(PyObject
*self
,
539 const char *errors
= NULL
;
541 if (!PyArg_ParseTuple(args
, "O|z:utf_16_le_encode",
545 str
= PyUnicode_FromObject(str
);
548 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
549 PyUnicode_GET_SIZE(str
),
552 PyUnicode_GET_SIZE(str
));
558 utf_16_be_encode(PyObject
*self
,
562 const char *errors
= NULL
;
564 if (!PyArg_ParseTuple(args
, "O|z:utf_16_be_encode",
568 str
= PyUnicode_FromObject(str
);
571 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
572 PyUnicode_GET_SIZE(str
),
575 PyUnicode_GET_SIZE(str
));
581 unicode_escape_encode(PyObject
*self
,
585 const char *errors
= NULL
;
587 if (!PyArg_ParseTuple(args
, "O|z:unicode_escape_encode",
591 str
= PyUnicode_FromObject(str
);
594 v
= codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str
),
595 PyUnicode_GET_SIZE(str
)),
596 PyUnicode_GET_SIZE(str
));
602 raw_unicode_escape_encode(PyObject
*self
,
606 const char *errors
= NULL
;
608 if (!PyArg_ParseTuple(args
, "O|z:raw_unicode_escape_encode",
612 str
= PyUnicode_FromObject(str
);
615 v
= codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
616 PyUnicode_AS_UNICODE(str
),
617 PyUnicode_GET_SIZE(str
)),
618 PyUnicode_GET_SIZE(str
));
624 latin_1_encode(PyObject
*self
,
628 const char *errors
= NULL
;
630 if (!PyArg_ParseTuple(args
, "O|z:latin_1_encode",
634 str
= PyUnicode_FromObject(str
);
637 v
= codec_tuple(PyUnicode_EncodeLatin1(
638 PyUnicode_AS_UNICODE(str
),
639 PyUnicode_GET_SIZE(str
),
641 PyUnicode_GET_SIZE(str
));
647 ascii_encode(PyObject
*self
,
651 const char *errors
= NULL
;
653 if (!PyArg_ParseTuple(args
, "O|z:ascii_encode",
657 str
= PyUnicode_FromObject(str
);
660 v
= codec_tuple(PyUnicode_EncodeASCII(
661 PyUnicode_AS_UNICODE(str
),
662 PyUnicode_GET_SIZE(str
),
664 PyUnicode_GET_SIZE(str
));
670 charmap_encode(PyObject
*self
,
674 const char *errors
= NULL
;
675 PyObject
*mapping
= NULL
;
677 if (!PyArg_ParseTuple(args
, "O|zO:charmap_encode",
678 &str
, &errors
, &mapping
))
680 if (mapping
== Py_None
)
683 str
= PyUnicode_FromObject(str
);
686 v
= codec_tuple(PyUnicode_EncodeCharmap(
687 PyUnicode_AS_UNICODE(str
),
688 PyUnicode_GET_SIZE(str
),
691 PyUnicode_GET_SIZE(str
));
696 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
699 mbcs_encode(PyObject
*self
,
703 const char *errors
= NULL
;
705 if (!PyArg_ParseTuple(args
, "O|z:mbcs_encode",
709 str
= PyUnicode_FromObject(str
);
712 v
= codec_tuple(PyUnicode_EncodeMBCS(
713 PyUnicode_AS_UNICODE(str
),
714 PyUnicode_GET_SIZE(str
),
716 PyUnicode_GET_SIZE(str
));
721 #endif /* MS_WINDOWS */
722 #endif /* Py_USING_UNICODE */
724 /* --- Error handler registry --------------------------------------------- */
726 PyDoc_STRVAR(register_error__doc__
,
727 "register_error(errors, handler)\n\
729 Register the specified error handler under the name\n\
730 errors. handler must be a callable object, that\n\
731 will be called with an exception instance containing\n\
732 information about the location of the encoding/decoding\n\
733 error and must return a (replacement, new position) tuple.");
735 static PyObject
*register_error(PyObject
*self
, PyObject
*args
)
740 if (!PyArg_ParseTuple(args
, "sO:register_error",
743 if (PyCodec_RegisterError(name
, handler
))
749 PyDoc_STRVAR(lookup_error__doc__
,
750 "lookup_error(errors) -> handler\n\
752 Return the error handler for the specified error handling name\n\
753 or raise a LookupError, if no handler exists under this name.");
755 static PyObject
*lookup_error(PyObject
*self
, PyObject
*args
)
759 if (!PyArg_ParseTuple(args
, "s:lookup_error",
762 return PyCodec_LookupError(name
);
765 /* --- Module API --------------------------------------------------------- */
767 static PyMethodDef _codecs_functions
[] = {
768 {"register", codecregister
, METH_VARARGS
,
770 {"lookup", codeclookup
, METH_VARARGS
,
772 {"escape_encode", escape_encode
, METH_VARARGS
},
773 {"escape_decode", escape_decode
, METH_VARARGS
},
774 #ifdef Py_USING_UNICODE
775 {"utf_8_encode", utf_8_encode
, METH_VARARGS
},
776 {"utf_8_decode", utf_8_decode
, METH_VARARGS
},
777 {"utf_7_encode", utf_7_encode
, METH_VARARGS
},
778 {"utf_7_decode", utf_7_decode
, METH_VARARGS
},
779 {"utf_16_encode", utf_16_encode
, METH_VARARGS
},
780 {"utf_16_le_encode", utf_16_le_encode
, METH_VARARGS
},
781 {"utf_16_be_encode", utf_16_be_encode
, METH_VARARGS
},
782 {"utf_16_decode", utf_16_decode
, METH_VARARGS
},
783 {"utf_16_le_decode", utf_16_le_decode
, METH_VARARGS
},
784 {"utf_16_be_decode", utf_16_be_decode
, METH_VARARGS
},
785 {"utf_16_ex_decode", utf_16_ex_decode
, METH_VARARGS
},
786 {"unicode_escape_encode", unicode_escape_encode
, METH_VARARGS
},
787 {"unicode_escape_decode", unicode_escape_decode
, METH_VARARGS
},
788 {"unicode_internal_encode", unicode_internal_encode
, METH_VARARGS
},
789 {"unicode_internal_decode", unicode_internal_decode
, METH_VARARGS
},
790 {"raw_unicode_escape_encode", raw_unicode_escape_encode
, METH_VARARGS
},
791 {"raw_unicode_escape_decode", raw_unicode_escape_decode
, METH_VARARGS
},
792 {"latin_1_encode", latin_1_encode
, METH_VARARGS
},
793 {"latin_1_decode", latin_1_decode
, METH_VARARGS
},
794 {"ascii_encode", ascii_encode
, METH_VARARGS
},
795 {"ascii_decode", ascii_decode
, METH_VARARGS
},
796 {"charmap_encode", charmap_encode
, METH_VARARGS
},
797 {"charmap_decode", charmap_decode
, METH_VARARGS
},
798 {"readbuffer_encode", readbuffer_encode
, METH_VARARGS
},
799 {"charbuffer_encode", charbuffer_encode
, METH_VARARGS
},
800 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
801 {"mbcs_encode", mbcs_encode
, METH_VARARGS
},
802 {"mbcs_decode", mbcs_decode
, METH_VARARGS
},
804 #endif /* Py_USING_UNICODE */
805 {"register_error", register_error
, METH_VARARGS
,
806 register_error__doc__
},
807 {"lookup_error", lookup_error
, METH_VARARGS
,
808 lookup_error__doc__
},
809 {NULL
, NULL
} /* sentinel */
815 Py_InitModule("_codecs", _codecs_functions
);