Maintain backwards compatibility with python < 2.3 by dynamically
[python/dscho.git] / Modules / _codecsmodule.c
blob210be516f949254ec32efc51cf4f9e65334f91fc
1 /* ------------------------------------------------------------------------
3 _codecs -- Provides access to the codec registry and the builtin
4 codecs.
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
9 The codec registry is accessible via:
11 register(search_function) -> None
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
15 The builtin Unicode codecs use the following interface:
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, bytes consumed)
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
23 <encoding>_encode() interfaces also accept non-Unicode object as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
29 mbcs (on win32).
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
34 Copyright (c) Corporation for National Research Initiatives.
36 ------------------------------------------------------------------------ */
38 #include "Python.h"
40 /* --- Registry ----------------------------------------------------------- */
42 PyDoc_STRVAR(register__doc__,
43 "register(search_function)\n\
44 \n\
45 Register a codec search function. Search functions are expected to take\n\
46 one argument, the encoding name in all lower case letters, and return\n\
47 a tuple of functions (encoder, decoder, stream_reader, stream_writer).");
49 static
50 PyObject *codecregister(PyObject *self, PyObject *args)
52 PyObject *search_function;
54 if (!PyArg_ParseTuple(args, "O:register", &search_function))
55 goto onError;
57 if (PyCodec_Register(search_function))
58 goto onError;
60 Py_INCREF(Py_None);
61 return Py_None;
63 onError:
64 return NULL;
67 PyDoc_STRVAR(lookup__doc__,
68 "lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)\n\
69 \n\
70 Looks up a codec tuple in the Python codec registry and returns\n\
71 a tuple of functions.");
73 static
74 PyObject *codeclookup(PyObject *self, PyObject *args)
76 char *encoding;
78 if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
79 goto onError;
81 return _PyCodec_Lookup(encoding);
83 onError:
84 return NULL;
87 /* --- Helpers ------------------------------------------------------------ */
89 static
90 PyObject *codec_tuple(PyObject *unicode,
91 int len)
93 PyObject *v,*w;
95 if (unicode == NULL)
96 return NULL;
97 v = PyTuple_New(2);
98 if (v == NULL) {
99 Py_DECREF(unicode);
100 return NULL;
102 PyTuple_SET_ITEM(v,0,unicode);
103 w = PyInt_FromLong(len);
104 if (w == NULL) {
105 Py_DECREF(v);
106 return NULL;
108 PyTuple_SET_ITEM(v,1,w);
109 return v;
112 /* --- String codecs ------------------------------------------------------ */
113 static PyObject *
114 escape_decode(PyObject *self,
115 PyObject *args)
117 const char *errors = NULL;
118 const char *data;
119 int size;
121 if (!PyArg_ParseTuple(args, "s#|z:escape_decode",
122 &data, &size, &errors))
123 return NULL;
124 return codec_tuple(PyString_DecodeEscape(data, size, errors, 0, NULL),
125 size);
128 static PyObject *
129 escape_encode(PyObject *self,
130 PyObject *args)
132 PyObject *str;
133 const char *errors = NULL;
134 char *buf;
135 int len;
137 if (!PyArg_ParseTuple(args, "O!|z:escape_encode",
138 &PyString_Type, &str, &errors))
139 return NULL;
141 str = PyString_Repr(str, 0);
142 if (!str)
143 return NULL;
145 /* The string will be quoted. Unquote, similar to unicode-escape. */
146 buf = PyString_AS_STRING (str);
147 len = PyString_GET_SIZE (str);
148 memmove(buf, buf+1, len-2);
149 _PyString_Resize(&str, len-2);
151 return codec_tuple(str, PyString_Size(str));
154 #ifdef Py_USING_UNICODE
155 /* --- Decoder ------------------------------------------------------------ */
157 static PyObject *
158 unicode_internal_decode(PyObject *self,
159 PyObject *args)
161 PyObject *obj;
162 const char *errors = NULL;
163 const char *data;
164 int size;
166 if (!PyArg_ParseTuple(args, "O|z:unicode_internal_decode",
167 &obj, &errors))
168 return NULL;
170 if (PyUnicode_Check(obj)) {
171 Py_INCREF(obj);
172 return codec_tuple(obj, PyUnicode_GET_SIZE(obj));
174 else {
175 if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
176 return NULL;
177 return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE *)data,
178 size / sizeof(Py_UNICODE)),
179 size);
183 static PyObject *
184 utf_7_decode(PyObject *self,
185 PyObject *args)
187 const char *data;
188 int size;
189 const char *errors = NULL;
191 if (!PyArg_ParseTuple(args, "t#|z:utf_7_decode",
192 &data, &size, &errors))
193 return NULL;
195 return codec_tuple(PyUnicode_DecodeUTF7(data, size, errors),
196 size);
199 static PyObject *
200 utf_8_decode(PyObject *self,
201 PyObject *args)
203 const char *data;
204 int size;
205 const char *errors = NULL;
207 if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode",
208 &data, &size, &errors))
209 return NULL;
211 return codec_tuple(PyUnicode_DecodeUTF8(data, size, errors),
212 size);
215 static PyObject *
216 utf_16_decode(PyObject *self,
217 PyObject *args)
219 const char *data;
220 int size;
221 const char *errors = NULL;
222 int byteorder = 0;
224 if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode",
225 &data, &size, &errors))
226 return NULL;
227 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
228 size);
231 static PyObject *
232 utf_16_le_decode(PyObject *self,
233 PyObject *args)
235 const char *data;
236 int size;
237 const char *errors = NULL;
238 int byteorder = -1;
240 if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode",
241 &data, &size, &errors))
242 return NULL;
243 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
244 size);
247 static PyObject *
248 utf_16_be_decode(PyObject *self,
249 PyObject *args)
251 const char *data;
252 int size;
253 const char *errors = NULL;
254 int byteorder = 1;
256 if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode",
257 &data, &size, &errors))
258 return NULL;
259 return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
260 size);
263 /* This non-standard version also provides access to the byteorder
264 parameter of the builtin UTF-16 codec.
266 It returns a tuple (unicode, bytesread, byteorder) with byteorder
267 being the value in effect at the end of data.
271 static PyObject *
272 utf_16_ex_decode(PyObject *self,
273 PyObject *args)
275 const char *data;
276 int size;
277 const char *errors = NULL;
278 int byteorder = 0;
279 PyObject *unicode, *tuple;
281 if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode",
282 &data, &size, &errors, &byteorder))
283 return NULL;
285 unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder);
286 if (unicode == NULL)
287 return NULL;
288 tuple = Py_BuildValue("Oii", unicode, size, byteorder);
289 Py_DECREF(unicode);
290 return tuple;
293 static PyObject *
294 unicode_escape_decode(PyObject *self,
295 PyObject *args)
297 const char *data;
298 int size;
299 const char *errors = NULL;
301 if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode",
302 &data, &size, &errors))
303 return NULL;
305 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data, size, errors),
306 size);
309 static PyObject *
310 raw_unicode_escape_decode(PyObject *self,
311 PyObject *args)
313 const char *data;
314 int size;
315 const char *errors = NULL;
317 if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode",
318 &data, &size, &errors))
319 return NULL;
321 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data, size, errors),
322 size);
325 static PyObject *
326 latin_1_decode(PyObject *self,
327 PyObject *args)
329 const char *data;
330 int size;
331 const char *errors = NULL;
333 if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode",
334 &data, &size, &errors))
335 return NULL;
337 return codec_tuple(PyUnicode_DecodeLatin1(data, size, errors),
338 size);
341 static PyObject *
342 ascii_decode(PyObject *self,
343 PyObject *args)
345 const char *data;
346 int size;
347 const char *errors = NULL;
349 if (!PyArg_ParseTuple(args, "t#|z:ascii_decode",
350 &data, &size, &errors))
351 return NULL;
353 return codec_tuple(PyUnicode_DecodeASCII(data, size, errors),
354 size);
357 static PyObject *
358 charmap_decode(PyObject *self,
359 PyObject *args)
361 const char *data;
362 int size;
363 const char *errors = NULL;
364 PyObject *mapping = NULL;
366 if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode",
367 &data, &size, &errors, &mapping))
368 return NULL;
369 if (mapping == Py_None)
370 mapping = NULL;
372 return codec_tuple(PyUnicode_DecodeCharmap(data, size, mapping, errors),
373 size);
376 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
378 static PyObject *
379 mbcs_decode(PyObject *self,
380 PyObject *args)
382 const char *data;
383 int size;
384 const char *errors = NULL;
386 if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
387 &data, &size, &errors))
388 return NULL;
390 return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors),
391 size);
394 #endif /* MS_WINDOWS */
396 /* --- Encoder ------------------------------------------------------------ */
398 static PyObject *
399 readbuffer_encode(PyObject *self,
400 PyObject *args)
402 const char *data;
403 int size;
404 const char *errors = NULL;
406 if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
407 &data, &size, &errors))
408 return NULL;
410 return codec_tuple(PyString_FromStringAndSize(data, size),
411 size);
414 static PyObject *
415 charbuffer_encode(PyObject *self,
416 PyObject *args)
418 const char *data;
419 int size;
420 const char *errors = NULL;
422 if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
423 &data, &size, &errors))
424 return NULL;
426 return codec_tuple(PyString_FromStringAndSize(data, size),
427 size);
430 static PyObject *
431 unicode_internal_encode(PyObject *self,
432 PyObject *args)
434 PyObject *obj;
435 const char *errors = NULL;
436 const char *data;
437 int size;
439 if (!PyArg_ParseTuple(args, "O|z:unicode_internal_encode",
440 &obj, &errors))
441 return NULL;
443 if (PyUnicode_Check(obj)) {
444 data = PyUnicode_AS_DATA(obj);
445 size = PyUnicode_GET_DATA_SIZE(obj);
446 return codec_tuple(PyString_FromStringAndSize(data, size),
447 size);
449 else {
450 if (PyObject_AsReadBuffer(obj, (const void **)&data, &size))
451 return NULL;
452 return codec_tuple(PyString_FromStringAndSize(data, size),
453 size);
457 static PyObject *
458 utf_7_encode(PyObject *self,
459 PyObject *args)
461 PyObject *str, *v;
462 const char *errors = NULL;
464 if (!PyArg_ParseTuple(args, "O|z:utf_7_encode",
465 &str, &errors))
466 return NULL;
468 str = PyUnicode_FromObject(str);
469 if (str == NULL)
470 return NULL;
471 v = codec_tuple(PyUnicode_EncodeUTF7(PyUnicode_AS_UNICODE(str),
472 PyUnicode_GET_SIZE(str),
475 errors),
476 PyUnicode_GET_SIZE(str));
477 Py_DECREF(str);
478 return v;
481 static PyObject *
482 utf_8_encode(PyObject *self,
483 PyObject *args)
485 PyObject *str, *v;
486 const char *errors = NULL;
488 if (!PyArg_ParseTuple(args, "O|z:utf_8_encode",
489 &str, &errors))
490 return NULL;
492 str = PyUnicode_FromObject(str);
493 if (str == NULL)
494 return NULL;
495 v = codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str),
496 PyUnicode_GET_SIZE(str),
497 errors),
498 PyUnicode_GET_SIZE(str));
499 Py_DECREF(str);
500 return v;
503 /* This version provides access to the byteorder parameter of the
504 builtin UTF-16 codecs as optional third argument. It defaults to 0
505 which means: use the native byte order and prepend the data with a
506 BOM mark.
510 static PyObject *
511 utf_16_encode(PyObject *self,
512 PyObject *args)
514 PyObject *str, *v;
515 const char *errors = NULL;
516 int byteorder = 0;
518 if (!PyArg_ParseTuple(args, "O|zi:utf_16_encode",
519 &str, &errors, &byteorder))
520 return NULL;
522 str = PyUnicode_FromObject(str);
523 if (str == NULL)
524 return NULL;
525 v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
526 PyUnicode_GET_SIZE(str),
527 errors,
528 byteorder),
529 PyUnicode_GET_SIZE(str));
530 Py_DECREF(str);
531 return v;
534 static PyObject *
535 utf_16_le_encode(PyObject *self,
536 PyObject *args)
538 PyObject *str, *v;
539 const char *errors = NULL;
541 if (!PyArg_ParseTuple(args, "O|z:utf_16_le_encode",
542 &str, &errors))
543 return NULL;
545 str = PyUnicode_FromObject(str);
546 if (str == NULL)
547 return NULL;
548 v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
549 PyUnicode_GET_SIZE(str),
550 errors,
551 -1),
552 PyUnicode_GET_SIZE(str));
553 Py_DECREF(str);
554 return v;
557 static PyObject *
558 utf_16_be_encode(PyObject *self,
559 PyObject *args)
561 PyObject *str, *v;
562 const char *errors = NULL;
564 if (!PyArg_ParseTuple(args, "O|z:utf_16_be_encode",
565 &str, &errors))
566 return NULL;
568 str = PyUnicode_FromObject(str);
569 if (str == NULL)
570 return NULL;
571 v = codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
572 PyUnicode_GET_SIZE(str),
573 errors,
574 +1),
575 PyUnicode_GET_SIZE(str));
576 Py_DECREF(str);
577 return v;
580 static PyObject *
581 unicode_escape_encode(PyObject *self,
582 PyObject *args)
584 PyObject *str, *v;
585 const char *errors = NULL;
587 if (!PyArg_ParseTuple(args, "O|z:unicode_escape_encode",
588 &str, &errors))
589 return NULL;
591 str = PyUnicode_FromObject(str);
592 if (str == NULL)
593 return NULL;
594 v = codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str),
595 PyUnicode_GET_SIZE(str)),
596 PyUnicode_GET_SIZE(str));
597 Py_DECREF(str);
598 return v;
601 static PyObject *
602 raw_unicode_escape_encode(PyObject *self,
603 PyObject *args)
605 PyObject *str, *v;
606 const char *errors = NULL;
608 if (!PyArg_ParseTuple(args, "O|z:raw_unicode_escape_encode",
609 &str, &errors))
610 return NULL;
612 str = PyUnicode_FromObject(str);
613 if (str == NULL)
614 return NULL;
615 v = codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
616 PyUnicode_AS_UNICODE(str),
617 PyUnicode_GET_SIZE(str)),
618 PyUnicode_GET_SIZE(str));
619 Py_DECREF(str);
620 return v;
623 static PyObject *
624 latin_1_encode(PyObject *self,
625 PyObject *args)
627 PyObject *str, *v;
628 const char *errors = NULL;
630 if (!PyArg_ParseTuple(args, "O|z:latin_1_encode",
631 &str, &errors))
632 return NULL;
634 str = PyUnicode_FromObject(str);
635 if (str == NULL)
636 return NULL;
637 v = codec_tuple(PyUnicode_EncodeLatin1(
638 PyUnicode_AS_UNICODE(str),
639 PyUnicode_GET_SIZE(str),
640 errors),
641 PyUnicode_GET_SIZE(str));
642 Py_DECREF(str);
643 return v;
646 static PyObject *
647 ascii_encode(PyObject *self,
648 PyObject *args)
650 PyObject *str, *v;
651 const char *errors = NULL;
653 if (!PyArg_ParseTuple(args, "O|z:ascii_encode",
654 &str, &errors))
655 return NULL;
657 str = PyUnicode_FromObject(str);
658 if (str == NULL)
659 return NULL;
660 v = codec_tuple(PyUnicode_EncodeASCII(
661 PyUnicode_AS_UNICODE(str),
662 PyUnicode_GET_SIZE(str),
663 errors),
664 PyUnicode_GET_SIZE(str));
665 Py_DECREF(str);
666 return v;
669 static PyObject *
670 charmap_encode(PyObject *self,
671 PyObject *args)
673 PyObject *str, *v;
674 const char *errors = NULL;
675 PyObject *mapping = NULL;
677 if (!PyArg_ParseTuple(args, "O|zO:charmap_encode",
678 &str, &errors, &mapping))
679 return NULL;
680 if (mapping == Py_None)
681 mapping = NULL;
683 str = PyUnicode_FromObject(str);
684 if (str == NULL)
685 return NULL;
686 v = codec_tuple(PyUnicode_EncodeCharmap(
687 PyUnicode_AS_UNICODE(str),
688 PyUnicode_GET_SIZE(str),
689 mapping,
690 errors),
691 PyUnicode_GET_SIZE(str));
692 Py_DECREF(str);
693 return v;
696 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
698 static PyObject *
699 mbcs_encode(PyObject *self,
700 PyObject *args)
702 PyObject *str, *v;
703 const char *errors = NULL;
705 if (!PyArg_ParseTuple(args, "O|z:mbcs_encode",
706 &str, &errors))
707 return NULL;
709 str = PyUnicode_FromObject(str);
710 if (str == NULL)
711 return NULL;
712 v = codec_tuple(PyUnicode_EncodeMBCS(
713 PyUnicode_AS_UNICODE(str),
714 PyUnicode_GET_SIZE(str),
715 errors),
716 PyUnicode_GET_SIZE(str));
717 Py_DECREF(str);
718 return v;
721 #endif /* MS_WINDOWS */
722 #endif /* Py_USING_UNICODE */
724 /* --- Error handler registry --------------------------------------------- */
726 PyDoc_STRVAR(register_error__doc__,
727 "register_error(errors, handler)\n\
729 Register the specified error handler under the name\n\
730 errors. handler must be a callable object, that\n\
731 will be called with an exception instance containing\n\
732 information about the location of the encoding/decoding\n\
733 error and must return a (replacement, new position) tuple.");
735 static PyObject *register_error(PyObject *self, PyObject *args)
737 const char *name;
738 PyObject *handler;
740 if (!PyArg_ParseTuple(args, "sO:register_error",
741 &name, &handler))
742 return NULL;
743 if (PyCodec_RegisterError(name, handler))
744 return NULL;
745 Py_INCREF(Py_None);
746 return Py_None;
749 PyDoc_STRVAR(lookup_error__doc__,
750 "lookup_error(errors) -> handler\n\
752 Return the error handler for the specified error handling name\n\
753 or raise a LookupError, if no handler exists under this name.");
755 static PyObject *lookup_error(PyObject *self, PyObject *args)
757 const char *name;
759 if (!PyArg_ParseTuple(args, "s:lookup_error",
760 &name))
761 return NULL;
762 return PyCodec_LookupError(name);
765 /* --- Module API --------------------------------------------------------- */
767 static PyMethodDef _codecs_functions[] = {
768 {"register", codecregister, METH_VARARGS,
769 register__doc__},
770 {"lookup", codeclookup, METH_VARARGS,
771 lookup__doc__},
772 {"escape_encode", escape_encode, METH_VARARGS},
773 {"escape_decode", escape_decode, METH_VARARGS},
774 #ifdef Py_USING_UNICODE
775 {"utf_8_encode", utf_8_encode, METH_VARARGS},
776 {"utf_8_decode", utf_8_decode, METH_VARARGS},
777 {"utf_7_encode", utf_7_encode, METH_VARARGS},
778 {"utf_7_decode", utf_7_decode, METH_VARARGS},
779 {"utf_16_encode", utf_16_encode, METH_VARARGS},
780 {"utf_16_le_encode", utf_16_le_encode, METH_VARARGS},
781 {"utf_16_be_encode", utf_16_be_encode, METH_VARARGS},
782 {"utf_16_decode", utf_16_decode, METH_VARARGS},
783 {"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
784 {"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
785 {"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
786 {"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
787 {"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
788 {"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},
789 {"unicode_internal_decode", unicode_internal_decode, METH_VARARGS},
790 {"raw_unicode_escape_encode", raw_unicode_escape_encode, METH_VARARGS},
791 {"raw_unicode_escape_decode", raw_unicode_escape_decode, METH_VARARGS},
792 {"latin_1_encode", latin_1_encode, METH_VARARGS},
793 {"latin_1_decode", latin_1_decode, METH_VARARGS},
794 {"ascii_encode", ascii_encode, METH_VARARGS},
795 {"ascii_decode", ascii_decode, METH_VARARGS},
796 {"charmap_encode", charmap_encode, METH_VARARGS},
797 {"charmap_decode", charmap_decode, METH_VARARGS},
798 {"readbuffer_encode", readbuffer_encode, METH_VARARGS},
799 {"charbuffer_encode", charbuffer_encode, METH_VARARGS},
800 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
801 {"mbcs_encode", mbcs_encode, METH_VARARGS},
802 {"mbcs_decode", mbcs_decode, METH_VARARGS},
803 #endif
804 #endif /* Py_USING_UNICODE */
805 {"register_error", register_error, METH_VARARGS,
806 register_error__doc__},
807 {"lookup_error", lookup_error, METH_VARARGS,
808 lookup_error__doc__},
809 {NULL, NULL} /* sentinel */
812 PyMODINIT_FUNC
813 init_codecs(void)
815 Py_InitModule("_codecs", _codecs_functions);