1 /* ------------------------------------------------------------------------
3 _codecs -- Provides access to the codec registry and the builtin
6 This module should never be imported directly. The standard library
7 module "codecs" wraps this builtin module for use within Python.
9 The codec registry is accessible via:
11 register(search_function) -> None
13 lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
15 The builtin Unicode codecs use the following interface:
17 <encoding>_encode(Unicode_object[,errors='strict']) ->
18 (string object, characters consumed)
20 <encoding>_decode(char_buffer_obj[,errors='strict']) ->
21 (Unicode object, bytes consumed)
23 <encoding>_encode() interfaces also accept non-Unicode objects as
24 input. The objects are then converted to Unicode using
25 PyUnicode_FromObject() prior to applying the conversion.
27 These <encoding>s are available: utf_8, unicode_escape,
28 raw_unicode_escape, unicode_internal, latin_1, ascii (7-bit),
32 Written by Marc-Andre Lemburg (mal@lemburg.com).
34 Copyright (c) Corporation for National Research Initiatives.
36 ------------------------------------------------------------------------ */
40 /* --- Registry ----------------------------------------------------------- */
43 PyObject
*codecregister(PyObject
*self
, PyObject
*args
)
45 PyObject
*search_function
;
47 if (!PyArg_ParseTuple(args
, "O:register", &search_function
))
50 if (PyCodec_Register(search_function
))
61 PyObject
*codeclookup(PyObject
*self
, PyObject
*args
)
65 if (!PyArg_ParseTuple(args
, "s:lookup", &encoding
))
68 return _PyCodec_Lookup(encoding
);
74 /* --- Helpers ------------------------------------------------------------ */
77 PyObject
*codec_tuple(PyObject
*unicode
,
89 PyTuple_SET_ITEM(v
,0,unicode
);
90 w
= PyInt_FromLong(len
);
95 PyTuple_SET_ITEM(v
,1,w
);
99 /* --- Decoder ------------------------------------------------------------ */
102 unicode_internal_decode(PyObject
*self
,
106 const char *errors
= NULL
;
110 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_decode",
114 if (PyUnicode_Check(obj
))
115 return codec_tuple(obj
, PyUnicode_GET_SIZE(obj
));
117 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
119 return codec_tuple(PyUnicode_FromUnicode((Py_UNICODE
*)data
,
120 size
/ sizeof(Py_UNICODE
)),
126 utf_8_decode(PyObject
*self
,
131 const char *errors
= NULL
;
133 if (!PyArg_ParseTuple(args
, "t#|z:utf_8_decode",
134 &data
, &size
, &errors
))
137 return codec_tuple(PyUnicode_DecodeUTF8(data
, size
, errors
),
142 utf_16_decode(PyObject
*self
,
147 const char *errors
= NULL
;
150 if (!PyArg_ParseTuple(args
, "t#|z:utf_16_decode",
151 &data
, &size
, &errors
))
153 return codec_tuple(PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
),
158 utf_16_le_decode(PyObject
*self
,
163 const char *errors
= NULL
;
166 if (!PyArg_ParseTuple(args
, "t#|z:utf_16_le_decode",
167 &data
, &size
, &errors
))
169 return codec_tuple(PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
),
174 utf_16_be_decode(PyObject
*self
,
179 const char *errors
= NULL
;
182 if (!PyArg_ParseTuple(args
, "t#|z:utf_16_be_decode",
183 &data
, &size
, &errors
))
185 return codec_tuple(PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
),
189 /* This non-standard version also provides access to the byteorder
190 parameter of the builtin UTF-16 codec.
192 It returns a tuple (unicode, bytesread, byteorder) with byteorder
193 being the value in effect at the end of data.
198 utf_16_ex_decode(PyObject
*self
,
203 const char *errors
= NULL
;
205 PyObject
*unicode
, *tuple
;
207 if (!PyArg_ParseTuple(args
, "t#|zi:utf_16_ex_decode",
208 &data
, &size
, &errors
, &byteorder
))
211 unicode
= PyUnicode_DecodeUTF16(data
, size
, errors
, &byteorder
);
214 tuple
= Py_BuildValue("Oii", unicode
, size
, byteorder
);
220 unicode_escape_decode(PyObject
*self
,
225 const char *errors
= NULL
;
227 if (!PyArg_ParseTuple(args
, "t#|z:unicode_escape_decode",
228 &data
, &size
, &errors
))
231 return codec_tuple(PyUnicode_DecodeUnicodeEscape(data
, size
, errors
),
236 raw_unicode_escape_decode(PyObject
*self
,
241 const char *errors
= NULL
;
243 if (!PyArg_ParseTuple(args
, "t#|z:raw_unicode_escape_decode",
244 &data
, &size
, &errors
))
247 return codec_tuple(PyUnicode_DecodeRawUnicodeEscape(data
, size
, errors
),
252 latin_1_decode(PyObject
*self
,
257 const char *errors
= NULL
;
259 if (!PyArg_ParseTuple(args
, "t#|z:latin_1_decode",
260 &data
, &size
, &errors
))
263 return codec_tuple(PyUnicode_DecodeLatin1(data
, size
, errors
),
268 ascii_decode(PyObject
*self
,
273 const char *errors
= NULL
;
275 if (!PyArg_ParseTuple(args
, "t#|z:ascii_decode",
276 &data
, &size
, &errors
))
279 return codec_tuple(PyUnicode_DecodeASCII(data
, size
, errors
),
284 charmap_decode(PyObject
*self
,
289 const char *errors
= NULL
;
290 PyObject
*mapping
= NULL
;
292 if (!PyArg_ParseTuple(args
, "t#|zO:charmap_decode",
293 &data
, &size
, &errors
, &mapping
))
295 if (mapping
== Py_None
)
298 return codec_tuple(PyUnicode_DecodeCharmap(data
, size
, mapping
, errors
),
305 mbcs_decode(PyObject
*self
,
310 const char *errors
= NULL
;
312 if (!PyArg_ParseTuple(args
, "t#|z:mbcs_decode",
313 &data
, &size
, &errors
))
316 return codec_tuple(PyUnicode_DecodeMBCS(data
, size
, errors
),
320 #endif /* MS_WIN32 */
322 /* --- Encoder ------------------------------------------------------------ */
325 readbuffer_encode(PyObject
*self
,
330 const char *errors
= NULL
;
332 if (!PyArg_ParseTuple(args
, "s#|z:readbuffer_encode",
333 &data
, &size
, &errors
))
336 return codec_tuple(PyString_FromStringAndSize(data
, size
),
341 charbuffer_encode(PyObject
*self
,
346 const char *errors
= NULL
;
348 if (!PyArg_ParseTuple(args
, "t#|z:charbuffer_encode",
349 &data
, &size
, &errors
))
352 return codec_tuple(PyString_FromStringAndSize(data
, size
),
357 unicode_internal_encode(PyObject
*self
,
361 const char *errors
= NULL
;
365 if (!PyArg_ParseTuple(args
, "O|z:unicode_internal_encode",
369 if (PyUnicode_Check(obj
)) {
370 data
= PyUnicode_AS_DATA(obj
);
371 size
= PyUnicode_GET_DATA_SIZE(obj
);
372 return codec_tuple(PyString_FromStringAndSize(data
, size
),
376 if (PyObject_AsReadBuffer(obj
, (const void **)&data
, &size
))
378 return codec_tuple(PyString_FromStringAndSize(data
, size
),
384 utf_8_encode(PyObject
*self
,
388 const char *errors
= NULL
;
390 if (!PyArg_ParseTuple(args
, "O|z:utf_8_encode",
394 str
= PyUnicode_FromObject(str
);
397 v
= codec_tuple(PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(str
),
398 PyUnicode_GET_SIZE(str
),
400 PyUnicode_GET_SIZE(str
));
405 /* This version provides access to the byteorder parameter of the
406 builtin UTF-16 codecs as an optional third argument. It defaults to 0
407 which means: use the native byte order and prepend the data with a
413 utf_16_encode(PyObject
*self
,
417 const char *errors
= NULL
;
420 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_encode",
421 &str
, &errors
, &byteorder
))
424 str
= PyUnicode_FromObject(str
);
427 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
428 PyUnicode_GET_SIZE(str
),
431 PyUnicode_GET_SIZE(str
));
437 utf_16_le_encode(PyObject
*self
,
441 const char *errors
= NULL
;
443 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_le_encode",
447 str
= PyUnicode_FromObject(str
);
450 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
451 PyUnicode_GET_SIZE(str
),
454 PyUnicode_GET_SIZE(str
));
460 utf_16_be_encode(PyObject
*self
,
464 const char *errors
= NULL
;
466 if (!PyArg_ParseTuple(args
, "O|zi:utf_16_be_encode",
470 str
= PyUnicode_FromObject(str
);
473 v
= codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str
),
474 PyUnicode_GET_SIZE(str
),
477 PyUnicode_GET_SIZE(str
));
483 unicode_escape_encode(PyObject
*self
,
487 const char *errors
= NULL
;
489 if (!PyArg_ParseTuple(args
, "O|z:unicode_escape_encode",
493 str
= PyUnicode_FromObject(str
);
496 v
= codec_tuple(PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(str
),
497 PyUnicode_GET_SIZE(str
)),
498 PyUnicode_GET_SIZE(str
));
504 raw_unicode_escape_encode(PyObject
*self
,
508 const char *errors
= NULL
;
510 if (!PyArg_ParseTuple(args
, "O|z:raw_unicode_escape_encode",
514 str
= PyUnicode_FromObject(str
);
517 v
= codec_tuple(PyUnicode_EncodeRawUnicodeEscape(
518 PyUnicode_AS_UNICODE(str
),
519 PyUnicode_GET_SIZE(str
)),
520 PyUnicode_GET_SIZE(str
));
526 latin_1_encode(PyObject
*self
,
530 const char *errors
= NULL
;
532 if (!PyArg_ParseTuple(args
, "O|z:latin_1_encode",
536 str
= PyUnicode_FromObject(str
);
539 v
= codec_tuple(PyUnicode_EncodeLatin1(
540 PyUnicode_AS_UNICODE(str
),
541 PyUnicode_GET_SIZE(str
),
543 PyUnicode_GET_SIZE(str
));
549 ascii_encode(PyObject
*self
,
553 const char *errors
= NULL
;
555 if (!PyArg_ParseTuple(args
, "O|z:ascii_encode",
559 str
= PyUnicode_FromObject(str
);
562 v
= codec_tuple(PyUnicode_EncodeASCII(
563 PyUnicode_AS_UNICODE(str
),
564 PyUnicode_GET_SIZE(str
),
566 PyUnicode_GET_SIZE(str
));
572 charmap_encode(PyObject
*self
,
576 const char *errors
= NULL
;
577 PyObject
*mapping
= NULL
;
579 if (!PyArg_ParseTuple(args
, "O|zO:charmap_encode",
580 &str
, &errors
, &mapping
))
582 if (mapping
== Py_None
)
585 str
= PyUnicode_FromObject(str
);
588 v
= codec_tuple(PyUnicode_EncodeCharmap(
589 PyUnicode_AS_UNICODE(str
),
590 PyUnicode_GET_SIZE(str
),
593 PyUnicode_GET_SIZE(str
));
601 mbcs_encode(PyObject
*self
,
605 const char *errors
= NULL
;
607 if (!PyArg_ParseTuple(args
, "O|z:mbcs_encode",
611 str
= PyUnicode_FromObject(str
);
614 v
= codec_tuple(PyUnicode_EncodeMBCS(
615 PyUnicode_AS_UNICODE(str
),
616 PyUnicode_GET_SIZE(str
),
618 PyUnicode_GET_SIZE(str
));
623 #endif /* MS_WIN32 */
625 /* --- Module API --------------------------------------------------------- */
627 static PyMethodDef _codecs_functions
[] = {
628 {"register", codecregister
, 1},
629 {"lookup", codeclookup
, 1},
630 {"utf_8_encode", utf_8_encode
, 1},
631 {"utf_8_decode", utf_8_decode
, 1},
632 {"utf_16_encode", utf_16_encode
, 1},
633 {"utf_16_le_encode", utf_16_le_encode
, 1},
634 {"utf_16_be_encode", utf_16_be_encode
, 1},
635 {"utf_16_decode", utf_16_decode
, 1},
636 {"utf_16_le_decode", utf_16_le_decode
, 1},
637 {"utf_16_be_decode", utf_16_be_decode
, 1},
638 {"utf_16_ex_decode", utf_16_ex_decode
, 1},
639 {"unicode_escape_encode", unicode_escape_encode
, 1},
640 {"unicode_escape_decode", unicode_escape_decode
, 1},
641 {"unicode_internal_encode", unicode_internal_encode
, 1},
642 {"unicode_internal_decode", unicode_internal_decode
, 1},
643 {"raw_unicode_escape_encode", raw_unicode_escape_encode
, 1},
644 {"raw_unicode_escape_decode", raw_unicode_escape_decode
, 1},
645 {"latin_1_encode", latin_1_encode
, 1},
646 {"latin_1_decode", latin_1_decode
, 1},
647 {"ascii_encode", ascii_encode
, 1},
648 {"ascii_decode", ascii_decode
, 1},
649 {"charmap_encode", charmap_encode
, 1},
650 {"charmap_decode", charmap_decode
, 1},
651 {"readbuffer_encode", readbuffer_encode
, 1},
652 {"charbuffer_encode", charbuffer_encode
, 1},
654 {"mbcs_encode", mbcs_encode
, 1},
655 {"mbcs_decode", mbcs_decode
, 1},
657 {NULL
, NULL
} /* sentinel */
663 Py_InitModule("_codecs", _codecs_functions
);