4 * libiconv adaptor for Python iconvcodec
6 * Author : Hye-Shik Chang <perky@FreeBSD.org>
7 * Created : 17 January 2003
14 static const char *__version__
= "$Revision$";
17 # if Py_UNICODE_SIZE == 2
18 # ifdef __GNU_LIBRARY__
19 # define UNICODE_ENCODING "ucs-2"
21 # define UNICODE_ENCODING "ucs-2-internal"
23 # define MBENCODED_LENGTH_MAX 4
24 # elif Py_UNICODE_SIZE == 4
25 # ifdef __GNU_LIBRARY__
26 # define UNICODE_ENCODING "ucs-4"
28 # define UNICODE_ENCODING "ucs-4-internal"
30 # define MBENCODED_LENGTH_MAX 6
33 # error "Unicode is not available"
38 iconv_t enchdl
, dechdl
;
41 PyDoc_STRVAR(iconvcodec_doc
, "iconvcodec object");
43 staticforward PyTypeObject iconvcodec_Type
;
46 #define ERROR_STRICT (PyObject *)(1)
47 #define ERROR_IGNORE (PyObject *)(2)
48 #define ERROR_REPLACE (PyObject *)(3)
49 #define ERROR_MAX ERROR_REPLACE
51 #define REPLACEMENT_CHAR_DECODE 0xFFFD
52 #define REPLACEMENT_CHAR_ENCODE '?'
54 #define DEFAULT_ENCODING "utf-8"
58 get_errorcallback(const char *errors
)
60 if (errors
== NULL
|| strcmp(errors
, "strict") == 0)
62 else if (strcmp(errors
, "ignore") == 0)
64 else if (strcmp(errors
, "replace") == 0)
67 return PyCodec_LookupError(errors
);
71 PyDoc_STRVAR(iconvcodec_encode__doc__
,
72 "I.encode(unicode, [,errors]) -> (string, length consumed)\n\
74 Return an encoded string version of `unicode'. errors may be given to\n\
75 set a different error handling scheme. Default is 'strict' meaning that\n\
76 encoding errors raise a UnicodeEncodeError. Other possible values are\n\
77 'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\
78 registered with codecs.register_error that can handle UnicodeEncodeErrors.");
81 iconvcodec_encode(iconvcodecObject
*self
, PyObject
*args
, PyObject
*kwargs
)
83 static char *kwlist
[] = { "input", "errors", NULL
};
86 char *errors
= NULL
/*strict*/, *out
, *out_top
;
87 const char *inp
, *inp_top
;
88 size_t inplen
, inplen_total
, outlen
, outlen_total
, estep
;
89 PyObject
*outputobj
= NULL
, *errorcb
= NULL
,
92 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "u#|s:encode",
93 kwlist
, &input
, &inputlen
, &errors
))
94 return NULL
; /* TypeError */
96 errorcb
= get_errorcallback(errors
);
98 return NULL
; /* LookupError or something else from error handler */
100 inp
= inp_top
= (char *)input
;
101 inplen
= inplen_total
= (size_t)(inputlen
* Py_UNICODE_SIZE
);
103 outlen
= inputlen
* MBENCODED_LENGTH_MAX
;
105 outlen
= 16; /* for iso-2022 codecs */
107 outputobj
= PyString_FromStringAndSize(NULL
, outlen
);
108 if (outputobj
== NULL
)
110 out
= out_top
= PyString_AS_STRING(outputobj
);
111 outlen_total
= outlen
;
113 estep
= inputlen
* Py_UNICODE_SIZE
/ 2;
115 #define RESIZE_OUTBUFFER(size) { \
116 size_t toadd = (size); \
117 outlen_total += toadd; \
119 if (_PyString_Resize(&outputobj, outlen_total) == -1) \
121 out = PyString_AS_STRING(outputobj) + (out - out_top); \
122 out_top = PyString_AS_STRING(outputobj); \
125 if (iconv(self
->enchdl
, (char**)&inp
, &inplen
, &out
, &outlen
) == -1) {
129 if (errno
== E2BIG
) {
130 RESIZE_OUTBUFFER(estep
);
134 if (errorcb
== ERROR_IGNORE
|| errorcb
== ERROR_REPLACE
) {
135 inplen
-= Py_UNICODE_SIZE
;
136 inp
+= Py_UNICODE_SIZE
;
137 if (errorcb
== ERROR_REPLACE
) {
139 RESIZE_OUTBUFFER(errno
== EINVAL
? 1 : estep
);
141 *out
++ = REPLACEMENT_CHAR_ENCODE
;
143 if (errno
== EINVAL
) break;
147 errpos
= (int)(inp
- inp_top
) / Py_UNICODE_SIZE
;
148 sprintf(reason
, "Undefined character map from "
149 #if Py_UNICODE_SIZE == 2
151 #elif Py_UNICODE_SIZE == 4
154 , *(Py_UNICODE
*)inp
);
156 if (exceptionobj
== NULL
) {
157 if ((exceptionobj
= PyUnicodeEncodeError_Create(
158 self
->encoding
, input
, inputlen
,
159 errpos
, errpos
+ 1, reason
)) == NULL
)
162 if (PyUnicodeEncodeError_SetStart(exceptionobj
, errpos
) != 0)
164 if (PyUnicodeEncodeError_SetEnd(exceptionobj
, errpos
+ 1) != 0)
166 if (PyUnicodeEncodeError_SetReason(exceptionobj
, reason
) != 0)
170 if (errorcb
== ERROR_STRICT
) {
171 PyCodec_StrictErrors(exceptionobj
);
174 PyObject
*argsobj
, *retobj
, *retuni
;
177 argsobj
= PyTuple_New(1);
180 PyTuple_SET_ITEM(argsobj
, 0, exceptionobj
);
181 Py_INCREF(exceptionobj
);
182 retobj
= PyObject_CallObject(errorcb
, argsobj
);
187 if (!PyTuple_Check(retobj
) || PyTuple_GET_SIZE(retobj
) != 2 ||
188 !PyUnicode_Check((retuni
= PyTuple_GET_ITEM(retobj
, 0))) ||
189 !PyInt_Check(PyTuple_GET_ITEM(retobj
, 1))) {
191 PyErr_SetString(PyExc_ValueError
, "encoding error handler "
192 "must return (unicode, int) tuple");
195 if (PyUnicode_GET_SIZE(retuni
) > 0) {
196 #define errorexit errorexit_cbpad
197 PyObject
*retstr
= NULL
;
200 retstr
= PyUnicode_AsEncodedString(
201 retuni
, self
->encoding
, NULL
);
202 if (retstr
== NULL
|| !PyString_Check(retstr
))
205 retstrsize
= PyString_GET_SIZE(retstr
);
206 if (outlen
< retstrsize
)
207 RESIZE_OUTBUFFER(errno
== EINVAL
|| retstrsize
> estep
208 ? retstrsize
- outlen
: estep
);
210 memcpy(out
, PyString_AS_STRING(retstr
), retstrsize
);
212 outlen
-= retstrsize
;
215 errorexit_cbpad
: Py_XDECREF(retobj
);
222 newpos
= PyInt_AS_LONG(PyTuple_GET_ITEM(retobj
, 1));
226 newpos
= inputlen
- newpos
;
227 if (newpos
< 0 || newpos
>= inputlen
)
229 inp
= inp_top
+ Py_UNICODE_SIZE
* newpos
;
230 inplen
= inplen_total
- Py_UNICODE_SIZE
* newpos
;
235 #undef RESIZE_OUTBUFFER
241 finalsize
= (int)(out
- out_top
);
243 if (finalsize
!= outlen_total
) {
244 if (_PyString_Resize(&outputobj
, finalsize
) == -1)
248 if (errorcb
> ERROR_MAX
) {
251 Py_XDECREF(exceptionobj
);
253 rettup
= PyTuple_New(2);
254 if (rettup
== NULL
) {
255 Py_DECREF(outputobj
);
258 PyTuple_SET_ITEM(rettup
, 0, outputobj
);
259 PyTuple_SET_ITEM(rettup
, 1, PyInt_FromLong(inputlen
));
264 Py_XDECREF(outputobj
);
265 if (errorcb
> ERROR_MAX
) {
268 Py_XDECREF(exceptionobj
);
273 PyDoc_STRVAR(iconvcodec_decode__doc__
,
274 "I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\
276 Decodes `string' using I, an iconvcodec instance. errors may be given\n\
277 to set a different error handling scheme. Default is 'strict' meaning\n\
278 that encoding errors raise a UnicodeDecodeError. Other possible values\n\
279 are 'ignore' and 'replace' as well as any other name registerd with\n\
280 codecs.register_error that is able to handle UnicodeDecodeErrors.");
283 iconvcodec_decode(iconvcodecObject
*self
, PyObject
*args
, PyObject
*kwargs
)
285 static char *kwlist
[] = { "input", "errors", NULL
};
286 char *errors
= NULL
/*strict*/, *out
, *out_top
;
287 const char *inp
, *inp_top
;
289 size_t inplen
, inplen_total
, outlen
, outlen_total
, estep
;
290 PyObject
*outputobj
= NULL
, *errorcb
= NULL
,
291 *exceptionobj
= NULL
;
293 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "s#|s:decode",
294 kwlist
, &inp
, &inplen_int
, &errors
))
295 return NULL
; /* TypeError */
297 errorcb
= get_errorcallback(errors
);
299 return NULL
; /* LookupError or something else from error handler */
302 inplen_total
= inplen
= (size_t)inplen_int
;
304 outputobj
= PyUnicode_FromUnicode(NULL
, inplen
);
305 if (outputobj
== NULL
)
307 outlen_total
= outlen
= PyUnicode_GET_DATA_SIZE(outputobj
);
308 out
= out_top
= (char *)PyUnicode_AS_UNICODE(outputobj
);
312 #define RESIZE_OUTBUFFER(size) { \
313 size_t toadd = (size); \
314 outlen_total += toadd; \
316 if (PyUnicode_Resize(&outputobj, outlen_total/Py_UNICODE_SIZE) == -1) \
318 out = (char *)PyUnicode_AS_UNICODE(outputobj) + (out - out_top); \
319 out_top = (char *)PyUnicode_AS_UNICODE(outputobj); \
322 if (iconv(self
->dechdl
, (char**)&inp
, &inplen
, &out
, &outlen
) == -1) {
323 char reason
[128], *reasonpos
= (char *)reason
;
326 if (errno
== E2BIG
) {
327 RESIZE_OUTBUFFER(estep
);
331 if (errorcb
== ERROR_IGNORE
|| errorcb
== ERROR_REPLACE
) {
333 if (errorcb
== ERROR_REPLACE
) {
336 if (outlen
< Py_UNICODE_SIZE
)
338 errno
== EINVAL
|| Py_UNICODE_SIZE
> estep
339 ? Py_UNICODE_SIZE
: estep
);
341 /* some compilers hate casted lvalue */
342 replp
= (Py_UNICODE
*)out
;
343 assert((long)replp
% Py_UNICODE_SIZE
== 0);/* aligned? */
344 *replp
= REPLACEMENT_CHAR_DECODE
;
346 out
+= Py_UNICODE_SIZE
;
347 outlen
-= Py_UNICODE_SIZE
;
349 if (errno
== EINVAL
) break;
353 errpos
= (int)(inp
- inp_top
);
354 reasonpos
+= sprintf(reason
, "Invalid multibyte sequence \\x%02x",
355 (unsigned char)*inp
);
357 reasonpos
+= sprintf(reasonpos
,
358 "\\x%02x", (unsigned char)*(inp
+1));
360 sprintf(reasonpos
, "\\x%02x", (unsigned char)*(inp
+2));
363 if (exceptionobj
== NULL
) {
364 exceptionobj
= PyUnicodeDecodeError_Create(
365 self
->encoding
, inp_top
, inplen_total
,
366 errpos
, errpos
+ 1, reason
);
367 if (exceptionobj
== NULL
)
370 if (PyUnicodeDecodeError_SetStart(exceptionobj
, errpos
) != 0)
372 if (PyUnicodeDecodeError_SetEnd(exceptionobj
, errpos
+ 1) != 0)
374 if (PyUnicodeDecodeError_SetReason(exceptionobj
, reason
) != 0)
378 if (errorcb
== ERROR_STRICT
) {
379 PyCodec_StrictErrors(exceptionobj
);
382 PyObject
*argsobj
, *retobj
, *retuni
;
385 argsobj
= PyTuple_New(1);
388 PyTuple_SET_ITEM(argsobj
, 0, exceptionobj
);
389 Py_INCREF(exceptionobj
);
390 retobj
= PyObject_CallObject(errorcb
, argsobj
);
395 if (!PyTuple_Check(retobj
) || PyTuple_GET_SIZE(retobj
) != 2 ||
396 !PyUnicode_Check((retuni
= PyTuple_GET_ITEM(retobj
, 0))) ||
397 !PyInt_Check(PyTuple_GET_ITEM(retobj
, 1))) {
399 PyErr_SetString(PyExc_ValueError
, "decoding error handler "
400 "must return (unicode, int) tuple");
403 if (PyUnicode_GET_SIZE(retuni
) > 0) {
404 #define errorexit errorexit_cbpad
407 retunisize
= PyUnicode_GET_DATA_SIZE(retuni
);
408 if (outlen
< retunisize
)
409 RESIZE_OUTBUFFER(errno
== EINVAL
|| retunisize
> estep
410 ? retunisize
- outlen
: estep
);
412 memcpy(out
, PyUnicode_AS_DATA(retuni
), retunisize
);
414 outlen
-= retunisize
;
417 errorexit_cbpad
: Py_DECREF(retobj
);
422 newpos
= PyInt_AS_LONG(PyTuple_GET_ITEM(retobj
, 1));
426 newpos
= inplen_total
- newpos
;
427 if (newpos
< 0 || newpos
>= inplen_total
)
429 inp
= inp_top
+ newpos
;
430 inplen
= inplen_total
- newpos
;
435 #undef RESIZE_OUTBUFFER
441 finalsize
= (int)(out
- out_top
);
442 if (finalsize
!= outlen_total
) {
443 if (PyUnicode_Resize(&outputobj
, finalsize
/ Py_UNICODE_SIZE
) == -1)
447 if (errorcb
> ERROR_MAX
) {
450 Py_XDECREF(exceptionobj
);
452 rettup
= PyTuple_New(2);
453 if (rettup
== NULL
) {
454 Py_DECREF(outputobj
);
457 PyTuple_SET_ITEM(rettup
, 0, outputobj
);
458 PyTuple_SET_ITEM(rettup
, 1, PyInt_FromLong(inplen_total
));
463 Py_XDECREF(outputobj
);
464 if (errorcb
> ERROR_MAX
) {
467 Py_XDECREF(exceptionobj
);
472 static struct PyMethodDef iconvcodec_methods
[] = {
473 {"encode", (PyCFunction
)iconvcodec_encode
,
474 METH_VARARGS
| METH_KEYWORDS
,
475 iconvcodec_encode__doc__
},
476 {"decode", (PyCFunction
)iconvcodec_decode
,
477 METH_VARARGS
| METH_KEYWORDS
,
478 iconvcodec_decode__doc__
},
483 iconvcodec_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwargs
)
485 PyObject
*encobj
= NULL
;
486 iconvcodecObject
*new = NULL
;
488 new = (iconvcodecObject
*)type
->tp_alloc(type
, 0);
492 new->encoding
= NULL
;
493 new->enchdl
= new->dechdl
= (iconv_t
)(-1);
495 encobj
= PyObject_GetAttrString((PyObject
*)new, "encoding");
496 if (encobj
== NULL
) {
498 new->encoding
= PyMem_Malloc(sizeof(DEFAULT_ENCODING
));
499 strcpy(new->encoding
, DEFAULT_ENCODING
);
500 } else if (!PyString_Check(encobj
)) {
502 PyErr_SetString(PyExc_TypeError
,
503 "`encoding' attribute must be a string.");
506 new->encoding
= PyMem_Malloc(PyString_GET_SIZE(encobj
) + 1);
507 strcpy(new->encoding
, PyString_AS_STRING(encobj
));
511 new->dechdl
= iconv_open(UNICODE_ENCODING
, new->encoding
);
512 if (new->dechdl
== (iconv_t
)(-1)) {
513 PyErr_SetString(PyExc_ValueError
, "unsupported decoding");
517 new->enchdl
= iconv_open(new->encoding
, UNICODE_ENCODING
);
518 if (new->enchdl
== (iconv_t
)(-1)) {
519 PyErr_SetString(PyExc_ValueError
, "unsupported encoding");
520 iconv_close(new->dechdl
);
521 new->dechdl
= (iconv_t
)(-1);
525 return (PyObject
*)new;
534 iconvcodec_dealloc(iconvcodecObject
*self
)
536 _PyObject_GC_UNTRACK(self
);
538 if (self
->enchdl
!= (iconv_t
)-1)
539 iconv_close(self
->enchdl
);
540 if (self
->dechdl
!= (iconv_t
)-1)
541 iconv_close(self
->dechdl
);
542 if (self
->encoding
!= NULL
)
543 PyMem_Free(self
->encoding
);
545 PyObject_GC_Del(self
);
549 iconvcodec_repr(PyObject
*self
)
551 return PyString_FromFormat("<iconvcodec encoding='%s'>",
552 ((iconvcodecObject
*)self
)->encoding
);
555 statichere PyTypeObject iconvcodec_Type
= {
556 PyObject_HEAD_INIT(&PyType_Type
)
557 0, /* Number of items for varobject */
558 "iconvcodec", /* Name of this type */
559 sizeof(iconvcodecObject
), /* Basic object size */
560 0, /* Item size for varobject */
561 (destructor
)iconvcodec_dealloc
, /* tp_dealloc */
566 iconvcodec_repr
, /* tp_repr */
567 0, /* tp_as_number */
568 0, /* tp_as_sequence */
569 0, /* tp_as_mapping */
573 PyObject_GenericGetAttr
, /* tp_getattro */
575 0, /* tp_as_buffer */
576 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_BASETYPE
|
577 Py_TPFLAGS_HAVE_GC
, /* tp_flags */
578 iconvcodec_doc
, /* tp_doc */
581 0, /* tp_richcompare */
582 0, /* tp_weaklistoffset */
585 iconvcodec_methods
, /* tp_methods */
590 0, /* tp_descr_get */
591 0, /* tp_descr_set */
592 0, /* tp_dictoffset */
594 PyType_GenericAlloc
, /* tp_alloc */
595 iconvcodec_new
, /* tp_new */
596 PyObject_GC_Del
, /* tp_free */
599 static struct PyMethodDef _iconv_codec_methods
[] = {
604 init_iconv_codec(void)
608 m
= Py_InitModule("_iconv_codec", _iconv_codec_methods
);
610 PyModule_AddStringConstant(m
, "__version__", (char*)__version__
);
611 PyModule_AddObject(m
, "iconvcodec", (PyObject
*)(&iconvcodec_Type
));
612 PyModule_AddStringConstant(m
, "internal_encoding", UNICODE_ENCODING
);
614 if (PyErr_Occurred())
615 Py_FatalError("can't initialize the _iconv_codec module");