Update for release.
[python/dscho.git] / Modules / _iconv_codec.c
blob8e84eeccb3c009b51a8494d17a62ff6d169cf34e
1 /*
2 * _iconv_codec.c
4 * libiconv adaptor for Python iconvcodec
6 * Author : Hye-Shik Chang <perky@FreeBSD.org>
7 * Created : 17 January 2003
8 */
10 #include "Python.h"
11 #include <string.h>
12 #include <iconv.h>
14 static const char *__version__ = "$Revision$";
16 #if Py_USING_UNICODE
17 # if Py_UNICODE_SIZE == 2
18 # ifdef __GNU_LIBRARY__
19 # define UNICODE_ENCODING "ucs-2"
20 # else
21 # define UNICODE_ENCODING "ucs-2-internal"
22 # endif
23 # define MBENCODED_LENGTH_MAX 4
24 # elif Py_UNICODE_SIZE == 4
25 # ifdef __GNU_LIBRARY__
26 # define UNICODE_ENCODING "ucs-4"
27 # else
28 # define UNICODE_ENCODING "ucs-4-internal"
29 # endif
30 # define MBENCODED_LENGTH_MAX 6
31 # endif
32 #else
33 # error "Unicode is not available"
34 #endif
36 typedef struct {
37 PyObject_HEAD
38 iconv_t enchdl, dechdl;
39 char *encoding;
40 } iconvcodecObject;
41 PyDoc_STRVAR(iconvcodec_doc, "iconvcodec object");
43 staticforward PyTypeObject iconvcodec_Type;
/*
 * Sentinel "callbacks" for the three built-in error policies.  They are
 * small integers disguised as object pointers, never dereferenced; the codec
 * functions compare against them and treat anything greater than ERROR_MAX
 * as a real callable whose reference must be released.
 *
 * Each expansion is fully parenthesised so the cast cannot combine with
 * neighbouring operators at the point of use.
 */
#define ERROR_STRICT    ((PyObject *)(1))
#define ERROR_IGNORE    ((PyObject *)(2))
#define ERROR_REPLACE   ((PyObject *)(3))
#define ERROR_MAX       ERROR_REPLACE

/* Substitutes written by the built-in "replace" policy. */
#define REPLACEMENT_CHAR_DECODE 0xFFFD
#define REPLACEMENT_CHAR_ENCODE '?'

/* Used by iconvcodec_new() when the object has no `encoding' attribute.
 * Must stay a string literal: its sizeof is used as an allocation size. */
#define DEFAULT_ENCODING        "utf-8"
57 static PyObject *
58 get_errorcallback(const char *errors)
60 if (errors == NULL || strcmp(errors, "strict") == 0)
61 return ERROR_STRICT;
62 else if (strcmp(errors, "ignore") == 0)
63 return ERROR_IGNORE;
64 else if (strcmp(errors, "replace") == 0)
65 return ERROR_REPLACE;
66 else
67 return PyCodec_LookupError(errors);
71 PyDoc_STRVAR(iconvcodec_encode__doc__,
72 "I.encode(unicode, [,errors]) -> (string, length consumed)\n\
73 \n\
74 Return an encoded string version of `unicode'. errors may be given to\n\
75 set a different error handling scheme. Default is 'strict' meaning that\n\
76 encoding errors raise a UnicodeEncodeError. Other possible values are\n\
77 'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n\
78 registered with codecs.register_error that can handle UnicodeEncodeErrors.");
80 static PyObject *
81 iconvcodec_encode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
83 static char *kwlist[] = { "input", "errors", NULL };
84 Py_UNICODE *input;
85 int inputlen;
86 char *errors = NULL/*strict*/, *out, *out_top;
87 const char *inp, *inp_top;
88 size_t inplen, inplen_total, outlen, outlen_total, estep;
89 PyObject *outputobj = NULL, *errorcb = NULL,
90 *exceptionobj = NULL;
92 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|s:encode",
93 kwlist, &input, &inputlen, &errors))
94 return NULL; /* TypeError */
96 errorcb = get_errorcallback(errors);
97 if (errorcb == NULL)
98 return NULL; /* LookupError or something else from error handler */
100 inp = inp_top = (char *)input;
101 inplen = inplen_total = (size_t)(inputlen * Py_UNICODE_SIZE);
103 outlen = inputlen * MBENCODED_LENGTH_MAX;
104 if (outlen < 16)
105 outlen = 16; /* for iso-2022 codecs */
107 outputobj = PyString_FromStringAndSize(NULL, outlen);
108 if (outputobj == NULL)
109 return NULL;
110 out = out_top = PyString_AS_STRING(outputobj);
111 outlen_total = outlen;
113 estep = inputlen * Py_UNICODE_SIZE / 2;
115 #define RESIZE_OUTBUFFER(size) { \
116 size_t toadd = (size); \
117 outlen_total += toadd; \
118 outlen += toadd; \
119 if (_PyString_Resize(&outputobj, outlen_total) == -1) \
120 goto errorexit; \
121 out = PyString_AS_STRING(outputobj) + (out - out_top); \
122 out_top = PyString_AS_STRING(outputobj); \
124 while (inplen > 0) {
125 if (iconv(self->enchdl, (char**)&inp, &inplen, &out, &outlen) == -1) {
126 char reason[128];
127 int errpos;
129 if (errno == E2BIG) {
130 RESIZE_OUTBUFFER(estep);
131 continue;
134 if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
135 inplen -= Py_UNICODE_SIZE;
136 inp += Py_UNICODE_SIZE;
137 if (errorcb == ERROR_REPLACE) {
138 if (outlen < 1)
139 RESIZE_OUTBUFFER(errno == EINVAL ? 1 : estep);
140 outlen--;
141 *out++ = REPLACEMENT_CHAR_ENCODE;
143 if (errno == EINVAL) break;
144 else continue;
147 errpos = (int)(inp - inp_top) / Py_UNICODE_SIZE;
148 sprintf(reason, "Undefined character map from "
149 #if Py_UNICODE_SIZE == 2
150 "\\u%04x"
151 #elif Py_UNICODE_SIZE == 4
152 "\\u%08x"
153 #endif
154 , *(Py_UNICODE *)inp);
156 if (exceptionobj == NULL) {
157 if ((exceptionobj = PyUnicodeEncodeError_Create(
158 self->encoding, input, inputlen,
159 errpos, errpos + 1, reason)) == NULL)
160 goto errorexit;
161 } else {
162 if (PyUnicodeEncodeError_SetStart(exceptionobj, errpos) != 0)
163 goto errorexit;
164 if (PyUnicodeEncodeError_SetEnd(exceptionobj, errpos + 1) != 0)
165 goto errorexit;
166 if (PyUnicodeEncodeError_SetReason(exceptionobj, reason) != 0)
167 goto errorexit;
170 if (errorcb == ERROR_STRICT) {
171 PyCodec_StrictErrors(exceptionobj);
172 goto errorexit;
173 } else {
174 PyObject *argsobj, *retobj, *retuni;
175 long newpos;
177 argsobj = PyTuple_New(1);
178 if (argsobj == NULL)
179 goto errorexit;
180 PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
181 Py_INCREF(exceptionobj);
182 retobj = PyObject_CallObject(errorcb, argsobj);
183 Py_DECREF(argsobj);
184 if (retobj == NULL)
185 goto errorexit;
187 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
188 !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
189 !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
190 Py_DECREF(retobj);
191 PyErr_SetString(PyExc_ValueError, "encoding error handler "
192 "must return (unicode, int) tuple");
193 goto errorexit;
195 if (PyUnicode_GET_SIZE(retuni) > 0) {
196 #define errorexit errorexit_cbpad
197 PyObject *retstr = NULL;
198 int retstrsize;
200 retstr = PyUnicode_AsEncodedString(
201 retuni, self->encoding, NULL);
202 if (retstr == NULL || !PyString_Check(retstr))
203 goto errorexit;
205 retstrsize = PyString_GET_SIZE(retstr);
206 if (outlen < retstrsize)
207 RESIZE_OUTBUFFER(errno == EINVAL || retstrsize > estep
208 ? retstrsize - outlen : estep);
210 memcpy(out, PyString_AS_STRING(retstr), retstrsize);
211 out += retstrsize;
212 outlen -= retstrsize;
213 #undef errorexit
214 if (0) {
215 errorexit_cbpad: Py_XDECREF(retobj);
216 Py_XDECREF(retstr);
217 goto errorexit;
219 Py_DECREF(retstr);
222 newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
223 Py_DECREF(retobj);
225 if (newpos < 0)
226 newpos = inputlen - newpos;
227 if (newpos < 0 || newpos >= inputlen)
228 break;
229 inp = inp_top + Py_UNICODE_SIZE * newpos;
230 inplen = inplen_total - Py_UNICODE_SIZE * newpos;
232 } else
233 break;
235 #undef RESIZE_OUTBUFFER
238 PyObject *rettup;
239 int finalsize;
241 finalsize = (int)(out - out_top);
243 if (finalsize != outlen_total) {
244 if (_PyString_Resize(&outputobj, finalsize) == -1)
245 goto errorexit;
248 if (errorcb > ERROR_MAX) {
249 Py_DECREF(errorcb);
251 Py_XDECREF(exceptionobj);
253 rettup = PyTuple_New(2);
254 if (rettup == NULL) {
255 Py_DECREF(outputobj);
256 return NULL;
258 PyTuple_SET_ITEM(rettup, 0, outputobj);
259 PyTuple_SET_ITEM(rettup, 1, PyInt_FromLong(inputlen));
260 return rettup;
263 errorexit:
264 Py_XDECREF(outputobj);
265 if (errorcb > ERROR_MAX) {
266 Py_DECREF(errorcb);
268 Py_XDECREF(exceptionobj);
270 return NULL;
273 PyDoc_STRVAR(iconvcodec_decode__doc__,
274 "I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n\
276 Decodes `string' using I, an iconvcodec instance. errors may be given\n\
277 to set a different error handling scheme. Default is 'strict' meaning\n\
278 that encoding errors raise a UnicodeDecodeError. Other possible values\n\
279 are 'ignore' and 'replace' as well as any other name registerd with\n\
280 codecs.register_error that is able to handle UnicodeDecodeErrors.");
282 static PyObject *
283 iconvcodec_decode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
285 static char *kwlist[] = { "input", "errors", NULL };
286 char *errors = NULL/*strict*/, *out, *out_top;
287 const char *inp, *inp_top;
288 int inplen_int;
289 size_t inplen, inplen_total, outlen, outlen_total, estep;
290 PyObject *outputobj = NULL, *errorcb = NULL,
291 *exceptionobj = NULL;
293 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|s:decode",
294 kwlist, &inp, &inplen_int, &errors))
295 return NULL; /* TypeError */
297 errorcb = get_errorcallback(errors);
298 if (errorcb == NULL)
299 return NULL; /* LookupError or something else from error handler */
301 inp_top = inp;
302 inplen_total = inplen = (size_t)inplen_int;
304 outputobj = PyUnicode_FromUnicode(NULL, inplen);
305 if (outputobj == NULL)
306 return NULL;
307 outlen_total = outlen = PyUnicode_GET_DATA_SIZE(outputobj);
308 out = out_top = (char *)PyUnicode_AS_UNICODE(outputobj);
310 estep = outlen / 2;
312 #define RESIZE_OUTBUFFER(size) { \
313 size_t toadd = (size); \
314 outlen_total += toadd; \
315 outlen += toadd; \
316 if (PyUnicode_Resize(&outputobj, outlen_total/Py_UNICODE_SIZE) == -1) \
317 goto errorexit; \
318 out = (char *)PyUnicode_AS_UNICODE(outputobj) + (out - out_top); \
319 out_top = (char *)PyUnicode_AS_UNICODE(outputobj); \
321 while (inplen > 0) {
322 if (iconv(self->dechdl, (char**)&inp, &inplen, &out, &outlen) == -1) {
323 char reason[128], *reasonpos = (char *)reason;
324 int errpos;
326 if (errno == E2BIG) {
327 RESIZE_OUTBUFFER(estep);
328 continue;
331 if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
332 inplen--; inp++;
333 if (errorcb == ERROR_REPLACE) {
334 Py_UNICODE *replp;
336 if (outlen < Py_UNICODE_SIZE)
337 RESIZE_OUTBUFFER(
338 errno == EINVAL || Py_UNICODE_SIZE > estep
339 ? Py_UNICODE_SIZE : estep);
341 /* some compilers hate casted lvalue */
342 replp = (Py_UNICODE *)out;
343 assert((long)replp % Py_UNICODE_SIZE == 0);/* aligned? */
344 *replp = REPLACEMENT_CHAR_DECODE;
346 out += Py_UNICODE_SIZE;
347 outlen -= Py_UNICODE_SIZE;
349 if (errno == EINVAL) break;
350 else continue;
353 errpos = (int)(inp - inp_top);
354 reasonpos += sprintf(reason, "Invalid multibyte sequence \\x%02x",
355 (unsigned char)*inp);
356 if (inplen > 1) {
357 reasonpos += sprintf(reasonpos,
358 "\\x%02x", (unsigned char)*(inp+1));
359 if (inplen > 2)
360 sprintf(reasonpos, "\\x%02x", (unsigned char)*(inp+2));
363 if (exceptionobj == NULL) {
364 exceptionobj = PyUnicodeDecodeError_Create(
365 self->encoding, inp_top, inplen_total,
366 errpos, errpos + 1, reason);
367 if (exceptionobj == NULL)
368 goto errorexit;
369 } else {
370 if (PyUnicodeDecodeError_SetStart(exceptionobj, errpos) != 0)
371 goto errorexit;
372 if (PyUnicodeDecodeError_SetEnd(exceptionobj, errpos + 1) != 0)
373 goto errorexit;
374 if (PyUnicodeDecodeError_SetReason(exceptionobj, reason) != 0)
375 goto errorexit;
378 if (errorcb == ERROR_STRICT) {
379 PyCodec_StrictErrors(exceptionobj);
380 goto errorexit;
381 } else {
382 PyObject *argsobj, *retobj, *retuni;
383 long newpos;
385 argsobj = PyTuple_New(1);
386 if (argsobj == NULL)
387 goto errorexit;
388 PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
389 Py_INCREF(exceptionobj);
390 retobj = PyObject_CallObject(errorcb, argsobj);
391 Py_DECREF(argsobj);
392 if (retobj == NULL)
393 goto errorexit;
395 if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
396 !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
397 !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
398 Py_DECREF(retobj);
399 PyErr_SetString(PyExc_ValueError, "decoding error handler "
400 "must return (unicode, int) tuple");
401 goto errorexit;
403 if (PyUnicode_GET_SIZE(retuni) > 0) {
404 #define errorexit errorexit_cbpad
405 size_t retunisize;
407 retunisize = PyUnicode_GET_DATA_SIZE(retuni);
408 if (outlen < retunisize)
409 RESIZE_OUTBUFFER(errno == EINVAL || retunisize > estep
410 ? retunisize - outlen : estep);
412 memcpy(out, PyUnicode_AS_DATA(retuni), retunisize);
413 out += retunisize;
414 outlen -= retunisize;
415 #undef errorexit
416 if (0) {
417 errorexit_cbpad: Py_DECREF(retobj);
418 goto errorexit;
422 newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
423 Py_DECREF(retobj);
425 if (newpos < 0)
426 newpos = inplen_total - newpos;
427 if (newpos < 0 || newpos >= inplen_total)
428 break;
429 inp = inp_top + newpos;
430 inplen = inplen_total - newpos;
432 } else
433 break;
435 #undef RESIZE_OUTBUFFER
438 PyObject *rettup;
439 int finalsize;
441 finalsize = (int)(out - out_top);
442 if (finalsize != outlen_total) {
443 if (PyUnicode_Resize(&outputobj, finalsize / Py_UNICODE_SIZE) == -1)
444 goto errorexit;
447 if (errorcb > ERROR_MAX) {
448 Py_DECREF(errorcb);
450 Py_XDECREF(exceptionobj);
452 rettup = PyTuple_New(2);
453 if (rettup == NULL) {
454 Py_DECREF(outputobj);
455 return NULL;
457 PyTuple_SET_ITEM(rettup, 0, outputobj);
458 PyTuple_SET_ITEM(rettup, 1, PyInt_FromLong(inplen_total));
459 return rettup;
462 errorexit:
463 Py_XDECREF(outputobj);
464 if (errorcb > ERROR_MAX) {
465 Py_DECREF(errorcb);
467 Py_XDECREF(exceptionobj);
469 return NULL;
472 static struct PyMethodDef iconvcodec_methods[] = {
473 {"encode", (PyCFunction)iconvcodec_encode,
474 METH_VARARGS | METH_KEYWORDS,
475 iconvcodec_encode__doc__},
476 {"decode", (PyCFunction)iconvcodec_decode,
477 METH_VARARGS | METH_KEYWORDS,
478 iconvcodec_decode__doc__},
479 {NULL, NULL},
482 static PyObject *
483 iconvcodec_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
485 PyObject *encobj = NULL;
486 iconvcodecObject *new = NULL;
488 new = (iconvcodecObject *)type->tp_alloc(type, 0);
489 if (new == NULL)
490 return NULL;
492 new->encoding = NULL;
493 new->enchdl = new->dechdl = (iconv_t)(-1);
495 encobj = PyObject_GetAttrString((PyObject *)new, "encoding");
496 if (encobj == NULL) {
497 PyErr_Clear();
498 new->encoding = PyMem_Malloc(sizeof(DEFAULT_ENCODING));
499 strcpy(new->encoding, DEFAULT_ENCODING);
500 } else if (!PyString_Check(encobj)) {
501 Py_DECREF(encobj);
502 PyErr_SetString(PyExc_TypeError,
503 "`encoding' attribute must be a string.");
504 goto errorexit;
505 } else {
506 new->encoding = PyMem_Malloc(PyString_GET_SIZE(encobj) + 1);
507 strcpy(new->encoding, PyString_AS_STRING(encobj));
508 Py_DECREF(encobj);
511 new->dechdl = iconv_open(UNICODE_ENCODING, new->encoding);
512 if (new->dechdl == (iconv_t)(-1)) {
513 PyErr_SetString(PyExc_ValueError, "unsupported decoding");
514 goto errorexit;
517 new->enchdl = iconv_open(new->encoding, UNICODE_ENCODING);
518 if (new->enchdl == (iconv_t)(-1)) {
519 PyErr_SetString(PyExc_ValueError, "unsupported encoding");
520 iconv_close(new->dechdl);
521 new->dechdl = (iconv_t)(-1);
522 goto errorexit;
525 return (PyObject *)new;
527 errorexit:
528 Py_XDECREF(new);
530 return NULL;
533 static void
534 iconvcodec_dealloc(iconvcodecObject *self)
536 _PyObject_GC_UNTRACK(self);
538 if (self->enchdl != (iconv_t)-1)
539 iconv_close(self->enchdl);
540 if (self->dechdl != (iconv_t)-1)
541 iconv_close(self->dechdl);
542 if (self->encoding != NULL)
543 PyMem_Free(self->encoding);
545 PyObject_GC_Del(self);
548 static PyObject *
549 iconvcodec_repr(PyObject *self)
551 return PyString_FromFormat("<iconvcodec encoding='%s'>",
552 ((iconvcodecObject *)self)->encoding);
555 statichere PyTypeObject iconvcodec_Type = {
556 PyObject_HEAD_INIT(&PyType_Type)
557 0, /* Number of items for varobject */
558 "iconvcodec", /* Name of this type */
559 sizeof(iconvcodecObject), /* Basic object size */
560 0, /* Item size for varobject */
561 (destructor)iconvcodec_dealloc, /* tp_dealloc */
562 0, /* tp_print */
563 0, /* tp_getattr */
564 0, /* tp_setattr */
565 0, /* tp_compare */
566 iconvcodec_repr, /* tp_repr */
567 0, /* tp_as_number */
568 0, /* tp_as_sequence */
569 0, /* tp_as_mapping */
570 0, /* tp_hash */
571 0, /* tp_call */
572 0, /* tp_str */
573 PyObject_GenericGetAttr, /* tp_getattro */
574 0, /* tp_setattro */
575 0, /* tp_as_buffer */
576 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
577 Py_TPFLAGS_HAVE_GC, /* tp_flags */
578 iconvcodec_doc, /* tp_doc */
579 0, /* tp_traverse */
580 0, /* tp_clear */
581 0, /* tp_richcompare */
582 0, /* tp_weaklistoffset */
583 0, /* tp_iter */
584 0, /* tp_iterext */
585 iconvcodec_methods, /* tp_methods */
586 0, /* tp_members */
587 0, /* tp_getset */
588 0, /* tp_base */
589 0, /* tp_dict */
590 0, /* tp_descr_get */
591 0, /* tp_descr_set */
592 0, /* tp_dictoffset */
593 0, /* tp_init */
594 PyType_GenericAlloc, /* tp_alloc */
595 iconvcodec_new, /* tp_new */
596 PyObject_GC_Del, /* tp_free */
599 static struct PyMethodDef _iconv_codec_methods[] = {
600 {NULL, NULL},
603 void
604 init_iconv_codec(void)
606 PyObject *m;
608 m = Py_InitModule("_iconv_codec", _iconv_codec_methods);
610 PyModule_AddStringConstant(m, "__version__", (char*)__version__);
611 PyModule_AddObject(m, "iconvcodec", (PyObject *)(&iconvcodec_Type));
612 PyModule_AddStringConstant(m, "internal_encoding", UNICODE_ENCODING);
614 if (PyErr_Occurred())
615 Py_FatalError("can't initialize the _iconv_codec module");
619 * ex: ts=8 sts=4 et
620 * $Id$