_make_boundary(): Fix for SF bug #745478, broken boundary calculation
[python/dscho.git] / Python / codecs.c
blob3675f3cc8080d4dcc65067be53441811a68ef0d8
1 /* ------------------------------------------------------------------------
3 Python Codec Registry and support functions
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
7 Copyright (c) Corporation for National Research Initiatives.
9 ------------------------------------------------------------------------ */
11 #include "Python.h"
12 #include <ctype.h>
14 /* --- Codec Registry ----------------------------------------------------- */
/* Import the standard encodings package, which will register the first
   codec search function.

   This is done lazily so that the Unicode implementation does not slow
   down the startup of scripts that do not need it.

   ImportErrors are silently ignored by this function. Only one attempt
   is made. */
27 static int _PyCodecRegistry_Init(void); /* Forward */
29 int PyCodec_Register(PyObject *search_function)
31 PyInterpreterState *interp = PyThreadState_Get()->interp;
32 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
33 goto onError;
34 if (search_function == NULL) {
35 PyErr_BadArgument();
36 goto onError;
38 if (!PyCallable_Check(search_function)) {
39 PyErr_SetString(PyExc_TypeError,
40 "argument must be callable");
41 goto onError;
43 return PyList_Append(interp->codec_search_path, search_function);
45 onError:
46 return -1;
49 /* Convert a string to a normalized Python string: all characters are
50 converted to lower case, spaces are replaced with underscores. */
52 static
53 PyObject *normalizestring(const char *string)
55 register size_t i;
56 size_t len = strlen(string);
57 char *p;
58 PyObject *v;
60 if (len > INT_MAX) {
61 PyErr_SetString(PyExc_OverflowError, "string is too large");
62 return NULL;
65 v = PyString_FromStringAndSize(NULL, (int)len);
66 if (v == NULL)
67 return NULL;
68 p = PyString_AS_STRING(v);
69 for (i = 0; i < len; i++) {
70 register char ch = string[i];
71 if (ch == ' ')
72 ch = '-';
73 else
74 ch = tolower(ch);
75 p[i] = ch;
77 return v;
80 /* Lookup the given encoding and return a tuple providing the codec
81 facilities.
83 The encoding string is looked up converted to all lower-case
84 characters. This makes encodings looked up through this mechanism
85 effectively case-insensitive.
87 If no codec is found, a LookupError is set and NULL returned.
89 As side effect, this tries to load the encodings package, if not
90 yet done. This is part of the lazy load strategy for the encodings
91 package.
95 PyObject *_PyCodec_Lookup(const char *encoding)
97 PyInterpreterState *interp;
98 PyObject *result, *args = NULL, *v;
99 int i, len;
101 if (encoding == NULL) {
102 PyErr_BadArgument();
103 goto onError;
106 interp = PyThreadState_Get()->interp;
107 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
108 goto onError;
110 /* Convert the encoding to a normalized Python string: all
111 characters are converted to lower case, spaces and hyphens are
112 replaced with underscores. */
113 v = normalizestring(encoding);
114 if (v == NULL)
115 goto onError;
116 PyString_InternInPlace(&v);
118 /* First, try to lookup the name in the registry dictionary */
119 result = PyDict_GetItem(interp->codec_search_cache, v);
120 if (result != NULL) {
121 Py_INCREF(result);
122 Py_DECREF(v);
123 return result;
126 /* Next, scan the search functions in order of registration */
127 args = PyTuple_New(1);
128 if (args == NULL)
129 goto onError;
130 PyTuple_SET_ITEM(args,0,v);
132 len = PyList_Size(interp->codec_search_path);
133 if (len < 0)
134 goto onError;
135 if (len == 0) {
136 PyErr_SetString(PyExc_LookupError,
137 "no codec search functions registered: "
138 "can't find encoding");
139 goto onError;
142 for (i = 0; i < len; i++) {
143 PyObject *func;
145 func = PyList_GetItem(interp->codec_search_path, i);
146 if (func == NULL)
147 goto onError;
148 result = PyEval_CallObject(func, args);
149 if (result == NULL)
150 goto onError;
151 if (result == Py_None) {
152 Py_DECREF(result);
153 continue;
155 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
156 PyErr_SetString(PyExc_TypeError,
157 "codec search functions must return 4-tuples");
158 Py_DECREF(result);
159 goto onError;
161 break;
163 if (i == len) {
164 /* XXX Perhaps we should cache misses too ? */
165 PyErr_Format(PyExc_LookupError,
166 "unknown encoding: %s", encoding);
167 goto onError;
170 /* Cache and return the result */
171 PyDict_SetItem(interp->codec_search_cache, v, result);
172 Py_DECREF(args);
173 return result;
175 onError:
176 Py_XDECREF(args);
177 return NULL;
180 static
181 PyObject *args_tuple(PyObject *object,
182 const char *errors)
184 PyObject *args;
186 args = PyTuple_New(1 + (errors != NULL));
187 if (args == NULL)
188 return NULL;
189 Py_INCREF(object);
190 PyTuple_SET_ITEM(args,0,object);
191 if (errors) {
192 PyObject *v;
194 v = PyString_FromString(errors);
195 if (v == NULL) {
196 Py_DECREF(args);
197 return NULL;
199 PyTuple_SET_ITEM(args, 1, v);
201 return args;
204 /* Build a codec by calling factory(stream[,errors]) or just
205 factory(errors) depending on whether the given parameters are
206 non-NULL. */
208 static
209 PyObject *build_stream_codec(PyObject *factory,
210 PyObject *stream,
211 const char *errors)
213 PyObject *args, *codec;
215 args = args_tuple(stream, errors);
216 if (args == NULL)
217 return NULL;
219 codec = PyEval_CallObject(factory, args);
220 Py_DECREF(args);
221 return codec;
/* Convenience APIs to query the Codec registry.

   All APIs return a codec object with incremented refcount. */
230 PyObject *PyCodec_Encoder(const char *encoding)
232 PyObject *codecs;
233 PyObject *v;
235 codecs = _PyCodec_Lookup(encoding);
236 if (codecs == NULL)
237 goto onError;
238 v = PyTuple_GET_ITEM(codecs,0);
239 Py_DECREF(codecs);
240 Py_INCREF(v);
241 return v;
243 onError:
244 return NULL;
247 PyObject *PyCodec_Decoder(const char *encoding)
249 PyObject *codecs;
250 PyObject *v;
252 codecs = _PyCodec_Lookup(encoding);
253 if (codecs == NULL)
254 goto onError;
255 v = PyTuple_GET_ITEM(codecs,1);
256 Py_DECREF(codecs);
257 Py_INCREF(v);
258 return v;
260 onError:
261 return NULL;
264 PyObject *PyCodec_StreamReader(const char *encoding,
265 PyObject *stream,
266 const char *errors)
268 PyObject *codecs, *ret;
270 codecs = _PyCodec_Lookup(encoding);
271 if (codecs == NULL)
272 goto onError;
273 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,2),stream,errors);
274 Py_DECREF(codecs);
275 return ret;
277 onError:
278 return NULL;
281 PyObject *PyCodec_StreamWriter(const char *encoding,
282 PyObject *stream,
283 const char *errors)
285 PyObject *codecs, *ret;
287 codecs = _PyCodec_Lookup(encoding);
288 if (codecs == NULL)
289 goto onError;
290 ret = build_stream_codec(PyTuple_GET_ITEM(codecs,3),stream,errors);
291 Py_DECREF(codecs);
292 return ret;
294 onError:
295 return NULL;
298 /* Encode an object (e.g. an Unicode object) using the given encoding
299 and return the resulting encoded object (usually a Python string).
301 errors is passed to the encoder factory as argument if non-NULL. */
303 PyObject *PyCodec_Encode(PyObject *object,
304 const char *encoding,
305 const char *errors)
307 PyObject *encoder = NULL;
308 PyObject *args = NULL, *result;
309 PyObject *v;
311 encoder = PyCodec_Encoder(encoding);
312 if (encoder == NULL)
313 goto onError;
315 args = args_tuple(object, errors);
316 if (args == NULL)
317 goto onError;
319 result = PyEval_CallObject(encoder,args);
320 if (result == NULL)
321 goto onError;
323 if (!PyTuple_Check(result) ||
324 PyTuple_GET_SIZE(result) != 2) {
325 PyErr_SetString(PyExc_TypeError,
326 "encoder must return a tuple (object,integer)");
327 goto onError;
329 v = PyTuple_GET_ITEM(result,0);
330 Py_INCREF(v);
331 /* We don't check or use the second (integer) entry. */
333 Py_DECREF(args);
334 Py_DECREF(encoder);
335 Py_DECREF(result);
336 return v;
338 onError:
339 Py_XDECREF(args);
340 Py_XDECREF(encoder);
341 return NULL;
344 /* Decode an object (usually a Python string) using the given encoding
345 and return an equivalent object (e.g. an Unicode object).
347 errors is passed to the decoder factory as argument if non-NULL. */
349 PyObject *PyCodec_Decode(PyObject *object,
350 const char *encoding,
351 const char *errors)
353 PyObject *decoder = NULL;
354 PyObject *args = NULL, *result = NULL;
355 PyObject *v;
357 decoder = PyCodec_Decoder(encoding);
358 if (decoder == NULL)
359 goto onError;
361 args = args_tuple(object, errors);
362 if (args == NULL)
363 goto onError;
365 result = PyEval_CallObject(decoder,args);
366 if (result == NULL)
367 goto onError;
368 if (!PyTuple_Check(result) ||
369 PyTuple_GET_SIZE(result) != 2) {
370 PyErr_SetString(PyExc_TypeError,
371 "decoder must return a tuple (object,integer)");
372 goto onError;
374 v = PyTuple_GET_ITEM(result,0);
375 Py_INCREF(v);
376 /* We don't check or use the second (integer) entry. */
378 Py_DECREF(args);
379 Py_DECREF(decoder);
380 Py_DECREF(result);
381 return v;
383 onError:
384 Py_XDECREF(args);
385 Py_XDECREF(decoder);
386 Py_XDECREF(result);
387 return NULL;
390 /* Register the error handling callback function error under the name
391 name. This function will be called by the codec when it encounters
392 an unencodable characters/undecodable bytes and doesn't know the
393 callback name, when name is specified as the error parameter
394 in the call to the encode/decode function.
395 Return 0 on success, -1 on error */
396 int PyCodec_RegisterError(const char *name, PyObject *error)
398 PyInterpreterState *interp = PyThreadState_Get()->interp;
399 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
400 return -1;
401 if (!PyCallable_Check(error)) {
402 PyErr_SetString(PyExc_TypeError, "handler must be callable");
403 return -1;
405 return PyDict_SetItemString(interp->codec_error_registry,
406 (char *)name, error);
409 /* Lookup the error handling callback function registered under the
410 name error. As a special case NULL can be passed, in which case
411 the error handling callback for strict encoding will be returned. */
412 PyObject *PyCodec_LookupError(const char *name)
414 PyObject *handler = NULL;
416 PyInterpreterState *interp = PyThreadState_Get()->interp;
417 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
418 return NULL;
420 if (name==NULL)
421 name = "strict";
422 handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
423 if (!handler)
424 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
425 else
426 Py_INCREF(handler);
427 return handler;
430 static void wrong_exception_type(PyObject *exc)
432 PyObject *type = PyObject_GetAttrString(exc, "__class__");
433 if (type != NULL) {
434 PyObject *name = PyObject_GetAttrString(type, "__name__");
435 Py_DECREF(type);
436 if (name != NULL) {
437 PyObject *string = PyObject_Str(name);
438 Py_DECREF(name);
439 if (string != NULL) {
440 PyErr_Format(PyExc_TypeError,
441 "don't know how to handle %.400s in error callback",
442 PyString_AS_STRING(string));
443 Py_DECREF(string);
449 PyObject *PyCodec_StrictErrors(PyObject *exc)
451 if (PyInstance_Check(exc))
452 PyErr_SetObject((PyObject*)((PyInstanceObject*)exc)->in_class,
453 exc);
454 else
455 PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
456 return NULL;
460 #ifdef Py_USING_UNICODE
461 PyObject *PyCodec_IgnoreErrors(PyObject *exc)
463 int end;
464 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
465 if (PyUnicodeEncodeError_GetEnd(exc, &end))
466 return NULL;
468 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
469 if (PyUnicodeDecodeError_GetEnd(exc, &end))
470 return NULL;
472 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
473 if (PyUnicodeTranslateError_GetEnd(exc, &end))
474 return NULL;
476 else {
477 wrong_exception_type(exc);
478 return NULL;
480 /* ouch: passing NULL, 0, pos gives None instead of u'' */
481 return Py_BuildValue("(u#i)", &end, 0, end);
485 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
487 PyObject *restuple;
488 int start;
489 int end;
490 int i;
492 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
493 PyObject *res;
494 Py_UNICODE *p;
495 if (PyUnicodeEncodeError_GetStart(exc, &start))
496 return NULL;
497 if (PyUnicodeEncodeError_GetEnd(exc, &end))
498 return NULL;
499 res = PyUnicode_FromUnicode(NULL, end-start);
500 if (res == NULL)
501 return NULL;
502 for (p = PyUnicode_AS_UNICODE(res), i = start;
503 i<end; ++p, ++i)
504 *p = '?';
505 restuple = Py_BuildValue("(Oi)", res, end);
506 Py_DECREF(res);
507 return restuple;
509 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
510 Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
511 if (PyUnicodeDecodeError_GetEnd(exc, &end))
512 return NULL;
513 return Py_BuildValue("(u#i)", &res, 1, end);
515 else if (PyObject_IsInstance(exc, PyExc_UnicodeTranslateError)) {
516 PyObject *res;
517 Py_UNICODE *p;
518 if (PyUnicodeTranslateError_GetStart(exc, &start))
519 return NULL;
520 if (PyUnicodeTranslateError_GetEnd(exc, &end))
521 return NULL;
522 res = PyUnicode_FromUnicode(NULL, end-start);
523 if (res == NULL)
524 return NULL;
525 for (p = PyUnicode_AS_UNICODE(res), i = start;
526 i<end; ++p, ++i)
527 *p = Py_UNICODE_REPLACEMENT_CHARACTER;
528 restuple = Py_BuildValue("(Oi)", res, end);
529 Py_DECREF(res);
530 return restuple;
532 else {
533 wrong_exception_type(exc);
534 return NULL;
538 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
540 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
541 PyObject *restuple;
542 PyObject *object;
543 int start;
544 int end;
545 PyObject *res;
546 Py_UNICODE *p;
547 Py_UNICODE *startp;
548 Py_UNICODE *outp;
549 int ressize;
550 if (PyUnicodeEncodeError_GetStart(exc, &start))
551 return NULL;
552 if (PyUnicodeEncodeError_GetEnd(exc, &end))
553 return NULL;
554 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
555 return NULL;
556 startp = PyUnicode_AS_UNICODE(object);
557 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
558 if (*p<10)
559 ressize += 2+1+1;
560 else if (*p<100)
561 ressize += 2+2+1;
562 else if (*p<1000)
563 ressize += 2+3+1;
564 else if (*p<10000)
565 ressize += 2+4+1;
566 else if (*p<100000)
567 ressize += 2+5+1;
568 else if (*p<1000000)
569 ressize += 2+6+1;
570 else
571 ressize += 2+7+1;
573 /* allocate replacement */
574 res = PyUnicode_FromUnicode(NULL, ressize);
575 if (res == NULL) {
576 Py_DECREF(object);
577 return NULL;
579 /* generate replacement */
580 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
581 p < startp+end; ++p) {
582 Py_UNICODE c = *p;
583 int digits;
584 int base;
585 *outp++ = '&';
586 *outp++ = '#';
587 if (*p<10) {
588 digits = 1;
589 base = 1;
591 else if (*p<100) {
592 digits = 2;
593 base = 10;
595 else if (*p<1000) {
596 digits = 3;
597 base = 100;
599 else if (*p<10000) {
600 digits = 4;
601 base = 1000;
603 else if (*p<100000) {
604 digits = 5;
605 base = 10000;
607 else if (*p<1000000) {
608 digits = 6;
609 base = 100000;
611 else {
612 digits = 7;
613 base = 1000000;
615 while (digits-->0) {
616 *outp++ = '0' + c/base;
617 c %= base;
618 base /= 10;
620 *outp++ = ';';
622 restuple = Py_BuildValue("(Oi)", res, end);
623 Py_DECREF(res);
624 Py_DECREF(object);
625 return restuple;
627 else {
628 wrong_exception_type(exc);
629 return NULL;
633 static Py_UNICODE hexdigits[] = {
634 '0', '1', '2', '3', '4', '5', '6', '7',
635 '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
638 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
640 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
641 PyObject *restuple;
642 PyObject *object;
643 int start;
644 int end;
645 PyObject *res;
646 Py_UNICODE *p;
647 Py_UNICODE *startp;
648 Py_UNICODE *outp;
649 int ressize;
650 if (PyUnicodeEncodeError_GetStart(exc, &start))
651 return NULL;
652 if (PyUnicodeEncodeError_GetEnd(exc, &end))
653 return NULL;
654 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
655 return NULL;
656 startp = PyUnicode_AS_UNICODE(object);
657 for (p = startp+start, ressize = 0; p < startp+end; ++p) {
658 if (*p >= 0x00010000)
659 ressize += 1+1+8;
660 else if (*p >= 0x100) {
661 ressize += 1+1+4;
663 else
664 ressize += 1+1+2;
666 res = PyUnicode_FromUnicode(NULL, ressize);
667 if (res==NULL)
668 return NULL;
669 for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
670 p < startp+end; ++p) {
671 Py_UNICODE c = *p;
672 *outp++ = '\\';
673 if (c >= 0x00010000) {
674 *outp++ = 'U';
675 *outp++ = hexdigits[(c>>28)&0xf];
676 *outp++ = hexdigits[(c>>24)&0xf];
677 *outp++ = hexdigits[(c>>20)&0xf];
678 *outp++ = hexdigits[(c>>16)&0xf];
679 *outp++ = hexdigits[(c>>12)&0xf];
680 *outp++ = hexdigits[(c>>8)&0xf];
682 else if (c >= 0x100) {
683 *outp++ = 'u';
684 *outp++ = hexdigits[(c>>12)&0xf];
685 *outp++ = hexdigits[(c>>8)&0xf];
687 else
688 *outp++ = 'x';
689 *outp++ = hexdigits[(c>>4)&0xf];
690 *outp++ = hexdigits[c&0xf];
693 restuple = Py_BuildValue("(Oi)", res, end);
694 Py_DECREF(res);
695 Py_DECREF(object);
696 return restuple;
698 else {
699 wrong_exception_type(exc);
700 return NULL;
703 #endif
705 static PyObject *strict_errors(PyObject *self, PyObject *exc)
707 return PyCodec_StrictErrors(exc);
711 #ifdef Py_USING_UNICODE
712 static PyObject *ignore_errors(PyObject *self, PyObject *exc)
714 return PyCodec_IgnoreErrors(exc);
718 static PyObject *replace_errors(PyObject *self, PyObject *exc)
720 return PyCodec_ReplaceErrors(exc);
724 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
726 return PyCodec_XMLCharRefReplaceErrors(exc);
730 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
732 return PyCodec_BackslashReplaceErrors(exc);
734 #endif
736 static int _PyCodecRegistry_Init(void)
738 static struct {
739 char *name;
740 PyMethodDef def;
741 } methods[] =
744 "strict",
746 "strict_errors",
747 strict_errors,
748 METH_O
751 #ifdef Py_USING_UNICODE
753 "ignore",
755 "ignore_errors",
756 ignore_errors,
757 METH_O
761 "replace",
763 "replace_errors",
764 replace_errors,
765 METH_O
769 "xmlcharrefreplace",
771 "xmlcharrefreplace_errors",
772 xmlcharrefreplace_errors,
773 METH_O
777 "backslashreplace",
779 "backslashreplace_errors",
780 backslashreplace_errors,
781 METH_O
784 #endif
787 PyInterpreterState *interp = PyThreadState_Get()->interp;
788 PyObject *mod;
789 int i;
791 if (interp->codec_search_path != NULL)
792 return 0;
794 interp->codec_search_path = PyList_New(0);
795 interp->codec_search_cache = PyDict_New();
796 interp->codec_error_registry = PyDict_New();
798 if (interp->codec_error_registry) {
799 for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
800 PyObject *func = PyCFunction_New(&methods[i].def, NULL);
801 int res;
802 if (!func)
803 Py_FatalError("can't initialize codec error registry");
804 res = PyCodec_RegisterError(methods[i].name, func);
805 Py_DECREF(func);
806 if (res)
807 Py_FatalError("can't initialize codec error registry");
811 if (interp->codec_search_path == NULL ||
812 interp->codec_search_cache == NULL ||
813 interp->codec_error_registry == NULL)
814 Py_FatalError("can't initialize codec registry");
816 mod = PyImport_ImportModuleEx("encodings", NULL, NULL, NULL);
817 if (mod == NULL) {
818 if (PyErr_ExceptionMatches(PyExc_ImportError)) {
819 /* Ignore ImportErrors... this is done so that
820 distributions can disable the encodings package. Note
821 that other errors are not masked, e.g. SystemErrors
822 raised to inform the user of an error in the Python
823 configuration are still reported back to the user. */
824 PyErr_Clear();
825 return 0;
827 return -1;
829 Py_DECREF(mod);
830 return 0;