This commit was manufactured by cvs2svn to create tag 'r23b1-mac'.
[python/dscho.git] / Objects / unicodeobject.c
blob096dfcb7c9afc39059129eb765bca180438f3b8e
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WINDOWS
45 #include <windows.h>
46 #endif
/* Limit for the Unicode object free list */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized word of each parked object is reused as the "next"
   link (see _PyUnicode_New() and unicode_dealloc()). */
static PyUnicodeObject *unicode_freelist;
/* Number of objects currently parked on unicode_freelist. */
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are filled in lazily by
   PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
137 PyErr_SetString(PyExc_SystemError,
138 "can't resize shared unicode objects");
139 return -1;
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
151 unicode->str[length] = 0;
152 unicode->length = length;
154 reset:
155 /* Reset the object caches */
156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
160 unicode->hash = -1;
162 return 0;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
173 static
174 PyUnicodeObject *_PyUnicode_New(int length)
176 register PyUnicodeObject *unicode;
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
187 unicode_freelist = *(PyUnicodeObject **)unicode;
188 unicode_freelist_size--;
189 if (unicode->str) {
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
193 unicode_resize(unicode, length)) {
194 PyMem_DEL(unicode->str);
195 goto onError;
198 else {
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
201 PyObject_INIT(unicode, &PyUnicode_Type);
203 else {
204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
210 if (!unicode->str) {
211 PyErr_NoMemory();
212 goto onError;
214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
217 unicode->defenc = NULL;
218 return unicode;
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
222 PyObject_Del(unicode);
223 return NULL;
226 static
227 void unicode_dealloc(register PyUnicodeObject *unicode)
229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233 PyMem_DEL(unicode->str);
234 unicode->str = NULL;
235 unicode->length = 0;
237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
241 /* Add to free list */
242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
246 else {
247 PyMem_DEL(unicode->str);
248 Py_XDECREF(unicode->defenc);
249 unicode->ob_type->tp_free((PyObject *)unicode);
253 int PyUnicode_Resize(PyObject **unicode,
254 int length)
256 register PyUnicodeObject *v;
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
265 PyErr_BadInternalCall();
266 return -1;
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 Py_DECREF(*unicode);
280 *unicode = (PyObject *)w;
281 return 0;
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v, length);
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address
   of the variable (typically a PyUnicodeObject ** in this file) to
   PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
293 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
294 int size)
296 PyUnicodeObject *unicode;
298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
300 if (u != NULL) {
302 /* Optimization for empty strings */
303 if (size == 0 && unicode_empty != NULL) {
304 Py_INCREF(unicode_empty);
305 return (PyObject *)unicode_empty;
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size == 1 && *u < 256) {
311 unicode = unicode_latin1[*u];
312 if (!unicode) {
313 unicode = _PyUnicode_New(1);
314 if (!unicode)
315 return NULL;
316 unicode->str[0] = *u;
317 unicode_latin1[*u] = unicode;
319 Py_INCREF(unicode);
320 return (PyObject *)unicode;
324 unicode = _PyUnicode_New(size);
325 if (!unicode)
326 return NULL;
328 /* Copy the Unicode data into the new object */
329 if (u != NULL)
330 Py_UNICODE_COPY(unicode->str, u, size);
332 return (PyObject *)unicode;
335 #ifdef HAVE_WCHAR_H
337 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
338 int size)
340 PyUnicodeObject *unicode;
342 if (w == NULL) {
343 PyErr_BadInternalCall();
344 return NULL;
347 unicode = _PyUnicode_New(size);
348 if (!unicode)
349 return NULL;
351 /* Copy the wchar_t data into the new object */
352 #ifdef HAVE_USABLE_WCHAR_T
353 memcpy(unicode->str, w, size * sizeof(wchar_t));
354 #else
356 register Py_UNICODE *u;
357 register int i;
358 u = PyUnicode_AS_UNICODE(unicode);
359 for (i = size; i >= 0; i--)
360 *u++ = *w++;
362 #endif
364 return (PyObject *)unicode;
367 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
368 register wchar_t *w,
369 int size)
371 if (unicode == NULL) {
372 PyErr_BadInternalCall();
373 return -1;
375 if (size > PyUnicode_GET_SIZE(unicode))
376 size = PyUnicode_GET_SIZE(unicode);
377 #ifdef HAVE_USABLE_WCHAR_T
378 memcpy(w, unicode->str, size * sizeof(wchar_t));
379 #else
381 register Py_UNICODE *u;
382 register int i;
383 u = PyUnicode_AS_UNICODE(unicode);
384 for (i = size; i >= 0; i--)
385 *w++ = *u++;
387 #endif
389 return size;
392 #endif
394 PyObject *PyUnicode_FromOrdinal(int ordinal)
396 Py_UNICODE s[2];
398 #ifdef Py_UNICODE_WIDE
399 if (ordinal < 0 || ordinal > 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
403 return NULL;
405 #else
406 if (ordinal < 0 || ordinal > 0xffff) {
407 PyErr_SetString(PyExc_ValueError,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
410 return NULL;
412 #endif
414 if (ordinal <= 0xffff) {
415 /* UCS-2 character */
416 s[0] = (Py_UNICODE) ordinal;
417 return PyUnicode_FromUnicode(s, 1);
419 else {
420 #ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
422 ordinal -= 0x10000L;
423 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
424 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
425 return PyUnicode_FromUnicode(s, 2);
426 #else
427 s[0] = (Py_UNICODE)ordinal;
428 return PyUnicode_FromUnicode(s, 1);
429 #endif
433 PyObject *PyUnicode_FromObject(register PyObject *obj)
435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj)) {
438 Py_INCREF(obj);
439 return obj;
441 if (PyUnicode_Check(obj)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
445 PyUnicode_GET_SIZE(obj));
447 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
450 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
451 const char *encoding,
452 const char *errors)
454 const char *s = NULL;
455 int len;
456 PyObject *v;
458 if (obj == NULL) {
459 PyErr_BadInternalCall();
460 return NULL;
463 #if 0
464 /* For b/w compatibility we also accept Unicode objects provided
465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
467 Unicode subclasses.
469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
473 if (PyUnicode_Check(obj)) {
474 if (encoding) {
475 PyErr_SetString(PyExc_TypeError,
476 "decoding Unicode is not supported");
477 return NULL;
479 return PyObject_Unicode(obj);
481 #else
482 if (PyUnicode_Check(obj)) {
483 PyErr_SetString(PyExc_TypeError,
484 "decoding Unicode is not supported");
485 return NULL;
487 #endif
489 /* Coerce object */
490 if (PyString_Check(obj)) {
491 s = PyString_AS_STRING(obj);
492 len = PyString_GET_SIZE(obj);
494 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError))
498 PyErr_Format(PyExc_TypeError,
499 "coercing to Unicode: need string or buffer, "
500 "%.80s found",
501 obj->ob_type->tp_name);
502 goto onError;
505 /* Convert to Unicode */
506 if (len == 0) {
507 Py_INCREF(unicode_empty);
508 v = (PyObject *)unicode_empty;
510 else
511 v = PyUnicode_Decode(s, len, encoding, errors);
513 return v;
515 onError:
516 return NULL;
519 PyObject *PyUnicode_Decode(const char *s,
520 int size,
521 const char *encoding,
522 const char *errors)
524 PyObject *buffer = NULL, *unicode;
526 if (encoding == NULL)
527 encoding = PyUnicode_GetDefaultEncoding();
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding, "utf-8") == 0)
531 return PyUnicode_DecodeUTF8(s, size, errors);
532 else if (strcmp(encoding, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s, size, errors);
534 else if (strcmp(encoding, "ascii") == 0)
535 return PyUnicode_DecodeASCII(s, size, errors);
537 /* Decode via the codec registry */
538 buffer = PyBuffer_FromMemory((void *)s, size);
539 if (buffer == NULL)
540 goto onError;
541 unicode = PyCodec_Decode(buffer, encoding, errors);
542 if (unicode == NULL)
543 goto onError;
544 if (!PyUnicode_Check(unicode)) {
545 PyErr_Format(PyExc_TypeError,
546 "decoder did not return an unicode object (type=%.400s)",
547 unicode->ob_type->tp_name);
548 Py_DECREF(unicode);
549 goto onError;
551 Py_DECREF(buffer);
552 return unicode;
554 onError:
555 Py_XDECREF(buffer);
556 return NULL;
559 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
560 int size,
561 const char *encoding,
562 const char *errors)
564 PyObject *v, *unicode;
566 unicode = PyUnicode_FromUnicode(s, size);
567 if (unicode == NULL)
568 return NULL;
569 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
570 Py_DECREF(unicode);
571 return v;
574 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
575 const char *encoding,
576 const char *errors)
578 PyObject *v;
580 if (!PyUnicode_Check(unicode)) {
581 PyErr_BadArgument();
582 goto onError;
585 if (encoding == NULL)
586 encoding = PyUnicode_GetDefaultEncoding();
588 /* Shortcuts for common default encodings */
589 if (errors == NULL) {
590 if (strcmp(encoding, "utf-8") == 0)
591 return PyUnicode_AsUTF8String(unicode);
592 else if (strcmp(encoding, "latin-1") == 0)
593 return PyUnicode_AsLatin1String(unicode);
594 else if (strcmp(encoding, "ascii") == 0)
595 return PyUnicode_AsASCIIString(unicode);
598 /* Encode via the codec registry */
599 v = PyCodec_Encode(unicode, encoding, errors);
600 if (v == NULL)
601 goto onError;
602 /* XXX Should we really enforce this ? */
603 if (!PyString_Check(v)) {
604 PyErr_Format(PyExc_TypeError,
605 "encoder did not return a string object (type=%.400s)",
606 v->ob_type->tp_name);
607 Py_DECREF(v);
608 goto onError;
610 return v;
612 onError:
613 return NULL;
616 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
617 const char *errors)
619 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
621 if (v)
622 return v;
623 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
624 if (v && errors == NULL)
625 ((PyUnicodeObject *)unicode)->defenc = v;
626 return v;
629 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
631 if (!PyUnicode_Check(unicode)) {
632 PyErr_BadArgument();
633 goto onError;
635 return PyUnicode_AS_UNICODE(unicode);
637 onError:
638 return NULL;
641 int PyUnicode_GetSize(PyObject *unicode)
643 if (!PyUnicode_Check(unicode)) {
644 PyErr_BadArgument();
645 goto onError;
647 return PyUnicode_GET_SIZE(unicode);
649 onError:
650 return -1;
653 const char *PyUnicode_GetDefaultEncoding(void)
655 return unicode_default_encoding;
658 int PyUnicode_SetDefaultEncoding(const char *encoding)
660 PyObject *v;
662 /* Make sure the encoding is valid. As side effect, this also
663 loads the encoding into the codec registry cache. */
664 v = _PyCodec_Lookup(encoding);
665 if (v == NULL)
666 goto onError;
667 Py_DECREF(v);
668 strncpy(unicode_default_encoding,
669 encoding,
670 sizeof(unicode_default_encoding));
671 return 0;
673 onError:
674 return -1;
677 /* error handling callback helper:
678 build arguments, call the callback and check the arguments,
679 if no exception occured, copy the replacement to the output
680 and adjust various state variables.
681 return 0 on success, -1 on error
684 static
685 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
686 const char *encoding, const char *reason,
687 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
688 PyObject **output, int *outpos, Py_UNICODE **outptr)
690 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
692 PyObject *restuple = NULL;
693 PyObject *repunicode = NULL;
694 int outsize = PyUnicode_GET_SIZE(*output);
695 int requiredsize;
696 int newpos;
697 Py_UNICODE *repptr;
698 int repsize;
699 int res = -1;
701 if (*errorHandler == NULL) {
702 *errorHandler = PyCodec_LookupError(errors);
703 if (*errorHandler == NULL)
704 goto onError;
707 if (*exceptionObject == NULL) {
708 *exceptionObject = PyUnicodeDecodeError_Create(
709 encoding, input, insize, *startinpos, *endinpos, reason);
710 if (*exceptionObject == NULL)
711 goto onError;
713 else {
714 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
715 goto onError;
716 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
717 goto onError;
718 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
719 goto onError;
722 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
723 if (restuple == NULL)
724 goto onError;
725 if (!PyTuple_Check(restuple)) {
726 PyErr_Format(PyExc_TypeError, &argparse[4]);
727 goto onError;
729 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
730 goto onError;
731 if (newpos<0)
732 newpos = insize+newpos;
733 if (newpos<0 || newpos>insize) {
734 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
735 goto onError;
738 /* need more space? (at least enough for what we
739 have+the replacement+the rest of the string (starting
740 at the new input position), so we won't have to check space
741 when there are no errors in the rest of the string) */
742 repptr = PyUnicode_AS_UNICODE(repunicode);
743 repsize = PyUnicode_GET_SIZE(repunicode);
744 requiredsize = *outpos + repsize + insize-newpos;
745 if (requiredsize > outsize) {
746 if (requiredsize<2*outsize)
747 requiredsize = 2*outsize;
748 if (PyUnicode_Resize(output, requiredsize))
749 goto onError;
750 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
752 *endinpos = newpos;
753 *inptr = input + newpos;
754 Py_UNICODE_COPY(*outptr, repptr, repsize);
755 *outptr += repsize;
756 *outpos += repsize;
757 /* we made it! */
758 res = 0;
760 onError:
761 Py_XDECREF(restuple);
762 return res;
765 /* --- UTF-7 Codec -------------------------------------------------------- */
767 /* see RFC2152 for details */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True when character c has to be base64-encoded: it is above 127 or
   marked 1 in utf7_special, or it is whitespace (2) / Set O (3) and the
   corresponding flag is set.  The c > 127 test short-circuits first, so
   the table is only indexed with values below 128. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 alphabet character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True when c is a character of the base64 alphabet. */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
/* Map a base64 alphabet character back to its 6-bit value. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to `out` for as long as `ch` holds at least 6
   pending bits; `bits` is decremented accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Extract 16-bit units from `ch` for as long as at least 16 pending
   bits are available and append them to `out`.  Units in the range
   0xDC00..0xDFFF are reported as errors: the macro sets the enclosing
   function's `errmsg` and jumps to its `utf7Error` label. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \

823 PyObject *PyUnicode_DecodeUTF7(const char *s,
824 int size,
825 const char *errors)
827 const char *starts = s;
828 int startinpos;
829 int endinpos;
830 int outpos;
831 const char *e;
832 PyUnicodeObject *unicode;
833 Py_UNICODE *p;
834 const char *errmsg = "";
835 int inShift = 0;
836 unsigned int bitsleft = 0;
837 unsigned long charsleft = 0;
838 int surrogate = 0;
839 PyObject *errorHandler = NULL;
840 PyObject *exc = NULL;
842 unicode = _PyUnicode_New(size);
843 if (!unicode)
844 return NULL;
845 if (size == 0)
846 return (PyObject *)unicode;
848 p = unicode->str;
849 e = s + size;
851 while (s < e) {
852 Py_UNICODE ch;
853 restart:
854 ch = *s;
856 if (inShift) {
857 if ((ch == '-') || !B64CHAR(ch)) {
858 inShift = 0;
859 s++;
861 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
862 if (bitsleft >= 6) {
863 /* The shift sequence has a partial character in it. If
864 bitsleft < 6 then we could just classify it as padding
865 but that is not the case here */
867 errmsg = "partial character in shift sequence";
868 goto utf7Error;
870 /* According to RFC2152 the remaining bits should be zero. We
871 choose to signal an error/insert a replacement character
872 here so indicate the potential of a misencoded character. */
874 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
875 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
876 errmsg = "non-zero padding bits in shift sequence";
877 goto utf7Error;
880 if (ch == '-') {
881 if ((s < e) && (*(s) == '-')) {
882 *p++ = '-';
883 inShift = 1;
885 } else if (SPECIAL(ch,0,0)) {
886 errmsg = "unexpected special character";
887 goto utf7Error;
888 } else {
889 *p++ = ch;
891 } else {
892 charsleft = (charsleft << 6) | UB64(ch);
893 bitsleft += 6;
894 s++;
895 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
898 else if ( ch == '+' ) {
899 startinpos = s-starts;
900 s++;
901 if (s < e && *s == '-') {
902 s++;
903 *p++ = '+';
904 } else
906 inShift = 1;
907 bitsleft = 0;
910 else if (SPECIAL(ch,0,0)) {
911 errmsg = "unexpected special character";
912 s++;
913 goto utf7Error;
915 else {
916 *p++ = ch;
917 s++;
919 continue;
920 utf7Error:
921 outpos = p-PyUnicode_AS_UNICODE(unicode);
922 endinpos = s-starts;
923 if (unicode_decode_call_errorhandler(
924 errors, &errorHandler,
925 "utf7", errmsg,
926 starts, size, &startinpos, &endinpos, &exc, &s,
927 (PyObject **)&unicode, &outpos, &p))
928 goto onError;
931 if (inShift) {
932 outpos = p-PyUnicode_AS_UNICODE(unicode);
933 endinpos = size;
934 if (unicode_decode_call_errorhandler(
935 errors, &errorHandler,
936 "utf7", "unterminated shift sequence",
937 starts, size, &startinpos, &endinpos, &exc, &s,
938 (PyObject **)&unicode, &outpos, &p))
939 goto onError;
940 if (s < e)
941 goto restart;
944 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
945 goto onError;
947 Py_XDECREF(errorHandler);
948 Py_XDECREF(exc);
949 return (PyObject *)unicode;
951 onError:
952 Py_XDECREF(errorHandler);
953 Py_XDECREF(exc);
954 Py_DECREF(unicode);
955 return NULL;
959 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
960 int size,
961 int encodeSetO,
962 int encodeWhiteSpace,
963 const char *errors)
965 PyObject *v;
966 /* It might be possible to tighten this worst case */
967 unsigned int cbAllocated = 5 * size;
968 int inShift = 0;
969 int i = 0;
970 unsigned int bitsleft = 0;
971 unsigned long charsleft = 0;
972 char * out;
973 char * start;
975 if (size == 0)
976 return PyString_FromStringAndSize(NULL, 0);
978 v = PyString_FromStringAndSize(NULL, cbAllocated);
979 if (v == NULL)
980 return NULL;
982 start = out = PyString_AS_STRING(v);
983 for (;i < size; ++i) {
984 Py_UNICODE ch = s[i];
986 if (!inShift) {
987 if (ch == '+') {
988 *out++ = '+';
989 *out++ = '-';
990 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
991 charsleft = ch;
992 bitsleft = 16;
993 *out++ = '+';
994 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
995 inShift = bitsleft > 0;
996 } else {
997 *out++ = (char) ch;
999 } else {
1000 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1001 *out++ = B64(charsleft << (6-bitsleft));
1002 charsleft = 0;
1003 bitsleft = 0;
1004 /* Characters not in the BASE64 set implicitly unshift the sequence
1005 so no '-' is required, except if the character is itself a '-' */
1006 if (B64CHAR(ch) || ch == '-') {
1007 *out++ = '-';
1009 inShift = 0;
1010 *out++ = (char) ch;
1011 } else {
1012 bitsleft += 16;
1013 charsleft = (charsleft << 16) | ch;
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1016 /* If the next character is special then we dont' need to terminate
1017 the shift sequence. If the next character is not a BASE64 character
1018 or '-' then the shift sequence will be terminated implicitly and we
1019 don't have to insert a '-'. */
1021 if (bitsleft == 0) {
1022 if (i + 1 < size) {
1023 Py_UNICODE ch2 = s[i+1];
1025 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1027 } else if (B64CHAR(ch2) || ch2 == '-') {
1028 *out++ = '-';
1029 inShift = 0;
1030 } else {
1031 inShift = 0;
1035 else {
1036 *out++ = '-';
1037 inShift = 0;
1043 if (bitsleft) {
1044 *out++= B64(charsleft << (6-bitsleft) );
1045 *out++ = '-';
1048 _PyString_Resize(&v, out - start);
1049 return v;
1052 #undef SPECIAL
1053 #undef B64
1054 #undef B64CHAR
1055 #undef UB64
1056 #undef ENCODE
1057 #undef DECODE
1059 /* --- UTF-8 Codec -------------------------------------------------------- */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four, five and six byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1083 PyObject *PyUnicode_DecodeUTF8(const char *s,
1084 int size,
1085 const char *errors)
1087 const char *starts = s;
1088 int n;
1089 int startinpos;
1090 int endinpos;
1091 int outpos;
1092 const char *e;
1093 PyUnicodeObject *unicode;
1094 Py_UNICODE *p;
1095 const char *errmsg = "";
1096 PyObject *errorHandler = NULL;
1097 PyObject *exc = NULL;
1099 /* Note: size will always be longer than the resulting Unicode
1100 character count */
1101 unicode = _PyUnicode_New(size);
1102 if (!unicode)
1103 return NULL;
1104 if (size == 0)
1105 return (PyObject *)unicode;
1107 /* Unpack UTF-8 encoded data */
1108 p = unicode->str;
1109 e = s + size;
1111 while (s < e) {
1112 Py_UCS4 ch = (unsigned char)*s;
1114 if (ch < 0x80) {
1115 *p++ = (Py_UNICODE)ch;
1116 s++;
1117 continue;
1120 n = utf8_code_length[ch];
1122 if (s + n > e) {
1123 errmsg = "unexpected end of data";
1124 startinpos = s-starts;
1125 endinpos = size;
1126 goto utf8Error;
1129 switch (n) {
1131 case 0:
1132 errmsg = "unexpected code byte";
1133 startinpos = s-starts;
1134 endinpos = startinpos+1;
1135 goto utf8Error;
1137 case 1:
1138 errmsg = "internal error";
1139 startinpos = s-starts;
1140 endinpos = startinpos+1;
1141 goto utf8Error;
1143 case 2:
1144 if ((s[1] & 0xc0) != 0x80) {
1145 errmsg = "invalid data";
1146 startinpos = s-starts;
1147 endinpos = startinpos+2;
1148 goto utf8Error;
1150 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1151 if (ch < 0x80) {
1152 startinpos = s-starts;
1153 endinpos = startinpos+2;
1154 errmsg = "illegal encoding";
1155 goto utf8Error;
1157 else
1158 *p++ = (Py_UNICODE)ch;
1159 break;
1161 case 3:
1162 if ((s[1] & 0xc0) != 0x80 ||
1163 (s[2] & 0xc0) != 0x80) {
1164 errmsg = "invalid data";
1165 startinpos = s-starts;
1166 endinpos = startinpos+3;
1167 goto utf8Error;
1169 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1170 if (ch < 0x0800) {
1171 /* Note: UTF-8 encodings of surrogates are considered
1172 legal UTF-8 sequences;
1174 XXX For wide builds (UCS-4) we should probably try
1175 to recombine the surrogates into a single code
1176 unit.
1178 errmsg = "illegal encoding";
1179 startinpos = s-starts;
1180 endinpos = startinpos+3;
1181 goto utf8Error;
1183 else
1184 *p++ = (Py_UNICODE)ch;
1185 break;
1187 case 4:
1188 if ((s[1] & 0xc0) != 0x80 ||
1189 (s[2] & 0xc0) != 0x80 ||
1190 (s[3] & 0xc0) != 0x80) {
1191 errmsg = "invalid data";
1192 startinpos = s-starts;
1193 endinpos = startinpos+4;
1194 goto utf8Error;
1196 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1197 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1198 /* validate and convert to UTF-16 */
1199 if ((ch < 0x10000) /* minimum value allowed for 4
1200 byte encoding */
1201 || (ch > 0x10ffff)) /* maximum value allowed for
1202 UTF-16 */
1204 errmsg = "illegal encoding";
1205 startinpos = s-starts;
1206 endinpos = startinpos+4;
1207 goto utf8Error;
1209 #ifdef Py_UNICODE_WIDE
1210 *p++ = (Py_UNICODE)ch;
1211 #else
1212 /* compute and append the two surrogates: */
1214 /* translate from 10000..10FFFF to 0..FFFF */
1215 ch -= 0x10000;
1217 /* high surrogate = top 10 bits added to D800 */
1218 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1220 /* low surrogate = bottom 10 bits added to DC00 */
1221 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1222 #endif
1223 break;
1225 default:
1226 /* Other sizes are only needed for UCS-4 */
1227 errmsg = "unsupported Unicode code range";
1228 startinpos = s-starts;
1229 endinpos = startinpos+n;
1230 goto utf8Error;
1232 s += n;
1233 continue;
1235 utf8Error:
1236 outpos = p-PyUnicode_AS_UNICODE(unicode);
1237 if (unicode_decode_call_errorhandler(
1238 errors, &errorHandler,
1239 "utf8", errmsg,
1240 starts, size, &startinpos, &endinpos, &exc, &s,
1241 (PyObject **)&unicode, &outpos, &p))
1242 goto onError;
1245 /* Adjust length */
1246 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1247 goto onError;
1249 Py_XDECREF(errorHandler);
1250 Py_XDECREF(exc);
1251 return (PyObject *)unicode;
1253 onError:
1254 Py_XDECREF(errorHandler);
1255 Py_XDECREF(exc);
1256 Py_DECREF(unicode);
1257 return NULL;
1260 /* Allocation strategy: if the string is short, convert into a stack buffer
1261 and allocate exactly as much space needed at the end. Else allocate the
1262 maximum possible needed (4 result bytes per Unicode character), and return
1263 the excess memory at the end.
1265 PyObject *
1266 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1267 int size,
1268 const char *errors)
1270 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1272 int i; /* index into s of next input byte */
1273 PyObject *v; /* result string object */
1274 char *p; /* next free byte in output buffer */
1275 int nallocated; /* number of result bytes allocated */
1276 int nneeded; /* number of result bytes needed */
1277 char stackbuf[MAX_SHORT_UNICHARS * 4];
1279 assert(s != NULL);
1280 assert(size >= 0);
1282 if (size <= MAX_SHORT_UNICHARS) {
1283 /* Write into the stack buffer; nallocated can't overflow.
1284 * At the end, we'll allocate exactly as much heap space as it
1285 * turns out we need.
1287 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1288 v = NULL; /* will allocate after we're done */
1289 p = stackbuf;
1291 else {
1292 /* Overallocate on the heap, and give the excess back at the end. */
1293 nallocated = size * 4;
1294 if (nallocated / 4 != size) /* overflow! */
1295 return PyErr_NoMemory();
1296 v = PyString_FromStringAndSize(NULL, nallocated);
1297 if (v == NULL)
1298 return NULL;
1299 p = PyString_AS_STRING(v);
1302 for (i = 0; i < size;) {
1303 Py_UCS4 ch = s[i++];
1305 if (ch < 0x80)
1306 /* Encode ASCII */
1307 *p++ = (char) ch;
1309 else if (ch < 0x0800) {
1310 /* Encode Latin-1 */
1311 *p++ = (char)(0xc0 | (ch >> 6));
1312 *p++ = (char)(0x80 | (ch & 0x3f));
1314 else {
1315 /* Encode UCS2 Unicode ordinals */
1316 if (ch < 0x10000) {
1317 /* Special case: check for high surrogate */
1318 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1319 Py_UCS4 ch2 = s[i];
1320 /* Check for low surrogate and combine the two to
1321 form a UCS4 value */
1322 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1323 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1324 i++;
1325 goto encodeUCS4;
1327 /* Fall through: handles isolated high surrogates */
1329 *p++ = (char)(0xe0 | (ch >> 12));
1330 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1331 *p++ = (char)(0x80 | (ch & 0x3f));
1332 continue;
1334 encodeUCS4:
1335 /* Encode UCS4 Unicode ordinals */
1336 *p++ = (char)(0xf0 | (ch >> 18));
1337 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1338 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339 *p++ = (char)(0x80 | (ch & 0x3f));
1343 if (v == NULL) {
1344 /* This was stack allocated. */
1345 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1346 assert(nneeded <= nallocated);
1347 v = PyString_FromStringAndSize(stackbuf, nneeded);
1349 else {
1350 /* Cut back to size actually needed. */
1351 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1352 assert(nneeded <= nallocated);
1353 _PyString_Resize(&v, nneeded);
1355 return v;
1357 #undef MAX_SHORT_UNICHARS
1360 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1362 if (!PyUnicode_Check(unicode)) {
1363 PyErr_BadArgument();
1364 return NULL;
1366 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1367 PyUnicode_GET_SIZE(unicode),
1368 NULL);
1371 /* --- UTF-16 Codec ------------------------------------------------------- */
1373 PyObject *
1374 PyUnicode_DecodeUTF16(const char *s,
1375 int size,
1376 const char *errors,
1377 int *byteorder)
1379 const char *starts = s;
1380 int startinpos;
1381 int endinpos;
1382 int outpos;
1383 PyUnicodeObject *unicode;
1384 Py_UNICODE *p;
1385 const unsigned char *q, *e;
1386 int bo = 0; /* assume native ordering by default */
1387 const char *errmsg = "";
1388 /* Offsets from q for retrieving byte pairs in the right order. */
1389 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1390 int ihi = 1, ilo = 0;
1391 #else
1392 int ihi = 0, ilo = 1;
1393 #endif
1394 PyObject *errorHandler = NULL;
1395 PyObject *exc = NULL;
1397 /* Note: size will always be longer than the resulting Unicode
1398 character count */
1399 unicode = _PyUnicode_New(size);
1400 if (!unicode)
1401 return NULL;
1402 if (size == 0)
1403 return (PyObject *)unicode;
1405 /* Unpack UTF-16 encoded data */
1406 p = unicode->str;
1407 q = (unsigned char *)s;
1408 e = q + size;
1410 if (byteorder)
1411 bo = *byteorder;
1413 /* Check for BOM marks (U+FEFF) in the input and adjust current
1414 byte order setting accordingly. In native mode, the leading BOM
1415 mark is skipped, in all other modes, it is copied to the output
1416 stream as-is (giving a ZWNBSP character). */
1417 if (bo == 0) {
1418 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1419 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1420 if (bom == 0xFEFF) {
1421 q += 2;
1422 bo = -1;
1424 else if (bom == 0xFFFE) {
1425 q += 2;
1426 bo = 1;
1428 #else
1429 if (bom == 0xFEFF) {
1430 q += 2;
1431 bo = 1;
1433 else if (bom == 0xFFFE) {
1434 q += 2;
1435 bo = -1;
1437 #endif
1440 if (bo == -1) {
1441 /* force LE */
1442 ihi = 1;
1443 ilo = 0;
1445 else if (bo == 1) {
1446 /* force BE */
1447 ihi = 0;
1448 ilo = 1;
1451 while (q < e) {
1452 Py_UNICODE ch;
1453 /* remaing bytes at the end? (size should be even) */
1454 if (e-q<2) {
1455 errmsg = "truncated data";
1456 startinpos = ((const char *)q)-starts;
1457 endinpos = ((const char *)e)-starts;
1458 goto utf16Error;
1459 /* The remaining input chars are ignored if the callback
1460 chooses to skip the input */
1462 ch = (q[ihi] << 8) | q[ilo];
1464 q += 2;
1466 if (ch < 0xD800 || ch > 0xDFFF) {
1467 *p++ = ch;
1468 continue;
1471 /* UTF-16 code pair: */
1472 if (q >= e) {
1473 errmsg = "unexpected end of data";
1474 startinpos = (((const char *)q)-2)-starts;
1475 endinpos = ((const char *)e)-starts;
1476 goto utf16Error;
1478 if (0xD800 <= ch && ch <= 0xDBFF) {
1479 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1480 q += 2;
1481 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1482 #ifndef Py_UNICODE_WIDE
1483 *p++ = ch;
1484 *p++ = ch2;
1485 #else
1486 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1487 #endif
1488 continue;
1490 else {
1491 errmsg = "illegal UTF-16 surrogate";
1492 startinpos = (((const char *)q)-4)-starts;
1493 endinpos = startinpos+2;
1494 goto utf16Error;
1498 errmsg = "illegal encoding";
1499 startinpos = (((const char *)q)-2)-starts;
1500 endinpos = startinpos+2;
1501 /* Fall through to report the error */
1503 utf16Error:
1504 outpos = p-PyUnicode_AS_UNICODE(unicode);
1505 if (unicode_decode_call_errorhandler(
1506 errors, &errorHandler,
1507 "utf16", errmsg,
1508 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1509 (PyObject **)&unicode, &outpos, &p))
1510 goto onError;
1513 if (byteorder)
1514 *byteorder = bo;
1516 /* Adjust length */
1517 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1518 goto onError;
1520 Py_XDECREF(errorHandler);
1521 Py_XDECREF(exc);
1522 return (PyObject *)unicode;
1524 onError:
1525 Py_DECREF(unicode);
1526 Py_XDECREF(errorHandler);
1527 Py_XDECREF(exc);
1528 return NULL;
1531 PyObject *
1532 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1533 int size,
1534 const char *errors,
1535 int byteorder)
1537 PyObject *v;
1538 unsigned char *p;
1539 int i, pairs;
1540 /* Offsets from p for storing byte pairs in the right order. */
1541 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542 int ihi = 1, ilo = 0;
1543 #else
1544 int ihi = 0, ilo = 1;
1545 #endif
1547 #define STORECHAR(CH) \
1548 do { \
1549 p[ihi] = ((CH) >> 8) & 0xff; \
1550 p[ilo] = (CH) & 0xff; \
1551 p += 2; \
1552 } while(0)
1554 for (i = pairs = 0; i < size; i++)
1555 if (s[i] >= 0x10000)
1556 pairs++;
1557 v = PyString_FromStringAndSize(NULL,
1558 2 * (size + pairs + (byteorder == 0)));
1559 if (v == NULL)
1560 return NULL;
1562 p = (unsigned char *)PyString_AS_STRING(v);
1563 if (byteorder == 0)
1564 STORECHAR(0xFEFF);
1565 if (size == 0)
1566 return v;
1568 if (byteorder == -1) {
1569 /* force LE */
1570 ihi = 1;
1571 ilo = 0;
1573 else if (byteorder == 1) {
1574 /* force BE */
1575 ihi = 0;
1576 ilo = 1;
1579 while (size-- > 0) {
1580 Py_UNICODE ch = *s++;
1581 Py_UNICODE ch2 = 0;
1582 if (ch >= 0x10000) {
1583 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1584 ch = 0xD800 | ((ch-0x10000) >> 10);
1586 STORECHAR(ch);
1587 if (ch2)
1588 STORECHAR(ch2);
1590 return v;
1591 #undef STORECHAR
1594 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1596 if (!PyUnicode_Check(unicode)) {
1597 PyErr_BadArgument();
1598 return NULL;
1600 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1601 PyUnicode_GET_SIZE(unicode),
1602 NULL,
1606 /* --- Unicode Escape Codec ----------------------------------------------- */
1608 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1610 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1611 int size,
1612 const char *errors)
1614 const char *starts = s;
1615 int startinpos;
1616 int endinpos;
1617 int outpos;
1618 int i;
1619 PyUnicodeObject *v;
1620 Py_UNICODE *p;
1621 const char *end;
1622 char* message;
1623 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1624 PyObject *errorHandler = NULL;
1625 PyObject *exc = NULL;
1627 /* Escaped strings will always be longer than the resulting
1628 Unicode string, so we start with size here and then reduce the
1629 length after conversion to the true value.
1630 (but if the error callback returns a long replacement string
1631 we'll have to allocate more space) */
1632 v = _PyUnicode_New(size);
1633 if (v == NULL)
1634 goto onError;
1635 if (size == 0)
1636 return (PyObject *)v;
1638 p = PyUnicode_AS_UNICODE(v);
1639 end = s + size;
1641 while (s < end) {
1642 unsigned char c;
1643 Py_UNICODE x;
1644 int digits;
1646 /* Non-escape characters are interpreted as Unicode ordinals */
1647 if (*s != '\\') {
1648 *p++ = (unsigned char) *s++;
1649 continue;
1652 startinpos = s-starts;
1653 /* \ - Escapes */
1654 s++;
1655 switch (*s++) {
1657 /* \x escapes */
1658 case '\n': break;
1659 case '\\': *p++ = '\\'; break;
1660 case '\'': *p++ = '\''; break;
1661 case '\"': *p++ = '\"'; break;
1662 case 'b': *p++ = '\b'; break;
1663 case 'f': *p++ = '\014'; break; /* FF */
1664 case 't': *p++ = '\t'; break;
1665 case 'n': *p++ = '\n'; break;
1666 case 'r': *p++ = '\r'; break;
1667 case 'v': *p++ = '\013'; break; /* VT */
1668 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1670 /* \OOO (octal) escapes */
1671 case '0': case '1': case '2': case '3':
1672 case '4': case '5': case '6': case '7':
1673 x = s[-1] - '0';
1674 if ('0' <= *s && *s <= '7') {
1675 x = (x<<3) + *s++ - '0';
1676 if ('0' <= *s && *s <= '7')
1677 x = (x<<3) + *s++ - '0';
1679 *p++ = x;
1680 break;
1682 /* hex escapes */
1683 /* \xXX */
1684 case 'x':
1685 digits = 2;
1686 message = "truncated \\xXX escape";
1687 goto hexescape;
1689 /* \uXXXX */
1690 case 'u':
1691 digits = 4;
1692 message = "truncated \\uXXXX escape";
1693 goto hexescape;
1695 /* \UXXXXXXXX */
1696 case 'U':
1697 digits = 8;
1698 message = "truncated \\UXXXXXXXX escape";
1699 hexescape:
1700 chr = 0;
1701 outpos = p-PyUnicode_AS_UNICODE(v);
1702 if (s+digits>end) {
1703 endinpos = size;
1704 if (unicode_decode_call_errorhandler(
1705 errors, &errorHandler,
1706 "unicodeescape", "end of string in escape sequence",
1707 starts, size, &startinpos, &endinpos, &exc, &s,
1708 (PyObject **)&v, &outpos, &p))
1709 goto onError;
1710 goto nextByte;
1712 for (i = 0; i < digits; ++i) {
1713 c = (unsigned char) s[i];
1714 if (!isxdigit(c)) {
1715 endinpos = (s+i+1)-starts;
1716 if (unicode_decode_call_errorhandler(
1717 errors, &errorHandler,
1718 "unicodeescape", message,
1719 starts, size, &startinpos, &endinpos, &exc, &s,
1720 (PyObject **)&v, &outpos, &p))
1721 goto onError;
1722 goto nextByte;
1724 chr = (chr<<4) & ~0xF;
1725 if (c >= '0' && c <= '9')
1726 chr += c - '0';
1727 else if (c >= 'a' && c <= 'f')
1728 chr += 10 + c - 'a';
1729 else
1730 chr += 10 + c - 'A';
1732 s += i;
1733 if (chr == 0xffffffff)
1734 /* _decoding_error will have already written into the
1735 target buffer. */
1736 break;
1737 store:
1738 /* when we get here, chr is a 32-bit unicode character */
1739 if (chr <= 0xffff)
1740 /* UCS-2 character */
1741 *p++ = (Py_UNICODE) chr;
1742 else if (chr <= 0x10ffff) {
1743 /* UCS-4 character. Either store directly, or as
1744 surrogate pair. */
1745 #ifdef Py_UNICODE_WIDE
1746 *p++ = chr;
1747 #else
1748 chr -= 0x10000L;
1749 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1750 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1751 #endif
1752 } else {
1753 endinpos = s-starts;
1754 outpos = p-PyUnicode_AS_UNICODE(v);
1755 if (unicode_decode_call_errorhandler(
1756 errors, &errorHandler,
1757 "unicodeescape", "illegal Unicode character",
1758 starts, size, &startinpos, &endinpos, &exc, &s,
1759 (PyObject **)&v, &outpos, &p))
1760 goto onError;
1762 break;
1764 /* \N{name} */
1765 case 'N':
1766 message = "malformed \\N character escape";
1767 if (ucnhash_CAPI == NULL) {
1768 /* load the unicode data module */
1769 PyObject *m, *v;
1770 m = PyImport_ImportModule("unicodedata");
1771 if (m == NULL)
1772 goto ucnhashError;
1773 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1774 Py_DECREF(m);
1775 if (v == NULL)
1776 goto ucnhashError;
1777 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1778 Py_DECREF(v);
1779 if (ucnhash_CAPI == NULL)
1780 goto ucnhashError;
1782 if (*s == '{') {
1783 const char *start = s+1;
1784 /* look for the closing brace */
1785 while (*s != '}' && s < end)
1786 s++;
1787 if (s > start && s < end && *s == '}') {
1788 /* found a name. look it up in the unicode database */
1789 message = "unknown Unicode character name";
1790 s++;
1791 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1792 goto store;
1795 endinpos = s-starts;
1796 outpos = p-PyUnicode_AS_UNICODE(v);
1797 if (unicode_decode_call_errorhandler(
1798 errors, &errorHandler,
1799 "unicodeescape", message,
1800 starts, size, &startinpos, &endinpos, &exc, &s,
1801 (PyObject **)&v, &outpos, &p))
1802 goto onError;
1803 break;
1805 default:
1806 if (s > end) {
1807 message = "\\ at end of string";
1808 s--;
1809 endinpos = s-starts;
1810 outpos = p-PyUnicode_AS_UNICODE(v);
1811 if (unicode_decode_call_errorhandler(
1812 errors, &errorHandler,
1813 "unicodeescape", message,
1814 starts, size, &startinpos, &endinpos, &exc, &s,
1815 (PyObject **)&v, &outpos, &p))
1816 goto onError;
1818 else {
1819 *p++ = '\\';
1820 *p++ = (unsigned char)s[-1];
1822 break;
1824 nextByte:
1827 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1828 goto onError;
1829 return (PyObject *)v;
1831 ucnhashError:
1832 PyErr_SetString(
1833 PyExc_UnicodeError,
1834 "\\N escapes not supported (can't load unicodedata module)"
1836 Py_XDECREF(errorHandler);
1837 Py_XDECREF(exc);
1838 return NULL;
1840 onError:
1841 Py_XDECREF(v);
1842 Py_XDECREF(errorHandler);
1843 Py_XDECREF(exc);
1844 return NULL;
1847 /* Return a Unicode-Escape string version of the Unicode object.
1849 If quotes is true, the string is enclosed in u"" or u'' quotes as
1850 appropriate.
1854 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1855 int size,
1856 Py_UNICODE ch);
1858 static
1859 PyObject *unicodeescape_string(const Py_UNICODE *s,
1860 int size,
1861 int quotes)
1863 PyObject *repr;
1864 char *p;
1866 static const char *hexdigit = "0123456789abcdef";
1868 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1869 if (repr == NULL)
1870 return NULL;
1872 p = PyString_AS_STRING(repr);
1874 if (quotes) {
1875 *p++ = 'u';
1876 *p++ = (findchar(s, size, '\'') &&
1877 !findchar(s, size, '"')) ? '"' : '\'';
1879 while (size-- > 0) {
1880 Py_UNICODE ch = *s++;
1882 /* Escape quotes */
1883 if (quotes &&
1884 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1885 *p++ = '\\';
1886 *p++ = (char) ch;
1887 continue;
1890 #ifdef Py_UNICODE_WIDE
1891 /* Map 21-bit characters to '\U00xxxxxx' */
1892 else if (ch >= 0x10000) {
1893 int offset = p - PyString_AS_STRING(repr);
1895 /* Resize the string if necessary */
1896 if (offset + 12 > PyString_GET_SIZE(repr)) {
1897 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1898 return NULL;
1899 p = PyString_AS_STRING(repr) + offset;
1902 *p++ = '\\';
1903 *p++ = 'U';
1904 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1905 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1906 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1907 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1908 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1909 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1910 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1911 *p++ = hexdigit[ch & 0x0000000F];
1912 continue;
1914 #endif
1915 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1916 else if (ch >= 0xD800 && ch < 0xDC00) {
1917 Py_UNICODE ch2;
1918 Py_UCS4 ucs;
1920 ch2 = *s++;
1921 size--;
1922 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1923 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1924 *p++ = '\\';
1925 *p++ = 'U';
1926 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1933 *p++ = hexdigit[ucs & 0x0000000F];
1934 continue;
1936 /* Fall through: isolated surrogates are copied as-is */
1937 s--;
1938 size++;
1941 /* Map 16-bit characters to '\uxxxx' */
1942 if (ch >= 256) {
1943 *p++ = '\\';
1944 *p++ = 'u';
1945 *p++ = hexdigit[(ch >> 12) & 0x000F];
1946 *p++ = hexdigit[(ch >> 8) & 0x000F];
1947 *p++ = hexdigit[(ch >> 4) & 0x000F];
1948 *p++ = hexdigit[ch & 0x000F];
1951 /* Map special whitespace to '\t', \n', '\r' */
1952 else if (ch == '\t') {
1953 *p++ = '\\';
1954 *p++ = 't';
1956 else if (ch == '\n') {
1957 *p++ = '\\';
1958 *p++ = 'n';
1960 else if (ch == '\r') {
1961 *p++ = '\\';
1962 *p++ = 'r';
1965 /* Map non-printable US ASCII to '\xhh' */
1966 else if (ch < ' ' || ch >= 0x7F) {
1967 *p++ = '\\';
1968 *p++ = 'x';
1969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
1973 /* Copy everything else as-is */
1974 else
1975 *p++ = (char) ch;
1977 if (quotes)
1978 *p++ = PyString_AS_STRING(repr)[1];
1980 *p = '\0';
1981 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
1982 return repr;
1985 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1986 int size)
1988 return unicodeescape_string(s, size, 0);
1991 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1993 if (!PyUnicode_Check(unicode)) {
1994 PyErr_BadArgument();
1995 return NULL;
1997 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1998 PyUnicode_GET_SIZE(unicode));
2001 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2003 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2004 int size,
2005 const char *errors)
2007 const char *starts = s;
2008 int startinpos;
2009 int endinpos;
2010 int outpos;
2011 PyUnicodeObject *v;
2012 Py_UNICODE *p;
2013 const char *end;
2014 const char *bs;
2015 PyObject *errorHandler = NULL;
2016 PyObject *exc = NULL;
2018 /* Escaped strings will always be longer than the resulting
2019 Unicode string, so we start with size here and then reduce the
2020 length after conversion to the true value. (But decoding error
2021 handler might have to resize the string) */
2022 v = _PyUnicode_New(size);
2023 if (v == NULL)
2024 goto onError;
2025 if (size == 0)
2026 return (PyObject *)v;
2027 p = PyUnicode_AS_UNICODE(v);
2028 end = s + size;
2029 while (s < end) {
2030 unsigned char c;
2031 Py_UCS4 x;
2032 int i;
2034 /* Non-escape characters are interpreted as Unicode ordinals */
2035 if (*s != '\\') {
2036 *p++ = (unsigned char)*s++;
2037 continue;
2039 startinpos = s-starts;
2041 /* \u-escapes are only interpreted iff the number of leading
2042 backslashes if odd */
2043 bs = s;
2044 for (;s < end;) {
2045 if (*s != '\\')
2046 break;
2047 *p++ = (unsigned char)*s++;
2049 if (((s - bs) & 1) == 0 ||
2050 s >= end ||
2051 *s != 'u') {
2052 continue;
2054 p--;
2055 s++;
2057 /* \uXXXX with 4 hex digits */
2058 outpos = p-PyUnicode_AS_UNICODE(v);
2059 for (x = 0, i = 0; i < 4; ++i, ++s) {
2060 c = (unsigned char)*s;
2061 if (!isxdigit(c)) {
2062 endinpos = s-starts;
2063 if (unicode_decode_call_errorhandler(
2064 errors, &errorHandler,
2065 "rawunicodeescape", "truncated \\uXXXX",
2066 starts, size, &startinpos, &endinpos, &exc, &s,
2067 (PyObject **)&v, &outpos, &p))
2068 goto onError;
2069 goto nextByte;
2071 x = (x<<4) & ~0xF;
2072 if (c >= '0' && c <= '9')
2073 x += c - '0';
2074 else if (c >= 'a' && c <= 'f')
2075 x += 10 + c - 'a';
2076 else
2077 x += 10 + c - 'A';
2079 *p++ = x;
2080 nextByte:
2083 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2084 goto onError;
2085 Py_XDECREF(errorHandler);
2086 Py_XDECREF(exc);
2087 return (PyObject *)v;
2089 onError:
2090 Py_XDECREF(v);
2091 Py_XDECREF(errorHandler);
2092 Py_XDECREF(exc);
2093 return NULL;
2096 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2097 int size)
2099 PyObject *repr;
2100 char *p;
2101 char *q;
2103 static const char *hexdigit = "0123456789abcdef";
2105 repr = PyString_FromStringAndSize(NULL, 6 * size);
2106 if (repr == NULL)
2107 return NULL;
2108 if (size == 0)
2109 return repr;
2111 p = q = PyString_AS_STRING(repr);
2112 while (size-- > 0) {
2113 Py_UNICODE ch = *s++;
2114 /* Map 16-bit characters to '\uxxxx' */
2115 if (ch >= 256) {
2116 *p++ = '\\';
2117 *p++ = 'u';
2118 *p++ = hexdigit[(ch >> 12) & 0xf];
2119 *p++ = hexdigit[(ch >> 8) & 0xf];
2120 *p++ = hexdigit[(ch >> 4) & 0xf];
2121 *p++ = hexdigit[ch & 15];
2123 /* Copy everything else as-is */
2124 else
2125 *p++ = (char) ch;
2127 *p = '\0';
2128 _PyString_Resize(&repr, p - q);
2129 return repr;
2132 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2134 if (!PyUnicode_Check(unicode)) {
2135 PyErr_BadArgument();
2136 return NULL;
2138 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2139 PyUnicode_GET_SIZE(unicode));
2142 /* --- Latin-1 Codec ------------------------------------------------------ */
2144 PyObject *PyUnicode_DecodeLatin1(const char *s,
2145 int size,
2146 const char *errors)
2148 PyUnicodeObject *v;
2149 Py_UNICODE *p;
2151 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2152 if (size == 1 && *(unsigned char*)s < 256) {
2153 Py_UNICODE r = *(unsigned char*)s;
2154 return PyUnicode_FromUnicode(&r, 1);
2157 v = _PyUnicode_New(size);
2158 if (v == NULL)
2159 goto onError;
2160 if (size == 0)
2161 return (PyObject *)v;
2162 p = PyUnicode_AS_UNICODE(v);
2163 while (size-- > 0)
2164 *p++ = (unsigned char)*s++;
2165 return (PyObject *)v;
2167 onError:
2168 Py_XDECREF(v);
2169 return NULL;
2172 /* create or adjust a UnicodeEncodeError */
2173 static void make_encode_exception(PyObject **exceptionObject,
2174 const char *encoding,
2175 const Py_UNICODE *unicode, int size,
2176 int startpos, int endpos,
2177 const char *reason)
2179 if (*exceptionObject == NULL) {
2180 *exceptionObject = PyUnicodeEncodeError_Create(
2181 encoding, unicode, size, startpos, endpos, reason);
2183 else {
2184 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2185 goto onError;
2186 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2187 goto onError;
2188 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2189 goto onError;
2190 return;
2191 onError:
2192 Py_DECREF(*exceptionObject);
2193 *exceptionObject = NULL;
2197 /* raises a UnicodeEncodeError */
2198 static void raise_encode_exception(PyObject **exceptionObject,
2199 const char *encoding,
2200 const Py_UNICODE *unicode, int size,
2201 int startpos, int endpos,
2202 const char *reason)
2204 make_encode_exception(exceptionObject,
2205 encoding, unicode, size, startpos, endpos, reason);
2206 if (*exceptionObject != NULL)
2207 PyCodec_StrictErrors(*exceptionObject);
2210 /* error handling callback helper:
2211 build arguments, call the callback and check the arguments,
2212 put the result into newpos and return the replacement string, which
2213 has to be freed by the caller */
2214 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2215 PyObject **errorHandler,
2216 const char *encoding, const char *reason,
2217 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2218 int startpos, int endpos,
2219 int *newpos)
2221 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2223 PyObject *restuple;
2224 PyObject *resunicode;
2226 if (*errorHandler == NULL) {
2227 *errorHandler = PyCodec_LookupError(errors);
2228 if (*errorHandler == NULL)
2229 return NULL;
2232 make_encode_exception(exceptionObject,
2233 encoding, unicode, size, startpos, endpos, reason);
2234 if (*exceptionObject == NULL)
2235 return NULL;
2237 restuple = PyObject_CallFunctionObjArgs(
2238 *errorHandler, *exceptionObject, NULL);
2239 if (restuple == NULL)
2240 return NULL;
2241 if (!PyTuple_Check(restuple)) {
2242 PyErr_Format(PyExc_TypeError, &argparse[4]);
2243 Py_DECREF(restuple);
2244 return NULL;
2246 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2247 &resunicode, newpos)) {
2248 Py_DECREF(restuple);
2249 return NULL;
2251 if (*newpos<0)
2252 *newpos = size+*newpos;
2253 if (*newpos<0 || *newpos>size) {
2254 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2255 Py_DECREF(restuple);
2256 return NULL;
2258 Py_INCREF(resunicode);
2259 Py_DECREF(restuple);
2260 return resunicode;
2263 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2264 int size,
2265 const char *errors,
2266 int limit)
2268 /* output object */
2269 PyObject *res;
2270 /* pointers to the beginning and end+1 of input */
2271 const Py_UNICODE *startp = p;
2272 const Py_UNICODE *endp = p + size;
2273 /* pointer to the beginning of the unencodable characters */
2274 /* const Py_UNICODE *badp = NULL; */
2275 /* pointer into the output */
2276 char *str;
2277 /* current output position */
2278 int respos = 0;
2279 int ressize;
2280 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2281 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2282 PyObject *errorHandler = NULL;
2283 PyObject *exc = NULL;
2284 /* the following variable is used for caching string comparisons
2285 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2286 int known_errorHandler = -1;
2288 /* allocate enough for a simple encoding without
2289 replacements, if we need more, we'll resize */
2290 res = PyString_FromStringAndSize(NULL, size);
2291 if (res == NULL)
2292 goto onError;
2293 if (size == 0)
2294 return res;
2295 str = PyString_AS_STRING(res);
2296 ressize = size;
2298 while (p<endp) {
2299 Py_UNICODE c = *p;
2301 /* can we encode this? */
2302 if (c<limit) {
2303 /* no overflow check, because we know that the space is enough */
2304 *str++ = (char)c;
2305 ++p;
2307 else {
2308 int unicodepos = p-startp;
2309 int requiredsize;
2310 PyObject *repunicode;
2311 int repsize;
2312 int newpos;
2313 int respos;
2314 Py_UNICODE *uni2;
2315 /* startpos for collecting unencodable chars */
2316 const Py_UNICODE *collstart = p;
2317 const Py_UNICODE *collend = p;
2318 /* find all unecodable characters */
2319 while ((collend < endp) && ((*collend)>=limit))
2320 ++collend;
2321 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2322 if (known_errorHandler==-1) {
2323 if ((errors==NULL) || (!strcmp(errors, "strict")))
2324 known_errorHandler = 1;
2325 else if (!strcmp(errors, "replace"))
2326 known_errorHandler = 2;
2327 else if (!strcmp(errors, "ignore"))
2328 known_errorHandler = 3;
2329 else if (!strcmp(errors, "xmlcharrefreplace"))
2330 known_errorHandler = 4;
2331 else
2332 known_errorHandler = 0;
2334 switch (known_errorHandler) {
2335 case 1: /* strict */
2336 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2337 goto onError;
2338 case 2: /* replace */
2339 while (collstart++<collend)
2340 *str++ = '?'; /* fall through */
2341 case 3: /* ignore */
2342 p = collend;
2343 break;
2344 case 4: /* xmlcharrefreplace */
2345 respos = str-PyString_AS_STRING(res);
2346 /* determine replacement size (temporarily (mis)uses p) */
2347 for (p = collstart, repsize = 0; p < collend; ++p) {
2348 if (*p<10)
2349 repsize += 2+1+1;
2350 else if (*p<100)
2351 repsize += 2+2+1;
2352 else if (*p<1000)
2353 repsize += 2+3+1;
2354 else if (*p<10000)
2355 repsize += 2+4+1;
2356 else if (*p<100000)
2357 repsize += 2+5+1;
2358 else if (*p<1000000)
2359 repsize += 2+6+1;
2360 else
2361 repsize += 2+7+1;
2363 requiredsize = respos+repsize+(endp-collend);
2364 if (requiredsize > ressize) {
2365 if (requiredsize<2*ressize)
2366 requiredsize = 2*ressize;
2367 if (_PyString_Resize(&res, requiredsize))
2368 goto onError;
2369 str = PyString_AS_STRING(res) + respos;
2370 ressize = requiredsize;
2372 /* generate replacement (temporarily (mis)uses p) */
2373 for (p = collstart; p < collend; ++p) {
2374 str += sprintf(str, "&#%d;", (int)*p);
2376 p = collend;
2377 break;
2378 default:
2379 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2380 encoding, reason, startp, size, &exc,
2381 collstart-startp, collend-startp, &newpos);
2382 if (repunicode == NULL)
2383 goto onError;
2384 /* need more space? (at least enough for what we
2385 have+the replacement+the rest of the string, so
2386 we won't have to check space for encodable characters) */
2387 respos = str-PyString_AS_STRING(res);
2388 repsize = PyUnicode_GET_SIZE(repunicode);
2389 requiredsize = respos+repsize+(endp-collend);
2390 if (requiredsize > ressize) {
2391 if (requiredsize<2*ressize)
2392 requiredsize = 2*ressize;
2393 if (_PyString_Resize(&res, requiredsize)) {
2394 Py_DECREF(repunicode);
2395 goto onError;
2397 str = PyString_AS_STRING(res) + respos;
2398 ressize = requiredsize;
2400 /* check if there is anything unencodable in the replacement
2401 and copy it to the output */
2402 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2403 c = *uni2;
2404 if (c >= limit) {
2405 raise_encode_exception(&exc, encoding, startp, size,
2406 unicodepos, unicodepos+1, reason);
2407 Py_DECREF(repunicode);
2408 goto onError;
2410 *str = (char)c;
2412 p = startp + newpos;
2413 Py_DECREF(repunicode);
2417 /* Resize if we allocated to much */
2418 respos = str-PyString_AS_STRING(res);
2419 if (respos<ressize)
2420 /* If this falls res will be NULL */
2421 _PyString_Resize(&res, respos);
2422 Py_XDECREF(errorHandler);
2423 Py_XDECREF(exc);
2424 return res;
2426 onError:
2427 Py_XDECREF(res);
2428 Py_XDECREF(errorHandler);
2429 Py_XDECREF(exc);
2430 return NULL;
2433 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2434 int size,
2435 const char *errors)
2437 return unicode_encode_ucs1(p, size, errors, 256);
2440 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2442 if (!PyUnicode_Check(unicode)) {
2443 PyErr_BadArgument();
2444 return NULL;
2446 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2447 PyUnicode_GET_SIZE(unicode),
2448 NULL);
2451 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2453 PyObject *PyUnicode_DecodeASCII(const char *s,
2454 int size,
2455 const char *errors)
2457 const char *starts = s;
2458 PyUnicodeObject *v;
2459 Py_UNICODE *p;
2460 int startinpos;
2461 int endinpos;
2462 int outpos;
2463 const char *e;
2464 PyObject *errorHandler = NULL;
2465 PyObject *exc = NULL;
2467 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2468 if (size == 1 && *(unsigned char*)s < 128) {
2469 Py_UNICODE r = *(unsigned char*)s;
2470 return PyUnicode_FromUnicode(&r, 1);
2473 v = _PyUnicode_New(size);
2474 if (v == NULL)
2475 goto onError;
2476 if (size == 0)
2477 return (PyObject *)v;
2478 p = PyUnicode_AS_UNICODE(v);
2479 e = s + size;
2480 while (s < e) {
2481 register unsigned char c = (unsigned char)*s;
2482 if (c < 128) {
2483 *p++ = c;
2484 ++s;
2486 else {
2487 startinpos = s-starts;
2488 endinpos = startinpos + 1;
2489 outpos = p-PyUnicode_AS_UNICODE(v);
2490 if (unicode_decode_call_errorhandler(
2491 errors, &errorHandler,
2492 "ascii", "ordinal not in range(128)",
2493 starts, size, &startinpos, &endinpos, &exc, &s,
2494 (PyObject **)&v, &outpos, &p))
2495 goto onError;
2498 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2499 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2500 goto onError;
2501 Py_XDECREF(errorHandler);
2502 Py_XDECREF(exc);
2503 return (PyObject *)v;
2505 onError:
2506 Py_XDECREF(v);
2507 Py_XDECREF(errorHandler);
2508 Py_XDECREF(exc);
2509 return NULL;
2512 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2513 int size,
2514 const char *errors)
2516 return unicode_encode_ucs1(p, size, errors, 128);
2519 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2521 if (!PyUnicode_Check(unicode)) {
2522 PyErr_BadArgument();
2523 return NULL;
2525 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2526 PyUnicode_GET_SIZE(unicode),
2527 NULL);
2530 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2532 /* --- MBCS codecs for Windows -------------------------------------------- */
2534 PyObject *PyUnicode_DecodeMBCS(const char *s,
2535 int size,
2536 const char *errors)
2538 PyUnicodeObject *v;
2539 Py_UNICODE *p;
2541 /* First get the size of the result */
2542 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2543 if (size > 0 && usize==0)
2544 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2546 v = _PyUnicode_New(usize);
2547 if (v == NULL)
2548 return NULL;
2549 if (usize == 0)
2550 return (PyObject *)v;
2551 p = PyUnicode_AS_UNICODE(v);
2552 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2553 Py_DECREF(v);
2554 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2557 return (PyObject *)v;
2560 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2561 int size,
2562 const char *errors)
2564 PyObject *repr;
2565 char *s;
2566 DWORD mbcssize;
2568 /* If there are no characters, bail now! */
2569 if (size==0)
2570 return PyString_FromString("");
2572 /* First get the size of the result */
2573 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2574 if (mbcssize==0)
2575 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2577 repr = PyString_FromStringAndSize(NULL, mbcssize);
2578 if (repr == NULL)
2579 return NULL;
2580 if (mbcssize == 0)
2581 return repr;
2583 /* Do the conversion */
2584 s = PyString_AS_STRING(repr);
2585 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2586 Py_DECREF(repr);
2587 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2589 return repr;
2592 #endif /* MS_WINDOWS */
2594 /* --- Character Mapping Codec -------------------------------------------- */
2596 PyObject *PyUnicode_DecodeCharmap(const char *s,
2597 int size,
2598 PyObject *mapping,
2599 const char *errors)
2601 const char *starts = s;
2602 int startinpos;
2603 int endinpos;
2604 int outpos;
2605 const char *e;
2606 PyUnicodeObject *v;
2607 Py_UNICODE *p;
2608 int extrachars = 0;
2609 PyObject *errorHandler = NULL;
2610 PyObject *exc = NULL;
2612 /* Default to Latin-1 */
2613 if (mapping == NULL)
2614 return PyUnicode_DecodeLatin1(s, size, errors);
2616 v = _PyUnicode_New(size);
2617 if (v == NULL)
2618 goto onError;
2619 if (size == 0)
2620 return (PyObject *)v;
2621 p = PyUnicode_AS_UNICODE(v);
2622 e = s + size;
2623 while (s < e) {
2624 unsigned char ch = *s;
2625 PyObject *w, *x;
2627 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2628 w = PyInt_FromLong((long)ch);
2629 if (w == NULL)
2630 goto onError;
2631 x = PyObject_GetItem(mapping, w);
2632 Py_DECREF(w);
2633 if (x == NULL) {
2634 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2635 /* No mapping found means: mapping is undefined. */
2636 PyErr_Clear();
2637 x = Py_None;
2638 Py_INCREF(x);
2639 } else
2640 goto onError;
2643 /* Apply mapping */
2644 if (PyInt_Check(x)) {
2645 long value = PyInt_AS_LONG(x);
2646 if (value < 0 || value > 65535) {
2647 PyErr_SetString(PyExc_TypeError,
2648 "character mapping must be in range(65536)");
2649 Py_DECREF(x);
2650 goto onError;
2652 *p++ = (Py_UNICODE)value;
2654 else if (x == Py_None) {
2655 /* undefined mapping */
2656 outpos = p-PyUnicode_AS_UNICODE(v);
2657 startinpos = s-starts;
2658 endinpos = startinpos+1;
2659 if (unicode_decode_call_errorhandler(
2660 errors, &errorHandler,
2661 "charmap", "character maps to <undefined>",
2662 starts, size, &startinpos, &endinpos, &exc, &s,
2663 (PyObject **)&v, &outpos, &p)) {
2664 Py_DECREF(x);
2665 goto onError;
2667 continue;
2669 else if (PyUnicode_Check(x)) {
2670 int targetsize = PyUnicode_GET_SIZE(x);
2672 if (targetsize == 1)
2673 /* 1-1 mapping */
2674 *p++ = *PyUnicode_AS_UNICODE(x);
2676 else if (targetsize > 1) {
2677 /* 1-n mapping */
2678 if (targetsize > extrachars) {
2679 /* resize first */
2680 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2681 int needed = (targetsize - extrachars) + \
2682 (targetsize << 2);
2683 extrachars += needed;
2684 if (_PyUnicode_Resize(&v,
2685 PyUnicode_GET_SIZE(v) + needed)) {
2686 Py_DECREF(x);
2687 goto onError;
2689 p = PyUnicode_AS_UNICODE(v) + oldpos;
2691 Py_UNICODE_COPY(p,
2692 PyUnicode_AS_UNICODE(x),
2693 targetsize);
2694 p += targetsize;
2695 extrachars -= targetsize;
2697 /* 1-0 mapping: skip the character */
2699 else {
2700 /* wrong return value */
2701 PyErr_SetString(PyExc_TypeError,
2702 "character mapping must return integer, None or unicode");
2703 Py_DECREF(x);
2704 goto onError;
2706 Py_DECREF(x);
2707 ++s;
2709 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2710 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2711 goto onError;
2712 Py_XDECREF(errorHandler);
2713 Py_XDECREF(exc);
2714 return (PyObject *)v;
2716 onError:
2717 Py_XDECREF(errorHandler);
2718 Py_XDECREF(exc);
2719 Py_XDECREF(v);
2720 return NULL;
2723 /* Lookup the character ch in the mapping. If the character
2724 can't be found, Py_None is returned (or NULL, if another
2725 error occured). */
2726 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2728 PyObject *w = PyInt_FromLong((long)c);
2729 PyObject *x;
2731 if (w == NULL)
2732 return NULL;
2733 x = PyObject_GetItem(mapping, w);
2734 Py_DECREF(w);
2735 if (x == NULL) {
2736 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2737 /* No mapping found means: mapping is undefined. */
2738 PyErr_Clear();
2739 x = Py_None;
2740 Py_INCREF(x);
2741 return x;
2742 } else
2743 return NULL;
2745 else if (x == Py_None)
2746 return x;
2747 else if (PyInt_Check(x)) {
2748 long value = PyInt_AS_LONG(x);
2749 if (value < 0 || value > 255) {
2750 PyErr_SetString(PyExc_TypeError,
2751 "character mapping must be in range(256)");
2752 Py_DECREF(x);
2753 return NULL;
2755 return x;
2757 else if (PyString_Check(x))
2758 return x;
2759 else {
2760 /* wrong return value */
2761 PyErr_SetString(PyExc_TypeError,
2762 "character mapping must return integer, None or str");
2763 Py_DECREF(x);
2764 return NULL;
2768 /* lookup the character, put the result in the output string and adjust
2769 various state variables. Reallocate the output string if not enough
2770 space is available. Return a new reference to the object that
2771 was put in the output buffer, or Py_None, if the mapping was undefined
2772 (in which case no character was written) or NULL, if a
2773 reallocation error ocurred. The called must decref the result */
2774 static
2775 PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2776 PyObject **outobj, int *outpos)
2778 PyObject *rep = charmapencode_lookup(c, mapping);
2780 if (rep==NULL)
2781 return NULL;
2782 else if (rep==Py_None)
2783 return rep;
2784 else {
2785 char *outstart = PyString_AS_STRING(*outobj);
2786 int outsize = PyString_GET_SIZE(*outobj);
2787 if (PyInt_Check(rep)) {
2788 int requiredsize = *outpos+1;
2789 if (outsize<requiredsize) {
2790 /* exponentially overallocate to minimize reallocations */
2791 if (requiredsize < 2*outsize)
2792 requiredsize = 2*outsize;
2793 if (_PyString_Resize(outobj, requiredsize)) {
2794 Py_DECREF(rep);
2795 return NULL;
2797 outstart = PyString_AS_STRING(*outobj);
2799 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2801 else {
2802 const char *repchars = PyString_AS_STRING(rep);
2803 int repsize = PyString_GET_SIZE(rep);
2804 int requiredsize = *outpos+repsize;
2805 if (outsize<requiredsize) {
2806 /* exponentially overallocate to minimize reallocations */
2807 if (requiredsize < 2*outsize)
2808 requiredsize = 2*outsize;
2809 if (_PyString_Resize(outobj, requiredsize)) {
2810 Py_DECREF(rep);
2811 return NULL;
2813 outstart = PyString_AS_STRING(*outobj);
2815 memcpy(outstart + *outpos, repchars, repsize);
2816 *outpos += repsize;
2819 return rep;
2822 /* handle an error in PyUnicode_EncodeCharmap
2823 Return 0 on success, -1 on error */
2824 static
2825 int charmap_encoding_error(
2826 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2827 PyObject **exceptionObject,
2828 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2829 PyObject **res, int *respos)
2831 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2832 int repsize;
2833 int newpos;
2834 Py_UNICODE *uni2;
2835 /* startpos for collecting unencodable chars */
2836 int collstartpos = *inpos;
2837 int collendpos = *inpos+1;
2838 int collpos;
2839 char *encoding = "charmap";
2840 char *reason = "character maps to <undefined>";
2842 PyObject *x;
2843 /* find all unencodable characters */
2844 while (collendpos < size) {
2845 x = charmapencode_lookup(p[collendpos], mapping);
2846 if (x==NULL)
2847 return -1;
2848 else if (x!=Py_None) {
2849 Py_DECREF(x);
2850 break;
2852 Py_DECREF(x);
2853 ++collendpos;
2855 /* cache callback name lookup
2856 * (if not done yet, i.e. it's the first error) */
2857 if (*known_errorHandler==-1) {
2858 if ((errors==NULL) || (!strcmp(errors, "strict")))
2859 *known_errorHandler = 1;
2860 else if (!strcmp(errors, "replace"))
2861 *known_errorHandler = 2;
2862 else if (!strcmp(errors, "ignore"))
2863 *known_errorHandler = 3;
2864 else if (!strcmp(errors, "xmlcharrefreplace"))
2865 *known_errorHandler = 4;
2866 else
2867 *known_errorHandler = 0;
2869 switch (*known_errorHandler) {
2870 case 1: /* strict */
2871 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2872 return -1;
2873 case 2: /* replace */
2874 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2875 x = charmapencode_output('?', mapping, res, respos);
2876 if (x==NULL) {
2877 return -1;
2879 else if (x==Py_None) {
2880 Py_DECREF(x);
2881 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2882 return -1;
2884 Py_DECREF(x);
2886 /* fall through */
2887 case 3: /* ignore */
2888 *inpos = collendpos;
2889 break;
2890 case 4: /* xmlcharrefreplace */
2891 /* generate replacement (temporarily (mis)uses p) */
2892 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2893 char buffer[2+29+1+1];
2894 char *cp;
2895 sprintf(buffer, "&#%d;", (int)p[collpos]);
2896 for (cp = buffer; *cp; ++cp) {
2897 x = charmapencode_output(*cp, mapping, res, respos);
2898 if (x==NULL)
2899 return -1;
2900 else if (x==Py_None) {
2901 Py_DECREF(x);
2902 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2903 return -1;
2905 Py_DECREF(x);
2908 *inpos = collendpos;
2909 break;
2910 default:
2911 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2912 encoding, reason, p, size, exceptionObject,
2913 collstartpos, collendpos, &newpos);
2914 if (repunicode == NULL)
2915 return -1;
2916 /* generate replacement */
2917 repsize = PyUnicode_GET_SIZE(repunicode);
2918 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2919 x = charmapencode_output(*uni2, mapping, res, respos);
2920 if (x==NULL) {
2921 Py_DECREF(repunicode);
2922 return -1;
2924 else if (x==Py_None) {
2925 Py_DECREF(repunicode);
2926 Py_DECREF(x);
2927 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2928 return -1;
2930 Py_DECREF(x);
2932 *inpos = newpos;
2933 Py_DECREF(repunicode);
2935 return 0;
2938 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2939 int size,
2940 PyObject *mapping,
2941 const char *errors)
2943 /* output object */
2944 PyObject *res = NULL;
2945 /* current input position */
2946 int inpos = 0;
2947 /* current output position */
2948 int respos = 0;
2949 PyObject *errorHandler = NULL;
2950 PyObject *exc = NULL;
2951 /* the following variable is used for caching string comparisons
2952 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2953 * 3=ignore, 4=xmlcharrefreplace */
2954 int known_errorHandler = -1;
2956 /* Default to Latin-1 */
2957 if (mapping == NULL)
2958 return PyUnicode_EncodeLatin1(p, size, errors);
2960 /* allocate enough for a simple encoding without
2961 replacements, if we need more, we'll resize */
2962 res = PyString_FromStringAndSize(NULL, size);
2963 if (res == NULL)
2964 goto onError;
2965 if (size == 0)
2966 return res;
2968 while (inpos<size) {
2969 /* try to encode it */
2970 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2971 if (x==NULL) /* error */
2972 goto onError;
2973 if (x==Py_None) { /* unencodable character */
2974 if (charmap_encoding_error(p, size, &inpos, mapping,
2975 &exc,
2976 &known_errorHandler, errorHandler, errors,
2977 &res, &respos))
2978 goto onError;
2980 else
2981 /* done with this character => adjust input position */
2982 ++inpos;
2983 Py_DECREF(x);
2986 /* Resize if we allocated to much */
2987 if (respos<PyString_GET_SIZE(res)) {
2988 if (_PyString_Resize(&res, respos))
2989 goto onError;
2991 Py_XDECREF(exc);
2992 Py_XDECREF(errorHandler);
2993 return res;
2995 onError:
2996 Py_XDECREF(res);
2997 Py_XDECREF(exc);
2998 Py_XDECREF(errorHandler);
2999 return NULL;
3002 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3003 PyObject *mapping)
3005 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3006 PyErr_BadArgument();
3007 return NULL;
3009 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3010 PyUnicode_GET_SIZE(unicode),
3011 mapping,
3012 NULL);
3015 /* create or adjust a UnicodeTranslateError */
3016 static void make_translate_exception(PyObject **exceptionObject,
3017 const Py_UNICODE *unicode, int size,
3018 int startpos, int endpos,
3019 const char *reason)
3021 if (*exceptionObject == NULL) {
3022 *exceptionObject = PyUnicodeTranslateError_Create(
3023 unicode, size, startpos, endpos, reason);
3025 else {
3026 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3027 goto onError;
3028 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3029 goto onError;
3030 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3031 goto onError;
3032 return;
3033 onError:
3034 Py_DECREF(*exceptionObject);
3035 *exceptionObject = NULL;
3039 /* raises a UnicodeTranslateError */
3040 static void raise_translate_exception(PyObject **exceptionObject,
3041 const Py_UNICODE *unicode, int size,
3042 int startpos, int endpos,
3043 const char *reason)
3045 make_translate_exception(exceptionObject,
3046 unicode, size, startpos, endpos, reason);
3047 if (*exceptionObject != NULL)
3048 PyCodec_StrictErrors(*exceptionObject);
3051 /* error handling callback helper:
3052 build arguments, call the callback and check the arguments,
3053 put the result into newpos and return the replacement string, which
3054 has to be freed by the caller */
3055 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3056 PyObject **errorHandler,
3057 const char *reason,
3058 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3059 int startpos, int endpos,
3060 int *newpos)
3062 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3064 PyObject *restuple;
3065 PyObject *resunicode;
3067 if (*errorHandler == NULL) {
3068 *errorHandler = PyCodec_LookupError(errors);
3069 if (*errorHandler == NULL)
3070 return NULL;
3073 make_translate_exception(exceptionObject,
3074 unicode, size, startpos, endpos, reason);
3075 if (*exceptionObject == NULL)
3076 return NULL;
3078 restuple = PyObject_CallFunctionObjArgs(
3079 *errorHandler, *exceptionObject, NULL);
3080 if (restuple == NULL)
3081 return NULL;
3082 if (!PyTuple_Check(restuple)) {
3083 PyErr_Format(PyExc_TypeError, &argparse[4]);
3084 Py_DECREF(restuple);
3085 return NULL;
3087 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3088 &resunicode, newpos)) {
3089 Py_DECREF(restuple);
3090 return NULL;
3092 if (*newpos<0)
3093 *newpos = size+*newpos;
3094 if (*newpos<0 || *newpos>size) {
3095 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3096 Py_DECREF(restuple);
3097 return NULL;
3099 Py_INCREF(resunicode);
3100 Py_DECREF(restuple);
3101 return resunicode;
3104 /* Lookup the character ch in the mapping and put the result in result,
3105 which must be decrefed by the caller.
3106 Return 0 on success, -1 on error */
3107 static
3108 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3110 PyObject *w = PyInt_FromLong((long)c);
3111 PyObject *x;
3113 if (w == NULL)
3114 return -1;
3115 x = PyObject_GetItem(mapping, w);
3116 Py_DECREF(w);
3117 if (x == NULL) {
3118 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3119 /* No mapping found means: use 1:1 mapping. */
3120 PyErr_Clear();
3121 *result = NULL;
3122 return 0;
3123 } else
3124 return -1;
3126 else if (x == Py_None) {
3127 *result = x;
3128 return 0;
3130 else if (PyInt_Check(x)) {
3131 long value = PyInt_AS_LONG(x);
3132 long max = PyUnicode_GetMax();
3133 if (value < 0 || value > max) {
3134 PyErr_Format(PyExc_TypeError,
3135 "character mapping must be in range(0x%lx)", max+1);
3136 Py_DECREF(x);
3137 return -1;
3139 *result = x;
3140 return 0;
3142 else if (PyUnicode_Check(x)) {
3143 *result = x;
3144 return 0;
3146 else {
3147 /* wrong return value */
3148 PyErr_SetString(PyExc_TypeError,
3149 "character mapping must return integer, None or unicode");
3150 return -1;
3153 /* ensure that *outobj is at least requiredsize characters long,
3154 if not reallocate and adjust various state variables.
3155 Return 0 on success, -1 on error */
3156 static
3157 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3158 int requiredsize)
3160 if (requiredsize > *outsize) {
3161 /* remember old output position */
3162 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3163 /* exponentially overallocate to minimize reallocations */
3164 if (requiredsize < 2 * *outsize)
3165 requiredsize = 2 * *outsize;
3166 if (_PyUnicode_Resize(outobj, requiredsize))
3167 return -1;
3168 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3169 *outsize = requiredsize;
3171 return 0;
3173 /* lookup the character, put the result in the output string and adjust
3174 various state variables. Return a new reference to the object that
3175 was put in the output buffer in *result, or Py_None, if the mapping was
3176 undefined (in which case no character was written).
3177 The called must decref result.
3178 Return 0 on success, -1 on error. */
3179 static
3180 int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3181 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3183 if (charmaptranslate_lookup(c, mapping, res))
3184 return -1;
3185 if (*res==NULL) {
3186 /* not found => default to 1:1 mapping */
3187 *(*outp)++ = (Py_UNICODE)c;
3189 else if (*res==Py_None)
3191 else if (PyInt_Check(*res)) {
3192 /* no overflow check, because we know that the space is enough */
3193 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3195 else if (PyUnicode_Check(*res)) {
3196 int repsize = PyUnicode_GET_SIZE(*res);
3197 if (repsize==1) {
3198 /* no overflow check, because we know that the space is enough */
3199 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3201 else if (repsize!=0) {
3202 /* more than one character */
3203 int requiredsize = *outsize + repsize - 1;
3204 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3205 return -1;
3206 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3207 *outp += repsize;
3210 else
3211 return -1;
3212 return 0;
3215 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3216 int size,
3217 PyObject *mapping,
3218 const char *errors)
3220 /* output object */
3221 PyObject *res = NULL;
3222 /* pointers to the beginning and end+1 of input */
3223 const Py_UNICODE *startp = p;
3224 const Py_UNICODE *endp = p + size;
3225 /* pointer into the output */
3226 Py_UNICODE *str;
3227 /* current output position */
3228 int respos = 0;
3229 int ressize;
3230 char *reason = "character maps to <undefined>";
3231 PyObject *errorHandler = NULL;
3232 PyObject *exc = NULL;
3233 /* the following variable is used for caching string comparisons
3234 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3235 * 3=ignore, 4=xmlcharrefreplace */
3236 int known_errorHandler = -1;
3238 if (mapping == NULL) {
3239 PyErr_BadArgument();
3240 return NULL;
3243 /* allocate enough for a simple 1:1 translation without
3244 replacements, if we need more, we'll resize */
3245 res = PyUnicode_FromUnicode(NULL, size);
3246 if (res == NULL)
3247 goto onError;
3248 if (size == 0)
3249 return res;
3250 str = PyUnicode_AS_UNICODE(res);
3251 ressize = size;
3253 while (p<endp) {
3254 /* try to encode it */
3255 PyObject *x = NULL;
3256 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3257 Py_XDECREF(x);
3258 goto onError;
3260 Py_XDECREF(x);
3261 if (x!=Py_None) /* it worked => adjust input pointer */
3262 ++p;
3263 else { /* untranslatable character */
3264 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3265 int repsize;
3266 int newpos;
3267 Py_UNICODE *uni2;
3268 /* startpos for collecting untranslatable chars */
3269 const Py_UNICODE *collstart = p;
3270 const Py_UNICODE *collend = p+1;
3271 const Py_UNICODE *coll;
3273 /* find all untranslatable characters */
3274 while (collend < endp) {
3275 if (charmaptranslate_lookup(*collend, mapping, &x))
3276 goto onError;
3277 Py_XDECREF(x);
3278 if (x!=Py_None)
3279 break;
3280 ++collend;
3282 /* cache callback name lookup
3283 * (if not done yet, i.e. it's the first error) */
3284 if (known_errorHandler==-1) {
3285 if ((errors==NULL) || (!strcmp(errors, "strict")))
3286 known_errorHandler = 1;
3287 else if (!strcmp(errors, "replace"))
3288 known_errorHandler = 2;
3289 else if (!strcmp(errors, "ignore"))
3290 known_errorHandler = 3;
3291 else if (!strcmp(errors, "xmlcharrefreplace"))
3292 known_errorHandler = 4;
3293 else
3294 known_errorHandler = 0;
3296 switch (known_errorHandler) {
3297 case 1: /* strict */
3298 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3299 goto onError;
3300 case 2: /* replace */
3301 /* No need to check for space, this is a 1:1 replacement */
3302 for (coll = collstart; coll<collend; ++coll)
3303 *str++ = '?';
3304 /* fall through */
3305 case 3: /* ignore */
3306 p = collend;
3307 break;
3308 case 4: /* xmlcharrefreplace */
3309 /* generate replacement (temporarily (mis)uses p) */
3310 for (p = collstart; p < collend; ++p) {
3311 char buffer[2+29+1+1];
3312 char *cp;
3313 sprintf(buffer, "&#%d;", (int)*p);
3314 if (charmaptranslate_makespace(&res, &str, &ressize,
3315 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3316 goto onError;
3317 for (cp = buffer; *cp; ++cp)
3318 *str++ = *cp;
3320 p = collend;
3321 break;
3322 default:
3323 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3324 reason, startp, size, &exc,
3325 collstart-startp, collend-startp, &newpos);
3326 if (repunicode == NULL)
3327 goto onError;
3328 /* generate replacement */
3329 repsize = PyUnicode_GET_SIZE(repunicode);
3330 if (charmaptranslate_makespace(&res, &str, &ressize,
3331 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3332 Py_DECREF(repunicode);
3333 goto onError;
3335 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3336 *str++ = *uni2;
3337 p = startp + newpos;
3338 Py_DECREF(repunicode);
3342 /* Resize if we allocated to much */
3343 respos = str-PyUnicode_AS_UNICODE(res);
3344 if (respos<ressize) {
3345 if (_PyUnicode_Resize(&res, respos))
3346 goto onError;
3348 Py_XDECREF(exc);
3349 Py_XDECREF(errorHandler);
3350 return res;
3352 onError:
3353 Py_XDECREF(res);
3354 Py_XDECREF(exc);
3355 Py_XDECREF(errorHandler);
3356 return NULL;
3359 PyObject *PyUnicode_Translate(PyObject *str,
3360 PyObject *mapping,
3361 const char *errors)
3363 PyObject *result;
3365 str = PyUnicode_FromObject(str);
3366 if (str == NULL)
3367 goto onError;
3368 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3369 PyUnicode_GET_SIZE(str),
3370 mapping,
3371 errors);
3372 Py_DECREF(str);
3373 return result;
3375 onError:
3376 Py_XDECREF(str);
3377 return NULL;
3380 /* --- Decimal Encoder ---------------------------------------------------- */
3382 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3383 int length,
3384 char *output,
3385 const char *errors)
3387 Py_UNICODE *p, *end;
3388 PyObject *errorHandler = NULL;
3389 PyObject *exc = NULL;
3390 const char *encoding = "decimal";
3391 const char *reason = "invalid decimal Unicode string";
3392 /* the following variable is used for caching string comparisons
3393 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3394 int known_errorHandler = -1;
3396 if (output == NULL) {
3397 PyErr_BadArgument();
3398 return -1;
3401 p = s;
3402 end = s + length;
3403 while (p < end) {
3404 register Py_UNICODE ch = *p;
3405 int decimal;
3406 PyObject *repunicode;
3407 int repsize;
3408 int newpos;
3409 Py_UNICODE *uni2;
3410 Py_UNICODE *collstart;
3411 Py_UNICODE *collend;
3413 if (Py_UNICODE_ISSPACE(ch)) {
3414 *output++ = ' ';
3415 ++p;
3416 continue;
3418 decimal = Py_UNICODE_TODECIMAL(ch);
3419 if (decimal >= 0) {
3420 *output++ = '0' + decimal;
3421 ++p;
3422 continue;
3424 if (0 < ch && ch < 256) {
3425 *output++ = (char)ch;
3426 ++p;
3427 continue;
3429 /* All other characters are considered unencodable */
3430 collstart = p;
3431 collend = p+1;
3432 while (collend < end) {
3433 if ((0 < *collend && *collend < 256) ||
3434 !Py_UNICODE_ISSPACE(*collend) ||
3435 Py_UNICODE_TODECIMAL(*collend))
3436 break;
3438 /* cache callback name lookup
3439 * (if not done yet, i.e. it's the first error) */
3440 if (known_errorHandler==-1) {
3441 if ((errors==NULL) || (!strcmp(errors, "strict")))
3442 known_errorHandler = 1;
3443 else if (!strcmp(errors, "replace"))
3444 known_errorHandler = 2;
3445 else if (!strcmp(errors, "ignore"))
3446 known_errorHandler = 3;
3447 else if (!strcmp(errors, "xmlcharrefreplace"))
3448 known_errorHandler = 4;
3449 else
3450 known_errorHandler = 0;
3452 switch (known_errorHandler) {
3453 case 1: /* strict */
3454 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3455 goto onError;
3456 case 2: /* replace */
3457 for (p = collstart; p < collend; ++p)
3458 *output++ = '?';
3459 /* fall through */
3460 case 3: /* ignore */
3461 p = collend;
3462 break;
3463 case 4: /* xmlcharrefreplace */
3464 /* generate replacement (temporarily (mis)uses p) */
3465 for (p = collstart; p < collend; ++p)
3466 output += sprintf(output, "&#%d;", (int)*p);
3467 p = collend;
3468 break;
3469 default:
3470 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3471 encoding, reason, s, length, &exc,
3472 collstart-s, collend-s, &newpos);
3473 if (repunicode == NULL)
3474 goto onError;
3475 /* generate replacement */
3476 repsize = PyUnicode_GET_SIZE(repunicode);
3477 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3478 Py_UNICODE ch = *uni2;
3479 if (Py_UNICODE_ISSPACE(ch))
3480 *output++ = ' ';
3481 else {
3482 decimal = Py_UNICODE_TODECIMAL(ch);
3483 if (decimal >= 0)
3484 *output++ = '0' + decimal;
3485 else if (0 < ch && ch < 256)
3486 *output++ = (char)ch;
3487 else {
3488 Py_DECREF(repunicode);
3489 raise_encode_exception(&exc, encoding,
3490 s, length, collstart-s, collend-s, reason);
3491 goto onError;
3495 p = s + newpos;
3496 Py_DECREF(repunicode);
3499 /* 0-terminate the output string */
3500 *output++ = '\0';
3501 Py_XDECREF(exc);
3502 Py_XDECREF(errorHandler);
3503 return 0;
3505 onError:
3506 Py_XDECREF(exc);
3507 Py_XDECREF(errorHandler);
3508 return -1;
3511 /* --- Helpers ------------------------------------------------------------ */
3513 static
3514 int count(PyUnicodeObject *self,
3515 int start,
3516 int end,
3517 PyUnicodeObject *substring)
3519 int count = 0;
3521 if (start < 0)
3522 start += self->length;
3523 if (start < 0)
3524 start = 0;
3525 if (end > self->length)
3526 end = self->length;
3527 if (end < 0)
3528 end += self->length;
3529 if (end < 0)
3530 end = 0;
3532 if (substring->length == 0)
3533 return (end - start + 1);
3535 end -= substring->length;
3537 while (start <= end)
3538 if (Py_UNICODE_MATCH(self, start, substring)) {
3539 count++;
3540 start += substring->length;
3541 } else
3542 start++;
3544 return count;
3547 int PyUnicode_Count(PyObject *str,
3548 PyObject *substr,
3549 int start,
3550 int end)
3552 int result;
3554 str = PyUnicode_FromObject(str);
3555 if (str == NULL)
3556 return -1;
3557 substr = PyUnicode_FromObject(substr);
3558 if (substr == NULL) {
3559 Py_DECREF(str);
3560 return -1;
3563 result = count((PyUnicodeObject *)str,
3564 start, end,
3565 (PyUnicodeObject *)substr);
3567 Py_DECREF(str);
3568 Py_DECREF(substr);
3569 return result;
3572 static
3573 int findstring(PyUnicodeObject *self,
3574 PyUnicodeObject *substring,
3575 int start,
3576 int end,
3577 int direction)
3579 if (start < 0)
3580 start += self->length;
3581 if (start < 0)
3582 start = 0;
3584 if (end > self->length)
3585 end = self->length;
3586 if (end < 0)
3587 end += self->length;
3588 if (end < 0)
3589 end = 0;
3591 if (substring->length == 0)
3592 return (direction > 0) ? start : end;
3594 end -= substring->length;
3596 if (direction < 0) {
3597 for (; end >= start; end--)
3598 if (Py_UNICODE_MATCH(self, end, substring))
3599 return end;
3600 } else {
3601 for (; start <= end; start++)
3602 if (Py_UNICODE_MATCH(self, start, substring))
3603 return start;
3606 return -1;
3609 int PyUnicode_Find(PyObject *str,
3610 PyObject *substr,
3611 int start,
3612 int end,
3613 int direction)
3615 int result;
3617 str = PyUnicode_FromObject(str);
3618 if (str == NULL)
3619 return -2;
3620 substr = PyUnicode_FromObject(substr);
3621 if (substr == NULL) {
3622 Py_DECREF(str);
3623 return -2;
3626 result = findstring((PyUnicodeObject *)str,
3627 (PyUnicodeObject *)substr,
3628 start, end, direction);
3629 Py_DECREF(str);
3630 Py_DECREF(substr);
3631 return result;
3634 static
3635 int tailmatch(PyUnicodeObject *self,
3636 PyUnicodeObject *substring,
3637 int start,
3638 int end,
3639 int direction)
3641 if (start < 0)
3642 start += self->length;
3643 if (start < 0)
3644 start = 0;
3646 if (substring->length == 0)
3647 return 1;
3649 if (end > self->length)
3650 end = self->length;
3651 if (end < 0)
3652 end += self->length;
3653 if (end < 0)
3654 end = 0;
3656 end -= substring->length;
3657 if (end < start)
3658 return 0;
3660 if (direction > 0) {
3661 if (Py_UNICODE_MATCH(self, end, substring))
3662 return 1;
3663 } else {
3664 if (Py_UNICODE_MATCH(self, start, substring))
3665 return 1;
3668 return 0;
3671 int PyUnicode_Tailmatch(PyObject *str,
3672 PyObject *substr,
3673 int start,
3674 int end,
3675 int direction)
3677 int result;
3679 str = PyUnicode_FromObject(str);
3680 if (str == NULL)
3681 return -1;
3682 substr = PyUnicode_FromObject(substr);
3683 if (substr == NULL) {
3684 Py_DECREF(substr);
3685 return -1;
3688 result = tailmatch((PyUnicodeObject *)str,
3689 (PyUnicodeObject *)substr,
3690 start, end, direction);
3691 Py_DECREF(str);
3692 Py_DECREF(substr);
3693 return result;
3696 static
3697 const Py_UNICODE *findchar(const Py_UNICODE *s,
3698 int size,
3699 Py_UNICODE ch)
3701 /* like wcschr, but doesn't stop at NULL characters */
3703 while (size-- > 0) {
3704 if (*s == ch)
3705 return s;
3706 s++;
3709 return NULL;
3712 /* Apply fixfct filter to the Unicode object self and return a
3713 reference to the modified object */
3715 static
3716 PyObject *fixup(PyUnicodeObject *self,
3717 int (*fixfct)(PyUnicodeObject *s))
3720 PyUnicodeObject *u;
3722 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3723 if (u == NULL)
3724 return NULL;
3726 Py_UNICODE_COPY(u->str, self->str, self->length);
3728 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3729 /* fixfct should return TRUE if it modified the buffer. If
3730 FALSE, return a reference to the original buffer instead
3731 (to save space, not time) */
3732 Py_INCREF(self);
3733 Py_DECREF(u);
3734 return (PyObject*) self;
3736 return (PyObject*) u;
3739 static
3740 int fixupper(PyUnicodeObject *self)
3742 int len = self->length;
3743 Py_UNICODE *s = self->str;
3744 int status = 0;
3746 while (len-- > 0) {
3747 register Py_UNICODE ch;
3749 ch = Py_UNICODE_TOUPPER(*s);
3750 if (ch != *s) {
3751 status = 1;
3752 *s = ch;
3754 s++;
3757 return status;
3760 static
3761 int fixlower(PyUnicodeObject *self)
3763 int len = self->length;
3764 Py_UNICODE *s = self->str;
3765 int status = 0;
3767 while (len-- > 0) {
3768 register Py_UNICODE ch;
3770 ch = Py_UNICODE_TOLOWER(*s);
3771 if (ch != *s) {
3772 status = 1;
3773 *s = ch;
3775 s++;
3778 return status;
3781 static
3782 int fixswapcase(PyUnicodeObject *self)
3784 int len = self->length;
3785 Py_UNICODE *s = self->str;
3786 int status = 0;
3788 while (len-- > 0) {
3789 if (Py_UNICODE_ISUPPER(*s)) {
3790 *s = Py_UNICODE_TOLOWER(*s);
3791 status = 1;
3792 } else if (Py_UNICODE_ISLOWER(*s)) {
3793 *s = Py_UNICODE_TOUPPER(*s);
3794 status = 1;
3796 s++;
3799 return status;
3802 static
3803 int fixcapitalize(PyUnicodeObject *self)
3805 int len = self->length;
3806 Py_UNICODE *s = self->str;
3807 int status = 0;
3809 if (len == 0)
3810 return 0;
3811 if (Py_UNICODE_ISLOWER(*s)) {
3812 *s = Py_UNICODE_TOUPPER(*s);
3813 status = 1;
3815 s++;
3816 while (--len > 0) {
3817 if (Py_UNICODE_ISUPPER(*s)) {
3818 *s = Py_UNICODE_TOLOWER(*s);
3819 status = 1;
3821 s++;
3823 return status;
3826 static
3827 int fixtitle(PyUnicodeObject *self)
3829 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3830 register Py_UNICODE *e;
3831 int previous_is_cased;
3833 /* Shortcut for single character strings */
3834 if (PyUnicode_GET_SIZE(self) == 1) {
3835 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3836 if (*p != ch) {
3837 *p = ch;
3838 return 1;
3840 else
3841 return 0;
3844 e = p + PyUnicode_GET_SIZE(self);
3845 previous_is_cased = 0;
3846 for (; p < e; p++) {
3847 register const Py_UNICODE ch = *p;
3849 if (previous_is_cased)
3850 *p = Py_UNICODE_TOLOWER(ch);
3851 else
3852 *p = Py_UNICODE_TOTITLE(ch);
3854 if (Py_UNICODE_ISLOWER(ch) ||
3855 Py_UNICODE_ISUPPER(ch) ||
3856 Py_UNICODE_ISTITLE(ch))
3857 previous_is_cased = 1;
3858 else
3859 previous_is_cased = 0;
3861 return 1;
3864 PyObject *PyUnicode_Join(PyObject *separator,
3865 PyObject *seq)
3867 Py_UNICODE *sep;
3868 int seplen;
3869 PyUnicodeObject *res = NULL;
3870 int reslen = 0;
3871 Py_UNICODE *p;
3872 int sz = 100;
3873 int i;
3874 PyObject *it;
3876 it = PyObject_GetIter(seq);
3877 if (it == NULL)
3878 return NULL;
3880 if (separator == NULL) {
3881 Py_UNICODE blank = ' ';
3882 sep = &blank;
3883 seplen = 1;
3885 else {
3886 separator = PyUnicode_FromObject(separator);
3887 if (separator == NULL)
3888 goto onError;
3889 sep = PyUnicode_AS_UNICODE(separator);
3890 seplen = PyUnicode_GET_SIZE(separator);
3893 res = _PyUnicode_New(sz);
3894 if (res == NULL)
3895 goto onError;
3896 p = PyUnicode_AS_UNICODE(res);
3897 reslen = 0;
3899 for (i = 0; ; ++i) {
3900 int itemlen;
3901 PyObject *item = PyIter_Next(it);
3902 if (item == NULL) {
3903 if (PyErr_Occurred())
3904 goto onError;
3905 break;
3907 if (!PyUnicode_Check(item)) {
3908 PyObject *v;
3909 if (!PyString_Check(item)) {
3910 PyErr_Format(PyExc_TypeError,
3911 "sequence item %i: expected string or Unicode,"
3912 " %.80s found",
3913 i, item->ob_type->tp_name);
3914 Py_DECREF(item);
3915 goto onError;
3917 v = PyUnicode_FromObject(item);
3918 Py_DECREF(item);
3919 item = v;
3920 if (item == NULL)
3921 goto onError;
3923 itemlen = PyUnicode_GET_SIZE(item);
3924 while (reslen + itemlen + seplen >= sz) {
3925 if (_PyUnicode_Resize(&res, sz*2)) {
3926 Py_DECREF(item);
3927 goto onError;
3929 sz *= 2;
3930 p = PyUnicode_AS_UNICODE(res) + reslen;
3932 if (i > 0) {
3933 Py_UNICODE_COPY(p, sep, seplen);
3934 p += seplen;
3935 reslen += seplen;
3937 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3938 p += itemlen;
3939 reslen += itemlen;
3940 Py_DECREF(item);
3942 if (_PyUnicode_Resize(&res, reslen))
3943 goto onError;
3945 Py_XDECREF(separator);
3946 Py_DECREF(it);
3947 return (PyObject *)res;
3949 onError:
3950 Py_XDECREF(separator);
3951 Py_XDECREF(res);
3952 Py_DECREF(it);
3953 return NULL;
3956 static
3957 PyUnicodeObject *pad(PyUnicodeObject *self,
3958 int left,
3959 int right,
3960 Py_UNICODE fill)
3962 PyUnicodeObject *u;
3964 if (left < 0)
3965 left = 0;
3966 if (right < 0)
3967 right = 0;
3969 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3970 Py_INCREF(self);
3971 return self;
3974 u = _PyUnicode_New(left + self->length + right);
3975 if (u) {
3976 if (left)
3977 Py_UNICODE_FILL(u->str, fill, left);
3978 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3979 if (right)
3980 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3983 return u;
/* Append a new Unicode object holding data[left:right] to `list`.

   The expansion site must provide a `PyObject *str` scratch variable, a
   `PyObject *list`, and an `onError` label; control jumps to that label
   if the object cannot be created or appended.  Wrapped in
   do { } while (0) so the macro behaves as a single statement, and the
   arguments are parenthesized so expressions can be passed safely.
   Invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3997 static
3998 PyObject *split_whitespace(PyUnicodeObject *self,
3999 PyObject *list,
4000 int maxcount)
4002 register int i;
4003 register int j;
4004 int len = self->length;
4005 PyObject *str;
4007 for (i = j = 0; i < len; ) {
4008 /* find a token */
4009 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4010 i++;
4011 j = i;
4012 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4013 i++;
4014 if (j < i) {
4015 if (maxcount-- <= 0)
4016 break;
4017 SPLIT_APPEND(self->str, j, i);
4018 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4019 i++;
4020 j = i;
4023 if (j < len) {
4024 SPLIT_APPEND(self->str, j, len);
4026 return list;
4028 onError:
4029 Py_DECREF(list);
4030 return NULL;
4033 PyObject *PyUnicode_Splitlines(PyObject *string,
4034 int keepends)
4036 register int i;
4037 register int j;
4038 int len;
4039 PyObject *list;
4040 PyObject *str;
4041 Py_UNICODE *data;
4043 string = PyUnicode_FromObject(string);
4044 if (string == NULL)
4045 return NULL;
4046 data = PyUnicode_AS_UNICODE(string);
4047 len = PyUnicode_GET_SIZE(string);
4049 list = PyList_New(0);
4050 if (!list)
4051 goto onError;
4053 for (i = j = 0; i < len; ) {
4054 int eol;
4056 /* Find a line and append it */
4057 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4058 i++;
4060 /* Skip the line break reading CRLF as one line break */
4061 eol = i;
4062 if (i < len) {
4063 if (data[i] == '\r' && i + 1 < len &&
4064 data[i+1] == '\n')
4065 i += 2;
4066 else
4067 i++;
4068 if (keepends)
4069 eol = i;
4071 SPLIT_APPEND(data, j, eol);
4072 j = i;
4074 if (j < len) {
4075 SPLIT_APPEND(data, j, len);
4078 Py_DECREF(string);
4079 return list;
4081 onError:
4082 Py_DECREF(list);
4083 Py_DECREF(string);
4084 return NULL;
4087 static
4088 PyObject *split_char(PyUnicodeObject *self,
4089 PyObject *list,
4090 Py_UNICODE ch,
4091 int maxcount)
4093 register int i;
4094 register int j;
4095 int len = self->length;
4096 PyObject *str;
4098 for (i = j = 0; i < len; ) {
4099 if (self->str[i] == ch) {
4100 if (maxcount-- <= 0)
4101 break;
4102 SPLIT_APPEND(self->str, j, i);
4103 i = j = i + 1;
4104 } else
4105 i++;
4107 if (j <= len) {
4108 SPLIT_APPEND(self->str, j, len);
4110 return list;
4112 onError:
4113 Py_DECREF(list);
4114 return NULL;
4117 static
4118 PyObject *split_substring(PyUnicodeObject *self,
4119 PyObject *list,
4120 PyUnicodeObject *substring,
4121 int maxcount)
4123 register int i;
4124 register int j;
4125 int len = self->length;
4126 int sublen = substring->length;
4127 PyObject *str;
4129 for (i = j = 0; i <= len - sublen; ) {
4130 if (Py_UNICODE_MATCH(self, i, substring)) {
4131 if (maxcount-- <= 0)
4132 break;
4133 SPLIT_APPEND(self->str, j, i);
4134 i = j = i + sublen;
4135 } else
4136 i++;
4138 if (j <= len) {
4139 SPLIT_APPEND(self->str, j, len);
4141 return list;
4143 onError:
4144 Py_DECREF(list);
4145 return NULL;
4148 #undef SPLIT_APPEND
4150 static
4151 PyObject *split(PyUnicodeObject *self,
4152 PyUnicodeObject *substring,
4153 int maxcount)
4155 PyObject *list;
4157 if (maxcount < 0)
4158 maxcount = INT_MAX;
4160 list = PyList_New(0);
4161 if (!list)
4162 return NULL;
4164 if (substring == NULL)
4165 return split_whitespace(self,list,maxcount);
4167 else if (substring->length == 1)
4168 return split_char(self,list,substring->str[0],maxcount);
4170 else if (substring->length == 0) {
4171 Py_DECREF(list);
4172 PyErr_SetString(PyExc_ValueError, "empty separator");
4173 return NULL;
4175 else
4176 return split_substring(self,list,substring,maxcount);
4179 static
4180 PyObject *replace(PyUnicodeObject *self,
4181 PyUnicodeObject *str1,
4182 PyUnicodeObject *str2,
4183 int maxcount)
4185 PyUnicodeObject *u;
4187 if (maxcount < 0)
4188 maxcount = INT_MAX;
4190 if (str1->length == 1 && str2->length == 1) {
4191 int i;
4193 /* replace characters */
4194 if (!findchar(self->str, self->length, str1->str[0]) &&
4195 PyUnicode_CheckExact(self)) {
4196 /* nothing to replace, return original string */
4197 Py_INCREF(self);
4198 u = self;
4199 } else {
4200 Py_UNICODE u1 = str1->str[0];
4201 Py_UNICODE u2 = str2->str[0];
4203 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4204 NULL,
4205 self->length
4207 if (u != NULL) {
4208 Py_UNICODE_COPY(u->str, self->str,
4209 self->length);
4210 for (i = 0; i < u->length; i++)
4211 if (u->str[i] == u1) {
4212 if (--maxcount < 0)
4213 break;
4214 u->str[i] = u2;
4219 } else {
4220 int n, i;
4221 Py_UNICODE *p;
4223 /* replace strings */
4224 n = count(self, 0, self->length, str1);
4225 if (n > maxcount)
4226 n = maxcount;
4227 if (n == 0) {
4228 /* nothing to replace, return original string */
4229 if (PyUnicode_CheckExact(self)) {
4230 Py_INCREF(self);
4231 u = self;
4233 else {
4234 u = (PyUnicodeObject *)
4235 PyUnicode_FromUnicode(self->str, self->length);
4237 } else {
4238 u = _PyUnicode_New(
4239 self->length + n * (str2->length - str1->length));
4240 if (u) {
4241 i = 0;
4242 p = u->str;
4243 if (str1->length > 0) {
4244 while (i <= self->length - str1->length)
4245 if (Py_UNICODE_MATCH(self, i, str1)) {
4246 /* replace string segment */
4247 Py_UNICODE_COPY(p, str2->str, str2->length);
4248 p += str2->length;
4249 i += str1->length;
4250 if (--n <= 0) {
4251 /* copy remaining part */
4252 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4253 break;
4255 } else
4256 *p++ = self->str[i++];
4257 } else {
4258 while (n > 0) {
4259 Py_UNICODE_COPY(p, str2->str, str2->length);
4260 p += str2->length;
4261 if (--n <= 0)
4262 break;
4263 *p++ = self->str[i++];
4265 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4271 return (PyObject *) u;
4274 /* --- Unicode Object Methods --------------------------------------------- */
4276 PyDoc_STRVAR(title__doc__,
4277 "S.title() -> unicode\n\
4279 Return a titlecased version of S, i.e. words start with title case\n\
4280 characters, all remaining cased characters have lower case.");
4282 static PyObject*
4283 unicode_title(PyUnicodeObject *self)
4285 return fixup(self, fixtitle);
4288 PyDoc_STRVAR(capitalize__doc__,
4289 "S.capitalize() -> unicode\n\
4291 Return a capitalized version of S, i.e. make the first character\n\
4292 have upper case.");
4294 static PyObject*
4295 unicode_capitalize(PyUnicodeObject *self)
4297 return fixup(self, fixcapitalize);
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word and join
   the words again with the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int k;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                               fixcapitalize);
        if (word == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4338 PyDoc_STRVAR(center__doc__,
4339 "S.center(width) -> unicode\n\
4341 Return S centered in a Unicode string of length width. Padding is done\n\
4342 using spaces.");
4344 static PyObject *
4345 unicode_center(PyUnicodeObject *self, PyObject *args)
4347 int marg, left;
4348 int width;
4350 if (!PyArg_ParseTuple(args, "i:center", &width))
4351 return NULL;
4353 if (self->length >= width && PyUnicode_CheckExact(self)) {
4354 Py_INCREF(self);
4355 return (PyObject*) self;
4358 marg = width - self->length;
4359 left = marg / 2 + (marg & width & 1);
4361 return (PyObject*) pad(self, left, marg - left, ' ');
4364 #if 0
4366 /* This code should go into some future Unicode collation support
4367 module. The basic comparison should compare ordinals on a naive
4368 basis (this is what Java does and thus JPython too). */
4370 /* speedy UTF-16 code point order comparison */
4371 /* gleaned from: */
4372 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4374 static short utf16Fixup[32] =
4376 0, 0, 0, 0, 0, 0, 0, 0,
4377 0, 0, 0, 0, 0, 0, 0, 0,
4378 0, 0, 0, 0, 0, 0, 0, 0,
4379 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4382 static int
4383 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4385 int len1, len2;
4387 Py_UNICODE *s1 = str1->str;
4388 Py_UNICODE *s2 = str2->str;
4390 len1 = str1->length;
4391 len2 = str2->length;
4393 while (len1 > 0 && len2 > 0) {
4394 Py_UNICODE c1, c2;
4396 c1 = *s1++;
4397 c2 = *s2++;
4399 if (c1 > (1<<11) * 26)
4400 c1 += utf16Fixup[c1>>11];
4401 if (c2 > (1<<11) * 26)
4402 c2 += utf16Fixup[c2>>11];
4403 /* now c1 and c2 are in UTF-32-compatible order */
4405 if (c1 != c2)
4406 return (c1 < c2) ? -1 : 1;
4408 len1--; len2--;
4411 return (len1 < len2) ? -1 : (len1 != len2);
4414 #else
4416 static int
4417 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4419 register int len1, len2;
4421 Py_UNICODE *s1 = str1->str;
4422 Py_UNICODE *s2 = str2->str;
4424 len1 = str1->length;
4425 len2 = str2->length;
4427 while (len1 > 0 && len2 > 0) {
4428 Py_UNICODE c1, c2;
4430 c1 = *s1++;
4431 c2 = *s2++;
4433 if (c1 != c2)
4434 return (c1 < c2) ? -1 : 1;
4436 len1--; len2--;
4439 return (len1 < len2) ? -1 : (len1 != len2);
4442 #endif
4444 int PyUnicode_Compare(PyObject *left,
4445 PyObject *right)
4447 PyUnicodeObject *u = NULL, *v = NULL;
4448 int result;
4450 /* Coerce the two arguments */
4451 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4452 if (u == NULL)
4453 goto onError;
4454 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4455 if (v == NULL)
4456 goto onError;
4458 /* Shortcut for empty or interned objects */
4459 if (v == u) {
4460 Py_DECREF(u);
4461 Py_DECREF(v);
4462 return 0;
4465 result = unicode_compare(u, v);
4467 Py_DECREF(u);
4468 Py_DECREF(v);
4469 return result;
4471 onError:
4472 Py_XDECREF(u);
4473 Py_XDECREF(v);
4474 return -1;
4477 int PyUnicode_Contains(PyObject *container,
4478 PyObject *element)
4480 PyUnicodeObject *u = NULL, *v = NULL;
4481 int result, size;
4482 register const Py_UNICODE *lhs, *end, *rhs;
4484 /* Coerce the two arguments */
4485 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4486 if (v == NULL) {
4487 PyErr_SetString(PyExc_TypeError,
4488 "'in <string>' requires string as left operand");
4489 goto onError;
4491 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4492 if (u == NULL)
4493 goto onError;
4495 size = PyUnicode_GET_SIZE(v);
4496 rhs = PyUnicode_AS_UNICODE(v);
4497 lhs = PyUnicode_AS_UNICODE(u);
4499 result = 0;
4500 if (size == 1) {
4501 end = lhs + PyUnicode_GET_SIZE(u);
4502 while (lhs < end) {
4503 if (*lhs++ == *rhs) {
4504 result = 1;
4505 break;
4509 else {
4510 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4511 while (lhs <= end) {
4512 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
4513 result = 1;
4514 break;
4519 Py_DECREF(u);
4520 Py_DECREF(v);
4521 return result;
4523 onError:
4524 Py_XDECREF(u);
4525 Py_XDECREF(v);
4526 return -1;
4529 /* Concat to string or Unicode object giving a new Unicode object. */
4531 PyObject *PyUnicode_Concat(PyObject *left,
4532 PyObject *right)
4534 PyUnicodeObject *u = NULL, *v = NULL, *w;
4536 /* Coerce the two arguments */
4537 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4538 if (u == NULL)
4539 goto onError;
4540 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4541 if (v == NULL)
4542 goto onError;
4544 /* Shortcuts */
4545 if (v == unicode_empty) {
4546 Py_DECREF(v);
4547 return (PyObject *)u;
4549 if (u == unicode_empty) {
4550 Py_DECREF(u);
4551 return (PyObject *)v;
4554 /* Concat the two Unicode strings */
4555 w = _PyUnicode_New(u->length + v->length);
4556 if (w == NULL)
4557 goto onError;
4558 Py_UNICODE_COPY(w->str, u->str, u->length);
4559 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4561 Py_DECREF(u);
4562 Py_DECREF(v);
4563 return (PyObject *)w;
4565 onError:
4566 Py_XDECREF(u);
4567 Py_XDECREF(v);
4568 return NULL;
4571 PyDoc_STRVAR(count__doc__,
4572 "S.count(sub[, start[, end]]) -> int\n\
4574 Return the number of occurrences of substring sub in Unicode string\n\
4575 S[start:end]. Optional arguments start and end are\n\
4576 interpreted as in slice notation.");
4578 static PyObject *
4579 unicode_count(PyUnicodeObject *self, PyObject *args)
4581 PyUnicodeObject *substring;
4582 int start = 0;
4583 int end = INT_MAX;
4584 PyObject *result;
4586 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4587 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4588 return NULL;
4590 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4591 (PyObject *)substring);
4592 if (substring == NULL)
4593 return NULL;
4595 if (start < 0)
4596 start += self->length;
4597 if (start < 0)
4598 start = 0;
4599 if (end > self->length)
4600 end = self->length;
4601 if (end < 0)
4602 end += self->length;
4603 if (end < 0)
4604 end = 0;
4606 result = PyInt_FromLong((long) count(self, start, end, substring));
4608 Py_DECREF(substring);
4609 return result;
4612 PyDoc_STRVAR(encode__doc__,
4613 "S.encode([encoding[,errors]]) -> string\n\
4615 Return an encoded string version of S. Default encoding is the current\n\
4616 default string encoding. errors may be given to set a different error\n\
4617 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4618 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4619 'xmlcharrefreplace' as well as any other name registered with\n\
4620 codecs.register_error that can handle UnicodeEncodeErrors.");
4622 static PyObject *
4623 unicode_encode(PyUnicodeObject *self, PyObject *args)
4625 char *encoding = NULL;
4626 char *errors = NULL;
4627 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4628 return NULL;
4629 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4632 PyDoc_STRVAR(expandtabs__doc__,
4633 "S.expandtabs([tabsize]) -> unicode\n\
4635 Return a copy of S where all tab characters are expanded using spaces.\n\
4636 If tabsize is not given, a tab size of 8 characters is assumed.");
4638 static PyObject*
4639 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4641 Py_UNICODE *e;
4642 Py_UNICODE *p;
4643 Py_UNICODE *q;
4644 int i, j;
4645 PyUnicodeObject *u;
4646 int tabsize = 8;
4648 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4649 return NULL;
4651 /* First pass: determine size of output string */
4652 i = j = 0;
4653 e = self->str + self->length;
4654 for (p = self->str; p < e; p++)
4655 if (*p == '\t') {
4656 if (tabsize > 0)
4657 j += tabsize - (j % tabsize);
4659 else {
4660 j++;
4661 if (*p == '\n' || *p == '\r') {
4662 i += j;
4663 j = 0;
4667 /* Second pass: create output string and fill it */
4668 u = _PyUnicode_New(i + j);
4669 if (!u)
4670 return NULL;
4672 j = 0;
4673 q = u->str;
4675 for (p = self->str; p < e; p++)
4676 if (*p == '\t') {
4677 if (tabsize > 0) {
4678 i = tabsize - (j % tabsize);
4679 j += i;
4680 while (i--)
4681 *q++ = ' ';
4684 else {
4685 j++;
4686 *q++ = *p;
4687 if (*p == '\n' || *p == '\r')
4688 j = 0;
4691 return (PyObject*) u;
4694 PyDoc_STRVAR(find__doc__,
4695 "S.find(sub [,start [,end]]) -> int\n\
4697 Return the lowest index in S where substring sub is found,\n\
4698 such that sub is contained within s[start,end]. Optional\n\
4699 arguments start and end are interpreted as in slice notation.\n\
4701 Return -1 on failure.");
4703 static PyObject *
4704 unicode_find(PyUnicodeObject *self, PyObject *args)
4706 PyUnicodeObject *substring;
4707 int start = 0;
4708 int end = INT_MAX;
4709 PyObject *result;
4711 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4712 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4713 return NULL;
4714 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4715 (PyObject *)substring);
4716 if (substring == NULL)
4717 return NULL;
4719 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4721 Py_DECREF(substring);
4722 return result;
4725 static PyObject *
4726 unicode_getitem(PyUnicodeObject *self, int index)
4728 if (index < 0 || index >= self->length) {
4729 PyErr_SetString(PyExc_IndexError, "string index out of range");
4730 return NULL;
4733 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4736 static long
4737 unicode_hash(PyUnicodeObject *self)
4739 /* Since Unicode objects compare equal to their ASCII string
4740 counterparts, they should use the individual character values
4741 as basis for their hash value. This is needed to assure that
4742 strings and Unicode objects behave in the same way as
4743 dictionary keys. */
4745 register int len;
4746 register Py_UNICODE *p;
4747 register long x;
4749 if (self->hash != -1)
4750 return self->hash;
4751 len = PyUnicode_GET_SIZE(self);
4752 p = PyUnicode_AS_UNICODE(self);
4753 x = *p << 7;
4754 while (--len >= 0)
4755 x = (1000003*x) ^ *p++;
4756 x ^= PyUnicode_GET_SIZE(self);
4757 if (x == -1)
4758 x = -2;
4759 self->hash = x;
4760 return x;
4763 PyDoc_STRVAR(index__doc__,
4764 "S.index(sub [,start [,end]]) -> int\n\
4766 Like S.find() but raise ValueError when the substring is not found.");
4768 static PyObject *
4769 unicode_index(PyUnicodeObject *self, PyObject *args)
4771 int result;
4772 PyUnicodeObject *substring;
4773 int start = 0;
4774 int end = INT_MAX;
4776 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4777 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4778 return NULL;
4780 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4781 (PyObject *)substring);
4782 if (substring == NULL)
4783 return NULL;
4785 result = findstring(self, substring, start, end, 1);
4787 Py_DECREF(substring);
4788 if (result < 0) {
4789 PyErr_SetString(PyExc_ValueError, "substring not found");
4790 return NULL;
4792 return PyInt_FromLong(result);
4795 PyDoc_STRVAR(islower__doc__,
4796 "S.islower() -> bool\n\
4798 Return True if all cased characters in S are lowercase and there is\n\
4799 at least one cased character in S, False otherwise.");
4801 static PyObject*
4802 unicode_islower(PyUnicodeObject *self)
4804 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4805 register const Py_UNICODE *e;
4806 int cased;
4808 /* Shortcut for single character strings */
4809 if (PyUnicode_GET_SIZE(self) == 1)
4810 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
4812 /* Special case for empty strings */
4813 if (PyString_GET_SIZE(self) == 0)
4814 return PyBool_FromLong(0);
4816 e = p + PyUnicode_GET_SIZE(self);
4817 cased = 0;
4818 for (; p < e; p++) {
4819 register const Py_UNICODE ch = *p;
4821 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4822 return PyBool_FromLong(0);
4823 else if (!cased && Py_UNICODE_ISLOWER(ch))
4824 cased = 1;
4826 return PyBool_FromLong(cased);
4829 PyDoc_STRVAR(isupper__doc__,
4830 "S.isupper() -> bool\n\
4832 Return True if all cased characters in S are uppercase and there is\n\
4833 at least one cased character in S, False otherwise.");
4835 static PyObject*
4836 unicode_isupper(PyUnicodeObject *self)
4838 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4839 register const Py_UNICODE *e;
4840 int cased;
4842 /* Shortcut for single character strings */
4843 if (PyUnicode_GET_SIZE(self) == 1)
4844 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4846 /* Special case for empty strings */
4847 if (PyString_GET_SIZE(self) == 0)
4848 return PyBool_FromLong(0);
4850 e = p + PyUnicode_GET_SIZE(self);
4851 cased = 0;
4852 for (; p < e; p++) {
4853 register const Py_UNICODE ch = *p;
4855 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4856 return PyBool_FromLong(0);
4857 else if (!cased && Py_UNICODE_ISUPPER(ch))
4858 cased = 1;
4860 return PyBool_FromLong(cased);
4863 PyDoc_STRVAR(istitle__doc__,
4864 "S.istitle() -> bool\n\
4866 Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4867 characters may only follow uncased characters and lowercase characters\n\
4868 only cased ones. Return False otherwise.");
4870 static PyObject*
4871 unicode_istitle(PyUnicodeObject *self)
4873 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4874 register const Py_UNICODE *e;
4875 int cased, previous_is_cased;
4877 /* Shortcut for single character strings */
4878 if (PyUnicode_GET_SIZE(self) == 1)
4879 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4880 (Py_UNICODE_ISUPPER(*p) != 0));
4882 /* Special case for empty strings */
4883 if (PyString_GET_SIZE(self) == 0)
4884 return PyBool_FromLong(0);
4886 e = p + PyUnicode_GET_SIZE(self);
4887 cased = 0;
4888 previous_is_cased = 0;
4889 for (; p < e; p++) {
4890 register const Py_UNICODE ch = *p;
4892 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4893 if (previous_is_cased)
4894 return PyBool_FromLong(0);
4895 previous_is_cased = 1;
4896 cased = 1;
4898 else if (Py_UNICODE_ISLOWER(ch)) {
4899 if (!previous_is_cased)
4900 return PyBool_FromLong(0);
4901 previous_is_cased = 1;
4902 cased = 1;
4904 else
4905 previous_is_cased = 0;
4907 return PyBool_FromLong(cased);
4910 PyDoc_STRVAR(isspace__doc__,
4911 "S.isspace() -> bool\n\
4913 Return True if there are only whitespace characters in S,\n\
4914 False otherwise.");
4916 static PyObject*
4917 unicode_isspace(PyUnicodeObject *self)
4919 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4920 register const Py_UNICODE *e;
4922 /* Shortcut for single character strings */
4923 if (PyUnicode_GET_SIZE(self) == 1 &&
4924 Py_UNICODE_ISSPACE(*p))
4925 return PyBool_FromLong(1);
4927 /* Special case for empty strings */
4928 if (PyString_GET_SIZE(self) == 0)
4929 return PyBool_FromLong(0);
4931 e = p + PyUnicode_GET_SIZE(self);
4932 for (; p < e; p++) {
4933 if (!Py_UNICODE_ISSPACE(*p))
4934 return PyBool_FromLong(0);
4936 return PyBool_FromLong(1);
4939 PyDoc_STRVAR(isalpha__doc__,
4940 "S.isalpha() -> bool\n\
4942 Return True if all characters in S are alphabetic\n\
4943 and there is at least one character in S, False otherwise.");
4945 static PyObject*
4946 unicode_isalpha(PyUnicodeObject *self)
4948 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4949 register const Py_UNICODE *e;
4951 /* Shortcut for single character strings */
4952 if (PyUnicode_GET_SIZE(self) == 1 &&
4953 Py_UNICODE_ISALPHA(*p))
4954 return PyBool_FromLong(1);
4956 /* Special case for empty strings */
4957 if (PyString_GET_SIZE(self) == 0)
4958 return PyBool_FromLong(0);
4960 e = p + PyUnicode_GET_SIZE(self);
4961 for (; p < e; p++) {
4962 if (!Py_UNICODE_ISALPHA(*p))
4963 return PyBool_FromLong(0);
4965 return PyBool_FromLong(1);
4968 PyDoc_STRVAR(isalnum__doc__,
4969 "S.isalnum() -> bool\n\
4971 Return True if all characters in S are alphanumeric\n\
4972 and there is at least one character in S, False otherwise.");
4974 static PyObject*
4975 unicode_isalnum(PyUnicodeObject *self)
4977 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4978 register const Py_UNICODE *e;
4980 /* Shortcut for single character strings */
4981 if (PyUnicode_GET_SIZE(self) == 1 &&
4982 Py_UNICODE_ISALNUM(*p))
4983 return PyBool_FromLong(1);
4985 /* Special case for empty strings */
4986 if (PyString_GET_SIZE(self) == 0)
4987 return PyBool_FromLong(0);
4989 e = p + PyUnicode_GET_SIZE(self);
4990 for (; p < e; p++) {
4991 if (!Py_UNICODE_ISALNUM(*p))
4992 return PyBool_FromLong(0);
4994 return PyBool_FromLong(1);
4997 PyDoc_STRVAR(isdecimal__doc__,
4998 "S.isdecimal() -> bool\n\
5000 Return True if there are only decimal characters in S,\n\
5001 False otherwise.");
5003 static PyObject*
5004 unicode_isdecimal(PyUnicodeObject *self)
5006 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5007 register const Py_UNICODE *e;
5009 /* Shortcut for single character strings */
5010 if (PyUnicode_GET_SIZE(self) == 1 &&
5011 Py_UNICODE_ISDECIMAL(*p))
5012 return PyBool_FromLong(1);
5014 /* Special case for empty strings */
5015 if (PyString_GET_SIZE(self) == 0)
5016 return PyBool_FromLong(0);
5018 e = p + PyUnicode_GET_SIZE(self);
5019 for (; p < e; p++) {
5020 if (!Py_UNICODE_ISDECIMAL(*p))
5021 return PyBool_FromLong(0);
5023 return PyBool_FromLong(1);
5026 PyDoc_STRVAR(isdigit__doc__,
5027 "S.isdigit() -> bool\n\
5029 Return True if there are only digit characters in S,\n\
5030 False otherwise.");
5032 static PyObject*
5033 unicode_isdigit(PyUnicodeObject *self)
5035 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5036 register const Py_UNICODE *e;
5038 /* Shortcut for single character strings */
5039 if (PyUnicode_GET_SIZE(self) == 1 &&
5040 Py_UNICODE_ISDIGIT(*p))
5041 return PyBool_FromLong(1);
5043 /* Special case for empty strings */
5044 if (PyString_GET_SIZE(self) == 0)
5045 return PyBool_FromLong(0);
5047 e = p + PyUnicode_GET_SIZE(self);
5048 for (; p < e; p++) {
5049 if (!Py_UNICODE_ISDIGIT(*p))
5050 return PyBool_FromLong(0);
5052 return PyBool_FromLong(1);
5055 PyDoc_STRVAR(isnumeric__doc__,
5056 "S.isnumeric() -> bool\n\
5058 Return True if there are only numeric characters in S,\n\
5059 False otherwise.");
5061 static PyObject*
5062 unicode_isnumeric(PyUnicodeObject *self)
5064 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5065 register const Py_UNICODE *e;
5067 /* Shortcut for single character strings */
5068 if (PyUnicode_GET_SIZE(self) == 1 &&
5069 Py_UNICODE_ISNUMERIC(*p))
5070 return PyBool_FromLong(1);
5072 /* Special case for empty strings */
5073 if (PyString_GET_SIZE(self) == 0)
5074 return PyBool_FromLong(0);
5076 e = p + PyUnicode_GET_SIZE(self);
5077 for (; p < e; p++) {
5078 if (!Py_UNICODE_ISNUMERIC(*p))
5079 return PyBool_FromLong(0);
5081 return PyBool_FromLong(1);
5084 PyDoc_STRVAR(join__doc__,
5085 "S.join(sequence) -> unicode\n\
5087 Return a string which is the concatenation of the strings in the\n\
5088 sequence. The separator between elements is S.");
5090 static PyObject*
5091 unicode_join(PyObject *self, PyObject *data)
5093 return PyUnicode_Join(self, data);
5096 static int
5097 unicode_length(PyUnicodeObject *self)
5099 return self->length;
5102 PyDoc_STRVAR(ljust__doc__,
5103 "S.ljust(width) -> unicode\n\
5105 Return S left justified in a Unicode string of length width. Padding is\n\
5106 done using spaces.");
5108 static PyObject *
5109 unicode_ljust(PyUnicodeObject *self, PyObject *args)
5111 int width;
5112 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5113 return NULL;
5115 if (self->length >= width && PyUnicode_CheckExact(self)) {
5116 Py_INCREF(self);
5117 return (PyObject*) self;
5120 return (PyObject*) pad(self, 0, width - self->length, ' ');
5123 PyDoc_STRVAR(lower__doc__,
5124 "S.lower() -> unicode\n\
5126 Return a copy of the string S converted to lowercase.");
5128 static PyObject*
5129 unicode_lower(PyUnicodeObject *self)
5131 return fixup(self, fixlower);
/* Selectors for the strip helpers; they double as indexes into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5143 static const Py_UNICODE *
5144 unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5146 size_t i;
5147 for (i = 0; i < n; ++i)
5148 if (s[i] == c)
5149 return s+i;
5150 return NULL;
5153 /* externally visible for str.strip(unicode) */
5154 PyObject *
5155 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5157 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5158 int len = PyUnicode_GET_SIZE(self);
5159 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5160 int seplen = PyUnicode_GET_SIZE(sepobj);
5161 int i, j;
5163 i = 0;
5164 if (striptype != RIGHTSTRIP) {
5165 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5166 i++;
5170 j = len;
5171 if (striptype != LEFTSTRIP) {
5172 do {
5173 j--;
5174 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5175 j++;
5178 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5179 Py_INCREF(self);
5180 return (PyObject*)self;
5182 else
5183 return PyUnicode_FromUnicode(s+i, j-i);
5187 static PyObject *
5188 do_strip(PyUnicodeObject *self, int striptype)
5190 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5191 int len = PyUnicode_GET_SIZE(self), i, j;
5193 i = 0;
5194 if (striptype != RIGHTSTRIP) {
5195 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5196 i++;
5200 j = len;
5201 if (striptype != LEFTSTRIP) {
5202 do {
5203 j--;
5204 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5205 j++;
5208 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5209 Py_INCREF(self);
5210 return (PyObject*)self;
5212 else
5213 return PyUnicode_FromUnicode(s+i, j-i);
5217 static PyObject *
5218 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5220 PyObject *sep = NULL;
5222 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5223 return NULL;
5225 if (sep != NULL && sep != Py_None) {
5226 if (PyUnicode_Check(sep))
5227 return _PyUnicode_XStrip(self, striptype, sep);
5228 else if (PyString_Check(sep)) {
5229 PyObject *res;
5230 sep = PyUnicode_FromObject(sep);
5231 if (sep==NULL)
5232 return NULL;
5233 res = _PyUnicode_XStrip(self, striptype, sep);
5234 Py_DECREF(sep);
5235 return res;
5237 else {
5238 PyErr_Format(PyExc_TypeError,
5239 "%s arg must be None, unicode or str",
5240 STRIPNAME(striptype));
5241 return NULL;
5245 return do_strip(self, striptype);
5249 PyDoc_STRVAR(strip__doc__,
5250 "S.strip([chars]) -> unicode\n\
5252 Return a copy of the string S with leading and trailing\n\
5253 whitespace removed.\n\
5254 If chars is given and not None, remove characters in chars instead.\n\
5255 If chars is a str, it will be converted to unicode before stripping");
5257 static PyObject *
5258 unicode_strip(PyUnicodeObject *self, PyObject *args)
5260 if (PyTuple_GET_SIZE(args) == 0)
5261 return do_strip(self, BOTHSTRIP); /* Common case */
5262 else
5263 return do_argstrip(self, BOTHSTRIP, args);
5267 PyDoc_STRVAR(lstrip__doc__,
5268 "S.lstrip([chars]) -> unicode\n\
5270 Return a copy of the string S with leading whitespace removed.\n\
5271 If chars is given and not None, remove characters in chars instead.\n\
5272 If chars is a str, it will be converted to unicode before stripping");
5274 static PyObject *
5275 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5277 if (PyTuple_GET_SIZE(args) == 0)
5278 return do_strip(self, LEFTSTRIP); /* Common case */
5279 else
5280 return do_argstrip(self, LEFTSTRIP, args);
5284 PyDoc_STRVAR(rstrip__doc__,
5285 "S.rstrip([chars]) -> unicode\n\
5287 Return a copy of the string S with trailing whitespace removed.\n\
5288 If chars is given and not None, remove characters in chars instead.\n\
5289 If chars is a str, it will be converted to unicode before stripping");
5291 static PyObject *
5292 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5294 if (PyTuple_GET_SIZE(args) == 0)
5295 return do_strip(self, RIGHTSTRIP); /* Common case */
5296 else
5297 return do_argstrip(self, RIGHTSTRIP, args);
5301 static PyObject*
5302 unicode_repeat(PyUnicodeObject *str, int len)
5304 PyUnicodeObject *u;
5305 Py_UNICODE *p;
5306 int nchars;
5307 size_t nbytes;
5309 if (len < 0)
5310 len = 0;
5312 if (len == 1 && PyUnicode_CheckExact(str)) {
5313 /* no repeat, return original string */
5314 Py_INCREF(str);
5315 return (PyObject*) str;
5318 /* ensure # of chars needed doesn't overflow int and # of bytes
5319 * needed doesn't overflow size_t
5321 nchars = len * str->length;
5322 if (len && nchars / len != str->length) {
5323 PyErr_SetString(PyExc_OverflowError,
5324 "repeated string is too long");
5325 return NULL;
5327 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5328 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5329 PyErr_SetString(PyExc_OverflowError,
5330 "repeated string is too long");
5331 return NULL;
5333 u = _PyUnicode_New(nchars);
5334 if (!u)
5335 return NULL;
5337 p = u->str;
5339 while (len-- > 0) {
5340 Py_UNICODE_COPY(p, str->str, str->length);
5341 p += str->length;
5344 return (PyObject*) u;
5347 PyObject *PyUnicode_Replace(PyObject *obj,
5348 PyObject *subobj,
5349 PyObject *replobj,
5350 int maxcount)
5352 PyObject *self;
5353 PyObject *str1;
5354 PyObject *str2;
5355 PyObject *result;
5357 self = PyUnicode_FromObject(obj);
5358 if (self == NULL)
5359 return NULL;
5360 str1 = PyUnicode_FromObject(subobj);
5361 if (str1 == NULL) {
5362 Py_DECREF(self);
5363 return NULL;
5365 str2 = PyUnicode_FromObject(replobj);
5366 if (str2 == NULL) {
5367 Py_DECREF(self);
5368 Py_DECREF(str1);
5369 return NULL;
5371 result = replace((PyUnicodeObject *)self,
5372 (PyUnicodeObject *)str1,
5373 (PyUnicodeObject *)str2,
5374 maxcount);
5375 Py_DECREF(self);
5376 Py_DECREF(str1);
5377 Py_DECREF(str2);
5378 return result;
5381 PyDoc_STRVAR(replace__doc__,
5382 "S.replace (old, new[, maxsplit]) -> unicode\n\
5384 Return a copy of S with all occurrences of substring\n\
5385 old replaced by new. If the optional argument maxsplit is\n\
5386 given, only the first maxsplit occurrences are replaced.");
5388 static PyObject*
5389 unicode_replace(PyUnicodeObject *self, PyObject *args)
5391 PyUnicodeObject *str1;
5392 PyUnicodeObject *str2;
5393 int maxcount = -1;
5394 PyObject *result;
5396 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5397 return NULL;
5398 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5399 if (str1 == NULL)
5400 return NULL;
5401 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5402 if (str2 == NULL) {
5403 Py_DECREF(str1);
5404 return NULL;
5407 result = replace(self, str1, str2, maxcount);
5409 Py_DECREF(str1);
5410 Py_DECREF(str2);
5411 return result;
5414 static
5415 PyObject *unicode_repr(PyObject *unicode)
5417 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5418 PyUnicode_GET_SIZE(unicode),
5422 PyDoc_STRVAR(rfind__doc__,
5423 "S.rfind(sub [,start [,end]]) -> int\n\
5425 Return the highest index in S where substring sub is found,\n\
5426 such that sub is contained within s[start,end]. Optional\n\
5427 arguments start and end are interpreted as in slice notation.\n\
5429 Return -1 on failure.");
5431 static PyObject *
5432 unicode_rfind(PyUnicodeObject *self, PyObject *args)
5434 PyUnicodeObject *substring;
5435 int start = 0;
5436 int end = INT_MAX;
5437 PyObject *result;
5439 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5440 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5441 return NULL;
5442 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5443 (PyObject *)substring);
5444 if (substring == NULL)
5445 return NULL;
5447 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5449 Py_DECREF(substring);
5450 return result;
5453 PyDoc_STRVAR(rindex__doc__,
5454 "S.rindex(sub [,start [,end]]) -> int\n\
5456 Like S.rfind() but raise ValueError when the substring is not found.");
5458 static PyObject *
5459 unicode_rindex(PyUnicodeObject *self, PyObject *args)
5461 int result;
5462 PyUnicodeObject *substring;
5463 int start = 0;
5464 int end = INT_MAX;
5466 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5467 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5468 return NULL;
5469 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5470 (PyObject *)substring);
5471 if (substring == NULL)
5472 return NULL;
5474 result = findstring(self, substring, start, end, -1);
5476 Py_DECREF(substring);
5477 if (result < 0) {
5478 PyErr_SetString(PyExc_ValueError, "substring not found");
5479 return NULL;
5481 return PyInt_FromLong(result);
5484 PyDoc_STRVAR(rjust__doc__,
5485 "S.rjust(width) -> unicode\n\
5487 Return S right justified in a Unicode string of length width. Padding is\n\
5488 done using spaces.");
5490 static PyObject *
5491 unicode_rjust(PyUnicodeObject *self, PyObject *args)
5493 int width;
5494 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5495 return NULL;
5497 if (self->length >= width && PyUnicode_CheckExact(self)) {
5498 Py_INCREF(self);
5499 return (PyObject*) self;
5502 return (PyObject*) pad(self, width - self->length, 0, ' ');
5505 static PyObject*
5506 unicode_slice(PyUnicodeObject *self, int start, int end)
5508 /* standard clamping */
5509 if (start < 0)
5510 start = 0;
5511 if (end < 0)
5512 end = 0;
5513 if (end > self->length)
5514 end = self->length;
5515 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
5516 /* full slice, return original string */
5517 Py_INCREF(self);
5518 return (PyObject*) self;
5520 if (start > end)
5521 start = end;
5522 /* copy slice */
5523 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5524 end - start);
5527 PyObject *PyUnicode_Split(PyObject *s,
5528 PyObject *sep,
5529 int maxsplit)
5531 PyObject *result;
5533 s = PyUnicode_FromObject(s);
5534 if (s == NULL)
5535 return NULL;
5536 if (sep != NULL) {
5537 sep = PyUnicode_FromObject(sep);
5538 if (sep == NULL) {
5539 Py_DECREF(s);
5540 return NULL;
5544 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5546 Py_DECREF(s);
5547 Py_XDECREF(sep);
5548 return result;
5551 PyDoc_STRVAR(split__doc__,
5552 "S.split([sep [,maxsplit]]) -> list of strings\n\
5554 Return a list of the words in S, using sep as the\n\
5555 delimiter string. If maxsplit is given, at most maxsplit\n\
5556 splits are done. If sep is not specified, any whitespace string\n\
5557 is a separator.");
5559 static PyObject*
5560 unicode_split(PyUnicodeObject *self, PyObject *args)
5562 PyObject *substring = Py_None;
5563 int maxcount = -1;
5565 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5566 return NULL;
5568 if (substring == Py_None)
5569 return split(self, NULL, maxcount);
5570 else if (PyUnicode_Check(substring))
5571 return split(self, (PyUnicodeObject *)substring, maxcount);
5572 else
5573 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5576 PyDoc_STRVAR(splitlines__doc__,
5577 "S.splitlines([keepends]]) -> list of strings\n\
5579 Return a list of the lines in S, breaking at line boundaries.\n\
5580 Line breaks are not included in the resulting list unless keepends\n\
5581 is given and true.");
5583 static PyObject*
5584 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5586 int keepends = 0;
5588 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
5589 return NULL;
5591 return PyUnicode_Splitlines((PyObject *)self, keepends);
5594 static
5595 PyObject *unicode_str(PyUnicodeObject *self)
5597 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
5600 PyDoc_STRVAR(swapcase__doc__,
5601 "S.swapcase() -> unicode\n\
5603 Return a copy of S with uppercase characters converted to lowercase\n\
5604 and vice versa.");
5606 static PyObject*
5607 unicode_swapcase(PyUnicodeObject *self)
5609 return fixup(self, fixswapcase);
5612 PyDoc_STRVAR(translate__doc__,
5613 "S.translate(table) -> unicode\n\
5615 Return a copy of the string S, where all characters have been mapped\n\
5616 through the given translation table, which must be a mapping of\n\
5617 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5618 Unmapped characters are left untouched. Characters mapped to None\n\
5619 are deleted.");
5621 static PyObject*
5622 unicode_translate(PyUnicodeObject *self, PyObject *table)
5624 return PyUnicode_TranslateCharmap(self->str,
5625 self->length,
5626 table,
5627 "ignore");
5630 PyDoc_STRVAR(upper__doc__,
5631 "S.upper() -> unicode\n\
5633 Return a copy of S converted to uppercase.");
5635 static PyObject*
5636 unicode_upper(PyUnicodeObject *self)
5638 return fixup(self, fixupper);
5641 PyDoc_STRVAR(zfill__doc__,
5642 "S.zfill(width) -> unicode\n\
5644 Pad a numeric string x with zeros on the left, to fill a field\n\
5645 of the specified width. The string x is never truncated.");
5647 static PyObject *
5648 unicode_zfill(PyUnicodeObject *self, PyObject *args)
5650 int fill;
5651 PyUnicodeObject *u;
5653 int width;
5654 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5655 return NULL;
5657 if (self->length >= width) {
5658 if (PyUnicode_CheckExact(self)) {
5659 Py_INCREF(self);
5660 return (PyObject*) self;
5662 else
5663 return PyUnicode_FromUnicode(
5664 PyUnicode_AS_UNICODE(self),
5665 PyUnicode_GET_SIZE(self)
5669 fill = width - self->length;
5671 u = pad(self, fill, 0, '0');
5673 if (u == NULL)
5674 return NULL;
5676 if (u->str[fill] == '+' || u->str[fill] == '-') {
5677 /* move sign to beginning of string */
5678 u->str[0] = u->str[fill];
5679 u->str[fill] = '0';
5682 return (PyObject*) u;
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
5693 PyDoc_STRVAR(startswith__doc__,
5694 "S.startswith(prefix[, start[, end]]) -> bool\n\
5696 Return True if S starts with the specified prefix, False otherwise.\n\
5697 With optional start, test S beginning at that position.\n\
5698 With optional end, stop comparing S at that position.");
5700 static PyObject *
5701 unicode_startswith(PyUnicodeObject *self,
5702 PyObject *args)
5704 PyUnicodeObject *substring;
5705 int start = 0;
5706 int end = INT_MAX;
5707 PyObject *result;
5709 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5710 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5711 return NULL;
5712 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5713 (PyObject *)substring);
5714 if (substring == NULL)
5715 return NULL;
5717 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
5719 Py_DECREF(substring);
5720 return result;
5724 PyDoc_STRVAR(endswith__doc__,
5725 "S.endswith(suffix[, start[, end]]) -> bool\n\
5727 Return True if S ends with the specified suffix, False otherwise.\n\
5728 With optional start, test S beginning at that position.\n\
5729 With optional end, stop comparing S at that position.");
5731 static PyObject *
5732 unicode_endswith(PyUnicodeObject *self,
5733 PyObject *args)
5735 PyUnicodeObject *substring;
5736 int start = 0;
5737 int end = INT_MAX;
5738 PyObject *result;
5740 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5741 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5742 return NULL;
5743 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5744 (PyObject *)substring);
5745 if (substring == NULL)
5746 return NULL;
5748 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
5750 Py_DECREF(substring);
5751 return result;
5756 static PyObject *
5757 unicode_getnewargs(PyUnicodeObject *v)
5759 return Py_BuildValue("(u#)", v->str, v->length);
5763 static PyMethodDef unicode_methods[] = {
5765 /* Order is according to common usage: often used methods should
5766 appear first, since lookup is done sequentially. */
5768 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5769 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5770 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5771 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5772 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5773 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5774 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5775 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5776 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5777 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5778 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5779 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5780 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
5781 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
5782 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5783 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5784 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5785 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
5786 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
5787 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
5788 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
5789 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5790 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5791 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5792 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5793 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5794 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5795 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5796 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5797 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5798 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5799 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5800 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5801 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5802 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
5803 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
5804 #if 0
5805 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
5806 #endif
5808 #if 0
5809 /* This one is just used for debugging the implementation. */
5810 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
5811 #endif
5813 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
5814 {NULL, NULL}
5817 static PyObject *
5818 unicode_mod(PyObject *v, PyObject *w)
5820 if (!PyUnicode_Check(v)) {
5821 Py_INCREF(Py_NotImplemented);
5822 return Py_NotImplemented;
5824 return PyUnicode_Format(v, w);
5827 static PyNumberMethods unicode_as_number = {
5828 0, /*nb_add*/
5829 0, /*nb_subtract*/
5830 0, /*nb_multiply*/
5831 0, /*nb_divide*/
5832 unicode_mod, /*nb_remainder*/
5835 static PySequenceMethods unicode_as_sequence = {
5836 (inquiry) unicode_length, /* sq_length */
5837 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5838 (intargfunc) unicode_repeat, /* sq_repeat */
5839 (intargfunc) unicode_getitem, /* sq_item */
5840 (intintargfunc) unicode_slice, /* sq_slice */
5841 0, /* sq_ass_item */
5842 0, /* sq_ass_slice */
5843 (objobjproc)PyUnicode_Contains, /*sq_contains*/
5846 static PyObject*
5847 unicode_subscript(PyUnicodeObject* self, PyObject* item)
5849 if (PyInt_Check(item)) {
5850 long i = PyInt_AS_LONG(item);
5851 if (i < 0)
5852 i += PyString_GET_SIZE(self);
5853 return unicode_getitem(self, i);
5854 } else if (PyLong_Check(item)) {
5855 long i = PyLong_AsLong(item);
5856 if (i == -1 && PyErr_Occurred())
5857 return NULL;
5858 if (i < 0)
5859 i += PyString_GET_SIZE(self);
5860 return unicode_getitem(self, i);
5861 } else if (PySlice_Check(item)) {
5862 int start, stop, step, slicelength, cur, i;
5863 Py_UNICODE* source_buf;
5864 Py_UNICODE* result_buf;
5865 PyObject* result;
5867 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5868 &start, &stop, &step, &slicelength) < 0) {
5869 return NULL;
5872 if (slicelength <= 0) {
5873 return PyUnicode_FromUnicode(NULL, 0);
5874 } else {
5875 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5876 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5878 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5879 result_buf[i] = source_buf[cur];
5882 result = PyUnicode_FromUnicode(result_buf, slicelength);
5883 PyMem_FREE(result_buf);
5884 return result;
5886 } else {
5887 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5888 return NULL;
5892 static PyMappingMethods unicode_as_mapping = {
5893 (inquiry)unicode_length, /* mp_length */
5894 (binaryfunc)unicode_subscript, /* mp_subscript */
5895 (objobjargproc)0, /* mp_ass_subscript */
5898 static int
5899 unicode_buffer_getreadbuf(PyUnicodeObject *self,
5900 int index,
5901 const void **ptr)
5903 if (index != 0) {
5904 PyErr_SetString(PyExc_SystemError,
5905 "accessing non-existent unicode segment");
5906 return -1;
5908 *ptr = (void *) self->str;
5909 return PyUnicode_GET_DATA_SIZE(self);
5912 static int
5913 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5914 const void **ptr)
5916 PyErr_SetString(PyExc_TypeError,
5917 "cannot use unicode as modifiable buffer");
5918 return -1;
5921 static int
5922 unicode_buffer_getsegcount(PyUnicodeObject *self,
5923 int *lenp)
5925 if (lenp)
5926 *lenp = PyUnicode_GET_DATA_SIZE(self);
5927 return 1;
5930 static int
5931 unicode_buffer_getcharbuf(PyUnicodeObject *self,
5932 int index,
5933 const void **ptr)
5935 PyObject *str;
5937 if (index != 0) {
5938 PyErr_SetString(PyExc_SystemError,
5939 "accessing non-existent unicode segment");
5940 return -1;
5942 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5943 if (str == NULL)
5944 return -1;
5945 *ptr = (void *) PyString_AS_STRING(str);
5946 return PyString_GET_SIZE(str);
5949 /* Helpers for PyUnicode_Format() */
5951 static PyObject *
5952 getnextarg(PyObject *args, int arglen, int *p_argidx)
5954 int argidx = *p_argidx;
5955 if (argidx < arglen) {
5956 (*p_argidx)++;
5957 if (arglen < 0)
5958 return args;
5959 else
5960 return PyTuple_GetItem(args, argidx);
5962 PyErr_SetString(PyExc_TypeError,
5963 "not enough arguments for format string");
5964 return NULL;
/* Bit flags collected while parsing a format specifier; each one is
   set by the flag character shown. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5973 static
5974 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5976 register int i;
5977 int len;
5978 va_list va;
5979 char *charbuffer;
5980 va_start(va, format);
5982 /* First, format the string as char array, then expand to Py_UNICODE
5983 array. */
5984 charbuffer = (char *)buffer;
5985 len = vsprintf(charbuffer, format, va);
5986 for (i = len - 1; i >= 0; i--)
5987 buffer[i] = (Py_UNICODE) charbuffer[i];
5989 va_end(va);
5990 return len;
5993 /* XXX To save some code duplication, formatfloat/long/int could have been
5994 shared with stringobject.c, converting from 8-bit to Unicode after the
5995 formatting is done. */
5997 static int
5998 formatfloat(Py_UNICODE *buf,
5999 size_t buflen,
6000 int flags,
6001 int prec,
6002 int type,
6003 PyObject *v)
6005 /* fmt = '%#.' + `prec` + `type`
6006 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6007 char fmt[20];
6008 double x;
6010 x = PyFloat_AsDouble(v);
6011 if (x == -1.0 && PyErr_Occurred())
6012 return -1;
6013 if (prec < 0)
6014 prec = 6;
6015 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6016 type = 'g';
6017 /* Worst case length calc to ensure no buffer overrun:
6019 'g' formats:
6020 fmt = %#.<prec>g
6021 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6022 for any double rep.)
6023 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6025 'f' formats:
6026 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6027 len = 1 + 50 + 1 + prec = 52 + prec
6029 If prec=0 the effective precision is 1 (the leading digit is
6030 always given), therefore increase the length by one.
6033 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6034 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6035 PyErr_SetString(PyExc_OverflowError,
6036 "formatted float is too long (precision too large?)");
6037 return -1;
6039 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6040 (flags&F_ALT) ? "#" : "",
6041 prec, type);
6042 return usprintf(buf, fmt, x);
6045 static PyObject*
6046 formatlong(PyObject *val, int flags, int prec, int type)
6048 char *buf;
6049 int i, len;
6050 PyObject *str; /* temporary string object. */
6051 PyUnicodeObject *result;
6053 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6054 if (!str)
6055 return NULL;
6056 result = _PyUnicode_New(len);
6057 for (i = 0; i < len; i++)
6058 result->str[i] = buf[i];
6059 result->str[len] = 0;
6060 Py_DECREF(str);
6061 return (PyObject*)result;
6064 static int
6065 formatint(Py_UNICODE *buf,
6066 size_t buflen,
6067 int flags,
6068 int prec,
6069 int type,
6070 PyObject *v)
6072 /* fmt = '%#.' + `prec` + 'l' + `type`
6073 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6074 * + 1 + 1
6075 * = 24
6077 char fmt[64]; /* plenty big enough! */
6078 long x;
6080 x = PyInt_AsLong(v);
6081 if (x == -1 && PyErr_Occurred())
6082 return -1;
6083 if (x < 0 && type != 'd' && type != 'i') {
6084 if (PyErr_Warn(PyExc_FutureWarning,
6085 "%u/%o/%x/%X of negative int will return "
6086 "a signed string in Python 2.4 and up") < 0)
6087 return -1;
6089 if (prec < 0)
6090 prec = 1;
6092 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
6093 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6095 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
6096 PyErr_SetString(PyExc_OverflowError,
6097 "formatted integer is too long (precision too large?)");
6098 return -1;
6101 if ((flags & F_ALT) &&
6102 (type == 'x' || type == 'X')) {
6103 /* When converting under %#x or %#X, there are a number
6104 * of issues that cause pain:
6105 * - when 0 is being converted, the C standard leaves off
6106 * the '0x' or '0X', which is inconsistent with other
6107 * %#x/%#X conversions and inconsistent with Python's
6108 * hex() function
6109 * - there are platforms that violate the standard and
6110 * convert 0 with the '0x' or '0X'
6111 * (Metrowerks, Compaq Tru64)
6112 * - there are platforms that give '0x' when converting
6113 * under %#X, but convert 0 in accordance with the
6114 * standard (OS/2 EMX)
6116 * We can achieve the desired consistency by inserting our
6117 * own '0x' or '0X' prefix, and substituting %x/%X in place
6118 * of %#x/%#X.
6120 * Note that this is the same approach as used in
6121 * formatint() in stringobject.c
6123 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6124 type, prec, type);
6126 else {
6127 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6128 (flags&F_ALT) ? "#" : "",
6129 prec, type);
6131 return usprintf(buf, fmt, x);
6134 static int
6135 formatchar(Py_UNICODE *buf,
6136 size_t buflen,
6137 PyObject *v)
6139 /* presume that the buffer is at least 2 characters long */
6140 if (PyUnicode_Check(v)) {
6141 if (PyUnicode_GET_SIZE(v) != 1)
6142 goto onError;
6143 buf[0] = PyUnicode_AS_UNICODE(v)[0];
6146 else if (PyString_Check(v)) {
6147 if (PyString_GET_SIZE(v) != 1)
6148 goto onError;
6149 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6152 else {
6153 /* Integer input truncated to a character */
6154 long x;
6155 x = PyInt_AsLong(v);
6156 if (x == -1 && PyErr_Occurred())
6157 goto onError;
6158 #ifdef Py_UNICODE_WIDE
6159 if (x < 0 || x > 0x10ffff) {
6160 PyErr_SetString(PyExc_OverflowError,
6161 "%c arg not in range(0x110000) "
6162 "(wide Python build)");
6163 return -1;
6165 #else
6166 if (x < 0 || x > 0xffff) {
6167 PyErr_SetString(PyExc_OverflowError,
6168 "%c arg not in range(0x10000) "
6169 "(narrow Python build)");
6170 return -1;
6172 #endif
6173 buf[0] = (Py_UNICODE) x;
6175 buf[1] = '\0';
6176 return 1;
6178 onError:
6179 PyErr_SetString(PyExc_TypeError,
6180 "%c requires int or char");
6181 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size of the scratch buffer in which floats,
   ints and chars are formatted.  XXX It is a magic number.  Every
   formatting routine does its own bounds checking to avoid overflow;
   a better solution may be to malloc a buffer of the appropriate size
   for each format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
6194 PyObject *PyUnicode_Format(PyObject *format,
6195 PyObject *args)
6197 Py_UNICODE *fmt, *res;
6198 int fmtcnt, rescnt, reslen, arglen, argidx;
6199 int args_owned = 0;
6200 PyUnicodeObject *result = NULL;
6201 PyObject *dict = NULL;
6202 PyObject *uformat;
6204 if (format == NULL || args == NULL) {
6205 PyErr_BadInternalCall();
6206 return NULL;
6208 uformat = PyUnicode_FromObject(format);
6209 if (uformat == NULL)
6210 return NULL;
6211 fmt = PyUnicode_AS_UNICODE(uformat);
6212 fmtcnt = PyUnicode_GET_SIZE(uformat);
6214 reslen = rescnt = fmtcnt + 100;
6215 result = _PyUnicode_New(reslen);
6216 if (result == NULL)
6217 goto onError;
6218 res = PyUnicode_AS_UNICODE(result);
6220 if (PyTuple_Check(args)) {
6221 arglen = PyTuple_Size(args);
6222 argidx = 0;
6224 else {
6225 arglen = -1;
6226 argidx = -2;
6228 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6229 !PyObject_TypeCheck(args, &PyBaseString_Type))
6230 dict = args;
6232 while (--fmtcnt >= 0) {
6233 if (*fmt != '%') {
6234 if (--rescnt < 0) {
6235 rescnt = fmtcnt + 100;
6236 reslen += rescnt;
6237 if (_PyUnicode_Resize(&result, reslen) < 0)
6238 return NULL;
6239 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6240 --rescnt;
6242 *res++ = *fmt++;
6244 else {
6245 /* Got a format specifier */
6246 int flags = 0;
6247 int width = -1;
6248 int prec = -1;
6249 Py_UNICODE c = '\0';
6250 Py_UNICODE fill;
6251 PyObject *v = NULL;
6252 PyObject *temp = NULL;
6253 Py_UNICODE *pbuf;
6254 Py_UNICODE sign;
6255 int len;
6256 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6258 fmt++;
6259 if (*fmt == '(') {
6260 Py_UNICODE *keystart;
6261 int keylen;
6262 PyObject *key;
6263 int pcount = 1;
6265 if (dict == NULL) {
6266 PyErr_SetString(PyExc_TypeError,
6267 "format requires a mapping");
6268 goto onError;
6270 ++fmt;
6271 --fmtcnt;
6272 keystart = fmt;
6273 /* Skip over balanced parentheses */
6274 while (pcount > 0 && --fmtcnt >= 0) {
6275 if (*fmt == ')')
6276 --pcount;
6277 else if (*fmt == '(')
6278 ++pcount;
6279 fmt++;
6281 keylen = fmt - keystart - 1;
6282 if (fmtcnt < 0 || pcount > 0) {
6283 PyErr_SetString(PyExc_ValueError,
6284 "incomplete format key");
6285 goto onError;
6287 #if 0
6288 /* keys are converted to strings using UTF-8 and
6289 then looked up since Python uses strings to hold
6290 variables names etc. in its namespaces and we
6291 wouldn't want to break common idioms. */
6292 key = PyUnicode_EncodeUTF8(keystart,
6293 keylen,
6294 NULL);
6295 #else
6296 key = PyUnicode_FromUnicode(keystart, keylen);
6297 #endif
6298 if (key == NULL)
6299 goto onError;
6300 if (args_owned) {
6301 Py_DECREF(args);
6302 args_owned = 0;
6304 args = PyObject_GetItem(dict, key);
6305 Py_DECREF(key);
6306 if (args == NULL) {
6307 goto onError;
6309 args_owned = 1;
6310 arglen = -1;
6311 argidx = -2;
6313 while (--fmtcnt >= 0) {
6314 switch (c = *fmt++) {
6315 case '-': flags |= F_LJUST; continue;
6316 case '+': flags |= F_SIGN; continue;
6317 case ' ': flags |= F_BLANK; continue;
6318 case '#': flags |= F_ALT; continue;
6319 case '0': flags |= F_ZERO; continue;
6321 break;
6323 if (c == '*') {
6324 v = getnextarg(args, arglen, &argidx);
6325 if (v == NULL)
6326 goto onError;
6327 if (!PyInt_Check(v)) {
6328 PyErr_SetString(PyExc_TypeError,
6329 "* wants int");
6330 goto onError;
6332 width = PyInt_AsLong(v);
6333 if (width < 0) {
6334 flags |= F_LJUST;
6335 width = -width;
6337 if (--fmtcnt >= 0)
6338 c = *fmt++;
6340 else if (c >= '0' && c <= '9') {
6341 width = c - '0';
6342 while (--fmtcnt >= 0) {
6343 c = *fmt++;
6344 if (c < '0' || c > '9')
6345 break;
6346 if ((width*10) / 10 != width) {
6347 PyErr_SetString(PyExc_ValueError,
6348 "width too big");
6349 goto onError;
6351 width = width*10 + (c - '0');
6354 if (c == '.') {
6355 prec = 0;
6356 if (--fmtcnt >= 0)
6357 c = *fmt++;
6358 if (c == '*') {
6359 v = getnextarg(args, arglen, &argidx);
6360 if (v == NULL)
6361 goto onError;
6362 if (!PyInt_Check(v)) {
6363 PyErr_SetString(PyExc_TypeError,
6364 "* wants int");
6365 goto onError;
6367 prec = PyInt_AsLong(v);
6368 if (prec < 0)
6369 prec = 0;
6370 if (--fmtcnt >= 0)
6371 c = *fmt++;
6373 else if (c >= '0' && c <= '9') {
6374 prec = c - '0';
6375 while (--fmtcnt >= 0) {
6376 c = Py_CHARMASK(*fmt++);
6377 if (c < '0' || c > '9')
6378 break;
6379 if ((prec*10) / 10 != prec) {
6380 PyErr_SetString(PyExc_ValueError,
6381 "prec too big");
6382 goto onError;
6384 prec = prec*10 + (c - '0');
6387 } /* prec */
6388 if (fmtcnt >= 0) {
6389 if (c == 'h' || c == 'l' || c == 'L') {
6390 if (--fmtcnt >= 0)
6391 c = *fmt++;
6394 if (fmtcnt < 0) {
6395 PyErr_SetString(PyExc_ValueError,
6396 "incomplete format");
6397 goto onError;
6399 if (c != '%') {
6400 v = getnextarg(args, arglen, &argidx);
6401 if (v == NULL)
6402 goto onError;
6404 sign = 0;
6405 fill = ' ';
6406 switch (c) {
6408 case '%':
6409 pbuf = formatbuf;
6410 /* presume that buffer length is at least 1 */
6411 pbuf[0] = '%';
6412 len = 1;
6413 break;
6415 case 's':
6416 case 'r':
6417 if (PyUnicode_Check(v) && c == 's') {
6418 temp = v;
6419 Py_INCREF(temp);
6421 else {
6422 PyObject *unicode;
6423 if (c == 's')
6424 temp = PyObject_Str(v);
6425 else
6426 temp = PyObject_Repr(v);
6427 if (temp == NULL)
6428 goto onError;
6429 if (!PyString_Check(temp)) {
6430 /* XXX Note: this should never happen, since
6431 PyObject_Repr() and PyObject_Str() assure
6432 this */
6433 Py_DECREF(temp);
6434 PyErr_SetString(PyExc_TypeError,
6435 "%s argument has non-string str()");
6436 goto onError;
6438 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
6439 PyString_GET_SIZE(temp),
6440 NULL,
6441 "strict");
6442 Py_DECREF(temp);
6443 temp = unicode;
6444 if (temp == NULL)
6445 goto onError;
6447 pbuf = PyUnicode_AS_UNICODE(temp);
6448 len = PyUnicode_GET_SIZE(temp);
6449 if (prec >= 0 && len > prec)
6450 len = prec;
6451 break;
6453 case 'i':
6454 case 'd':
6455 case 'u':
6456 case 'o':
6457 case 'x':
6458 case 'X':
6459 if (c == 'i')
6460 c = 'd';
6461 if (PyLong_Check(v)) {
6462 temp = formatlong(v, flags, prec, c);
6463 if (!temp)
6464 goto onError;
6465 pbuf = PyUnicode_AS_UNICODE(temp);
6466 len = PyUnicode_GET_SIZE(temp);
6467 /* unbounded ints can always produce
6468 a sign character! */
6469 sign = 1;
6471 else {
6472 pbuf = formatbuf;
6473 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6474 flags, prec, c, v);
6475 if (len < 0)
6476 goto onError;
6477 /* only d conversion is signed */
6478 sign = c == 'd';
6480 if (flags & F_ZERO)
6481 fill = '0';
6482 break;
6484 case 'e':
6485 case 'E':
6486 case 'f':
6487 case 'g':
6488 case 'G':
6489 pbuf = formatbuf;
6490 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6491 flags, prec, c, v);
6492 if (len < 0)
6493 goto onError;
6494 sign = 1;
6495 if (flags & F_ZERO)
6496 fill = '0';
6497 break;
6499 case 'c':
6500 pbuf = formatbuf;
6501 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
6502 if (len < 0)
6503 goto onError;
6504 break;
6506 default:
6507 PyErr_Format(PyExc_ValueError,
6508 "unsupported format character '%c' (0x%x) "
6509 "at index %i",
6510 (31<=c && c<=126) ? (char)c : '?',
6511 (int)c,
6512 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
6513 goto onError;
6515 if (sign) {
6516 if (*pbuf == '-' || *pbuf == '+') {
6517 sign = *pbuf++;
6518 len--;
6520 else if (flags & F_SIGN)
6521 sign = '+';
6522 else if (flags & F_BLANK)
6523 sign = ' ';
6524 else
6525 sign = 0;
6527 if (width < len)
6528 width = len;
6529 if (rescnt - (sign != 0) < width) {
6530 reslen -= rescnt;
6531 rescnt = width + fmtcnt + 100;
6532 reslen += rescnt;
6533 if (reslen < 0) {
6534 Py_DECREF(result);
6535 return PyErr_NoMemory();
6537 if (_PyUnicode_Resize(&result, reslen) < 0)
6538 return NULL;
6539 res = PyUnicode_AS_UNICODE(result)
6540 + reslen - rescnt;
6542 if (sign) {
6543 if (fill != ' ')
6544 *res++ = sign;
6545 rescnt--;
6546 if (width > len)
6547 width--;
6549 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6550 assert(pbuf[0] == '0');
6551 assert(pbuf[1] == c);
6552 if (fill != ' ') {
6553 *res++ = *pbuf++;
6554 *res++ = *pbuf++;
6556 rescnt -= 2;
6557 width -= 2;
6558 if (width < 0)
6559 width = 0;
6560 len -= 2;
6562 if (width > len && !(flags & F_LJUST)) {
6563 do {
6564 --rescnt;
6565 *res++ = fill;
6566 } while (--width > len);
6568 if (fill == ' ') {
6569 if (sign)
6570 *res++ = sign;
6571 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6572 assert(pbuf[0] == '0');
6573 assert(pbuf[1] == c);
6574 *res++ = *pbuf++;
6575 *res++ = *pbuf++;
6578 Py_UNICODE_COPY(res, pbuf, len);
6579 res += len;
6580 rescnt -= len;
6581 while (--width >= len) {
6582 --rescnt;
6583 *res++ = ' ';
6585 if (dict && (argidx < arglen) && c != '%') {
6586 PyErr_SetString(PyExc_TypeError,
6587 "not all arguments converted during string formatting");
6588 goto onError;
6590 Py_XDECREF(temp);
6591 } /* '%' */
6592 } /* until end */
6593 if (argidx < arglen && !dict) {
6594 PyErr_SetString(PyExc_TypeError,
6595 "not all arguments converted during string formatting");
6596 goto onError;
6599 if (args_owned) {
6600 Py_DECREF(args);
6602 Py_DECREF(uformat);
6603 if (_PyUnicode_Resize(&result, reslen - rescnt))
6604 goto onError;
6605 return (PyObject *)result;
6607 onError:
6608 Py_XDECREF(result);
6609 Py_DECREF(uformat);
6610 if (args_owned) {
6611 Py_DECREF(args);
6613 return NULL;
/* Buffer-interface slot table for unicode objects; PyUnicode_Type points
   at it through its tp_as_buffer slot.  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer handlers. */
6616 static PyBufferProcs unicode_as_buffer = {
6617 (getreadbufferproc) unicode_buffer_getreadbuf,
6618 (getwritebufferproc) unicode_buffer_getwritebuf,
6619 (getsegcountproc) unicode_buffer_getsegcount,
6620 (getcharbufferproc) unicode_buffer_getcharbuf,
6623 static PyObject *
6624 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6626 static PyObject *
6627 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6629 PyObject *x = NULL;
6630 static char *kwlist[] = {"string", "encoding", "errors", 0};
6631 char *encoding = NULL;
6632 char *errors = NULL;
6634 if (type != &PyUnicode_Type)
6635 return unicode_subtype_new(type, args, kwds);
6636 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6637 kwlist, &x, &encoding, &errors))
6638 return NULL;
6639 if (x == NULL)
6640 return (PyObject *)_PyUnicode_New(0);
6641 if (encoding == NULL && errors == NULL)
6642 return PyObject_Unicode(x);
6643 else
6644 return PyUnicode_FromEncodedObject(x, encoding, errors);
6647 static PyObject *
6648 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6650 PyUnicodeObject *tmp, *pnew;
6651 int n;
6653 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6654 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6655 if (tmp == NULL)
6656 return NULL;
6657 assert(PyUnicode_Check(tmp));
6658 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6659 if (pnew == NULL)
6660 return NULL;
6661 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6662 if (pnew->str == NULL) {
6663 _Py_ForgetReference((PyObject *)pnew);
6664 PyObject_Del(pnew);
6665 return PyErr_NoMemory();
6667 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6668 pnew->length = n;
6669 pnew->hash = tmp->hash;
6670 Py_DECREF(tmp);
6671 return (PyObject *)pnew;
/* Docstring for the unicode type; PyUnicode_Type exposes it through its
   tp_doc slot. */
6674 PyDoc_STRVAR(unicode_doc,
6675 "unicode(string [, encoding[, errors]]) -> object\n\
6677 Create a new Unicode object from the given encoded string.\n\
6678 encoding defaults to the current default string encoding.\n\
6679 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for unicode.  The initializer is positional, so the order
   of the entries must match the slot named in each trailing comment.
   The type derives from PyBaseString_Type (tp_base), may itself be
   subclassed (Py_TPFLAGS_BASETYPE), and creates instances through
   unicode_new (tp_new). */
6681 PyTypeObject PyUnicode_Type = {
6682 PyObject_HEAD_INIT(&PyType_Type)
6683 0, /* ob_size */
6684 "unicode", /* tp_name */
6685 sizeof(PyUnicodeObject), /* tp_basicsize */
6686 0, /* tp_itemsize */
6687 /* Slots */
6688 (destructor)unicode_dealloc, /* tp_dealloc */
6689 0, /* tp_print */
6690 0, /* tp_getattr */
6691 0, /* tp_setattr */
6692 (cmpfunc) unicode_compare, /* tp_compare */
6693 (reprfunc) unicode_repr, /* tp_repr */
6694 &unicode_as_number, /* tp_as_number */
6695 &unicode_as_sequence, /* tp_as_sequence */
6696 &unicode_as_mapping, /* tp_as_mapping */
6697 (hashfunc) unicode_hash, /* tp_hash*/
6698 0, /* tp_call*/
6699 (reprfunc) unicode_str, /* tp_str */
6700 PyObject_GenericGetAttr, /* tp_getattro */
6701 0, /* tp_setattro */
6702 &unicode_as_buffer, /* tp_as_buffer */
6703 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6704 Py_TPFLAGS_BASETYPE, /* tp_flags */
6705 unicode_doc, /* tp_doc */
6706 0, /* tp_traverse */
6707 0, /* tp_clear */
6708 0, /* tp_richcompare */
6709 0, /* tp_weaklistoffset */
6710 0, /* tp_iter */
6711 0, /* tp_iternext */
6712 unicode_methods, /* tp_methods */
6713 0, /* tp_members */
6714 0, /* tp_getset */
6715 &PyBaseString_Type, /* tp_base */
6716 0, /* tp_dict */
6717 0, /* tp_descr_get */
6718 0, /* tp_descr_set */
6719 0, /* tp_dictoffset */
6720 0, /* tp_init */
6721 0, /* tp_alloc */
6722 unicode_new, /* tp_new */
6723 PyObject_Del, /* tp_free */
6726 /* Initialize the Unicode implementation */
6728 void _PyUnicode_Init(void)
6730 int i;
6732 /* Init the implementation */
6733 unicode_freelist = NULL;
6734 unicode_freelist_size = 0;
6735 unicode_empty = _PyUnicode_New(0);
6736 strcpy(unicode_default_encoding, "ascii");
6737 for (i = 0; i < 256; i++)
6738 unicode_latin1[i] = NULL;
6739 if (PyType_Ready(&PyUnicode_Type) < 0)
6740 Py_FatalError("Can't initialize 'unicode'");
6743 /* Finalize the Unicode implementation */
6745 void
6746 _PyUnicode_Fini(void)
6748 PyUnicodeObject *u;
6749 int i;
6751 Py_XDECREF(unicode_empty);
6752 unicode_empty = NULL;
6754 for (i = 0; i < 256; i++) {
6755 if (unicode_latin1[i]) {
6756 Py_DECREF(unicode_latin1[i]);
6757 unicode_latin1[i] = NULL;
6761 for (u = unicode_freelist; u != NULL;) {
6762 PyUnicodeObject *v = u;
6763 u = *(PyUnicodeObject **)u;
6764 if (v->str)
6765 PyMem_DEL(v->str);
6766 Py_XDECREF(v->defenc);
6767 PyObject_Del(v);
6769 unicode_freelist = NULL;
6770 unicode_freelist_size = 0;