Null commit with -f option to force an uprev and put HEADs firmly on the trunk.
[python/dscho.git] / Objects / unicodeobject.c
blob08e80894d841a6e4d13b1d25979423d5fe58c1a8
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WIN32
45 #include <windows.h>
46 #endif
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
75 #else
76 # define BYTEORDER_IS_LITTLE_ENDIAN
77 #endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject *unicode_freelist;
88 static int unicode_freelist_size;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject *unicode_empty;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject *unicode_latin1[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax()
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
137 PyErr_SetString(PyExc_SystemError,
138 "can't resize shared unicode objects");
139 return -1;
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
151 unicode->str[length] = 0;
152 unicode->length = length;
154 reset:
155 /* Reset the object caches */
156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
160 unicode->hash = -1;
162 return 0;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
173 static
174 PyUnicodeObject *_PyUnicode_New(int length)
176 register PyUnicodeObject *unicode;
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
187 unicode_freelist = *(PyUnicodeObject **)unicode;
188 unicode_freelist_size--;
189 if (unicode->str) {
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
193 unicode_resize(unicode, length)) {
194 PyMem_DEL(unicode->str);
195 goto onError;
198 else {
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
201 PyObject_INIT(unicode, &PyUnicode_Type);
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
210 if (!unicode->str) {
211 PyErr_NoMemory();
212 goto onError;
214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
217 unicode->defenc = NULL;
218 return unicode;
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
222 PyObject_DEL(unicode);
223 return NULL;
226 static
227 void _PyUnicode_Free(register PyUnicodeObject *unicode)
229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
232 PyMem_DEL(unicode->str);
233 unicode->str = NULL;
234 unicode->length = 0;
236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
240 /* Add to free list */
241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
245 else {
246 PyMem_DEL(unicode->str);
247 Py_XDECREF(unicode->defenc);
248 PyObject_DEL(unicode);
252 int PyUnicode_Resize(PyObject **unicode,
253 int length)
255 register PyUnicodeObject *v;
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
287 /* Internal API for use in unicodeobject.c only ! */
288 #define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
294 PyUnicodeObject *unicode;
296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
312 if (!unicode)
313 return NULL;
314 unicode->str[0] = *u;
315 unicode_latin1[*u] = unicode;
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
328 Py_UNICODE_COPY(unicode->str, u, size);
330 return (PyObject *)unicode;
333 #ifdef HAVE_WCHAR_H
335 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
338 PyUnicodeObject *unicode;
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
349 /* Copy the wchar_t data into the new object */
350 #ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352 #else
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
360 #endif
362 return (PyObject *)unicode;
365 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375 #ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377 #else
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
385 #endif
387 return size;
390 #endif
392 PyObject *PyUnicode_FromObject(register PyObject *obj)
394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
397 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
401 const char *s;
402 int len;
403 int owned = 0;
404 PyObject *v;
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
426 if (PyUnicode_Check(obj)) {
427 Py_INCREF(obj);
428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
434 goto done;
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
448 goto onError;
451 /* Convert to Unicode */
452 if (len == 0) {
453 Py_INCREF(unicode_empty);
454 v = (PyObject *)unicode_empty;
456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
459 done:
460 if (owned) {
461 Py_DECREF(obj);
463 return v;
465 onError:
466 if (owned) {
467 Py_DECREF(obj);
469 return NULL;
472 PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
477 PyObject *buffer = NULL, *unicode;
479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
484 return PyUnicode_DecodeUTF8(s, size, errors);
485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
499 "decoder did not return an unicode object (type=%.400s)",
500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
504 Py_DECREF(buffer);
505 return unicode;
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
512 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
517 PyObject *v, *unicode;
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
527 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
531 PyObject *v;
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
544 return PyUnicode_AsUTF8String(unicode);
545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
558 "encoder did not return a string object (type=%.400s)",
559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
563 return v;
565 onError:
566 return NULL;
569 /* Return a Python string holding the default encoded value of the
570 Unicode object.
572 The resulting string is cached in the Unicode object for subsequent
573 usage by this function. The cached version is needed to implement
574 the character buffer interface and will live (at least) as long as
575 the Unicode object itself.
577 The refcount of the string is *not* incremented.
579 *** Exported for internal use by the interpreter only !!! ***
583 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
596 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
602 return PyUnicode_AS_UNICODE(unicode);
604 onError:
605 return NULL;
608 int PyUnicode_GetSize(PyObject *unicode)
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
614 return PyUnicode_GET_SIZE(unicode);
616 onError:
617 return -1;
620 const char *PyUnicode_GetDefaultEncoding(void)
622 return unicode_default_encoding;
625 int PyUnicode_SetDefaultEncoding(const char *encoding)
627 PyObject *v;
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
640 onError:
641 return -1;
644 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero marks a byte that cannot start a sequence (continuation bytes
   0x80-0xBF and 0xFE/0xFF).  See RFC 2279 for details.  The table is
   read-only, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences, then illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
668 static
669 int utf8_decoding_error(const char **source,
670 Py_UNICODE **dest,
671 const char *errors,
672 const char *details)
674 if ((errors == NULL) ||
675 (strcmp(errors,"strict") == 0)) {
676 PyErr_Format(PyExc_UnicodeError,
677 "UTF-8 decoding error: %.400s",
678 details);
679 return -1;
681 else if (strcmp(errors,"ignore") == 0) {
682 (*source)++;
683 return 0;
685 else if (strcmp(errors,"replace") == 0) {
686 (*source)++;
687 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
688 (*dest)++;
689 return 0;
691 else {
692 PyErr_Format(PyExc_ValueError,
693 "UTF-8 decoding error; unknown error handling code: %.400s",
694 errors);
695 return -1;
699 PyObject *PyUnicode_DecodeUTF8(const char *s,
700 int size,
701 const char *errors)
703 int n;
704 const char *e;
705 PyUnicodeObject *unicode;
706 Py_UNICODE *p;
707 const char *errmsg = "";
709 /* Note: size will always be longer than the resulting Unicode
710 character count */
711 unicode = _PyUnicode_New(size);
712 if (!unicode)
713 return NULL;
714 if (size == 0)
715 return (PyObject *)unicode;
717 /* Unpack UTF-8 encoded data */
718 p = unicode->str;
719 e = s + size;
721 while (s < e) {
722 Py_UCS4 ch = (unsigned char)*s;
724 if (ch < 0x80) {
725 *p++ = (Py_UNICODE)ch;
726 s++;
727 continue;
730 n = utf8_code_length[ch];
732 if (s + n > e) {
733 errmsg = "unexpected end of data";
734 goto utf8Error;
737 switch (n) {
739 case 0:
740 errmsg = "unexpected code byte";
741 goto utf8Error;
743 case 1:
744 errmsg = "internal error";
745 goto utf8Error;
747 case 2:
748 if ((s[1] & 0xc0) != 0x80) {
749 errmsg = "invalid data";
750 goto utf8Error;
752 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
753 if (ch < 0x80) {
754 errmsg = "illegal encoding";
755 goto utf8Error;
757 else
758 *p++ = (Py_UNICODE)ch;
759 break;
761 case 3:
762 if ((s[1] & 0xc0) != 0x80 ||
763 (s[2] & 0xc0) != 0x80) {
764 errmsg = "invalid data";
765 goto utf8Error;
767 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
768 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
769 errmsg = "illegal encoding";
770 goto utf8Error;
772 else
773 *p++ = (Py_UNICODE)ch;
774 break;
776 case 4:
777 if ((s[1] & 0xc0) != 0x80 ||
778 (s[2] & 0xc0) != 0x80 ||
779 (s[3] & 0xc0) != 0x80) {
780 errmsg = "invalid data";
781 goto utf8Error;
783 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
784 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
785 /* validate and convert to UTF-16 */
786 if ((ch < 0x10000) /* minimum value allowed for 4
787 byte encoding */
788 || (ch > 0x10ffff)) /* maximum value allowed for
789 UTF-16 */
791 errmsg = "illegal encoding";
792 goto utf8Error;
794 #ifdef Py_UNICODE_WIDE
795 *p++ = (Py_UNICODE)ch;
796 #else
797 /* compute and append the two surrogates: */
799 /* translate from 10000..10FFFF to 0..FFFF */
800 ch -= 0x10000;
802 /* high surrogate = top 10 bits added to D800 */
803 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
805 /* low surrogate = bottom 10 bits added to DC00 */
806 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
807 #endif
808 break;
810 default:
811 /* Other sizes are only needed for UCS-4 */
812 errmsg = "unsupported Unicode code range";
813 goto utf8Error;
815 s += n;
816 continue;
818 utf8Error:
819 if (utf8_decoding_error(&s, &p, errors, errmsg))
820 goto onError;
823 /* Adjust length */
824 if (_PyUnicode_Resize(&unicode, p - unicode->str))
825 goto onError;
827 return (PyObject *)unicode;
829 onError:
830 Py_DECREF(unicode);
831 return NULL;
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept compiled out for reference.

   NULL or "strict": raise UnicodeError with details, return -1.
   "ignore":         do nothing, return 0.
   "replace":        append '?' to the output, return 0.
   anything else:    raise ValueError, return -1. */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        **dest = '?';
        (*dest)++;
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
868 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
869 int size,
870 const char *errors)
872 PyObject *v;
873 char *p;
874 char *q;
875 Py_UCS4 ch2;
876 unsigned int cbAllocated = 3 * size;
877 unsigned int cbWritten = 0;
878 int i = 0;
880 v = PyString_FromStringAndSize(NULL, cbAllocated);
881 if (v == NULL)
882 return NULL;
883 if (size == 0)
884 return v;
886 p = q = PyString_AS_STRING(v);
887 while (i < size) {
888 Py_UCS4 ch = s[i++];
889 if (ch < 0x80) {
890 *p++ = (char) ch;
891 cbWritten++;
893 else if (ch < 0x0800) {
894 *p++ = 0xc0 | (ch >> 6);
895 *p++ = 0x80 | (ch & 0x3f);
896 cbWritten += 2;
898 else if (ch < 0x10000) {
899 /* Check for high surrogate */
900 if (0xD800 <= ch && ch <= 0xDBFF) {
901 if (i != size) {
902 ch2 = s[i];
903 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
905 if (cbWritten >= (cbAllocated - 4)) {
906 /* Provide enough room for some more
907 surrogates */
908 cbAllocated += 4*10;
909 if (_PyString_Resize(&v, cbAllocated))
910 goto onError;
913 /* combine the two values */
914 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
916 *p++ = (char)((ch >> 18) | 0xf0);
917 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
918 i++;
919 cbWritten += 4;
923 else {
924 *p++ = (char)(0xe0 | (ch >> 12));
925 cbWritten += 3;
927 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
928 *p++ = (char)(0x80 | (ch & 0x3f));
929 } else {
930 *p++ = 0xf0 | (ch>>18);
931 *p++ = 0x80 | ((ch>>12) & 0x3f);
932 *p++ = 0x80 | ((ch>>6) & 0x3f);
933 *p++ = 0x80 | (ch & 0x3f);
934 cbWritten += 4;
937 *p = '\0';
938 if (_PyString_Resize(&v, p - q))
939 goto onError;
940 return v;
942 onError:
943 Py_DECREF(v);
944 return NULL;
947 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
949 if (!PyUnicode_Check(unicode)) {
950 PyErr_BadArgument();
951 return NULL;
953 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
954 PyUnicode_GET_SIZE(unicode),
955 NULL);
958 /* --- UTF-16 Codec ------------------------------------------------------- */
960 static
961 int utf16_decoding_error(const Py_UCS2 **source,
962 Py_UNICODE **dest,
963 const char *errors,
964 const char *details)
966 if ((errors == NULL) ||
967 (strcmp(errors,"strict") == 0)) {
968 PyErr_Format(PyExc_UnicodeError,
969 "UTF-16 decoding error: %.400s",
970 details);
971 return -1;
973 else if (strcmp(errors,"ignore") == 0) {
974 return 0;
976 else if (strcmp(errors,"replace") == 0) {
977 if (dest) {
978 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
979 (*dest)++;
981 return 0;
983 else {
984 PyErr_Format(PyExc_ValueError,
985 "UTF-16 decoding error; "
986 "unknown error handling code: %.400s",
987 errors);
988 return -1;
992 PyObject *PyUnicode_DecodeUTF16(const char *s,
993 int size,
994 const char *errors,
995 int *byteorder)
997 PyUnicodeObject *unicode;
998 Py_UNICODE *p;
999 const Py_UCS2 *q, *e;
1000 int bo = 0;
1001 const char *errmsg = "";
1003 /* size should be an even number */
1004 if (size % sizeof(Py_UCS2) != 0) {
1005 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
1006 return NULL;
1007 /* The remaining input chars are ignored if we fall through
1008 here... */
1011 /* Note: size will always be longer than the resulting Unicode
1012 character count */
1013 unicode = _PyUnicode_New(size);
1014 if (!unicode)
1015 return NULL;
1016 if (size == 0)
1017 return (PyObject *)unicode;
1019 /* Unpack UTF-16 encoded data */
1020 p = unicode->str;
1021 q = (Py_UCS2 *)s;
1022 e = q + (size / sizeof(Py_UCS2));
1024 if (byteorder)
1025 bo = *byteorder;
1027 /* Check for BOM marks (U+FEFF) in the input and adjust current
1028 byte order setting accordingly. In native mode, the leading BOM
1029 mark is skipped, in all other modes, it is copied to the output
1030 stream as-is (giving a ZWNBSP character). */
1031 if (bo == 0) {
1032 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1033 if (*q == 0xFEFF) {
1034 q++;
1035 bo = -1;
1036 } else if (*q == 0xFFFE) {
1037 q++;
1038 bo = 1;
1040 #else
1041 if (*q == 0xFEFF) {
1042 q++;
1043 bo = 1;
1044 } else if (*q == 0xFFFE) {
1045 q++;
1046 bo = -1;
1048 #endif
1051 while (q < e) {
1052 register Py_UCS2 ch = *q++;
1054 /* Swap input bytes if needed. (This assumes
1055 sizeof(Py_UNICODE) == 2 !) */
1056 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1057 if (bo == 1)
1058 ch = (ch >> 8) | (ch << 8);
1059 #else
1060 if (bo == -1)
1061 ch = (ch >> 8) | (ch << 8);
1062 #endif
1063 if (ch < 0xD800 || ch > 0xDFFF) {
1064 *p++ = ch;
1065 continue;
1068 /* UTF-16 code pair: */
1069 if (q >= e) {
1070 errmsg = "unexpected end of data";
1071 goto utf16Error;
1073 if (0xD800 <= ch && ch <= 0xDBFF) {
1074 Py_UCS2 ch2 = *q++;
1075 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1076 if (bo == 1)
1077 ch2 = (ch2 >> 8) | (ch2 << 8);
1078 #else
1079 if (bo == -1)
1080 ch2 = (ch2 >> 8) | (ch2 << 8);
1081 #endif
1082 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1083 #ifndef Py_UNICODE_WIDE
1084 /* This is valid data (a UTF-16 surrogate pair), but
1085 we are not able to store this information since our
1086 Py_UNICODE type only has 16 bits... this might
1087 change someday, even though it's unlikely. */
1088 errmsg = "code pairs are not supported";
1089 goto utf16Error;
1090 #else
1091 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1092 continue;
1093 #endif
1096 else {
1097 errmsg = "illegal UTF-16 surrogate";
1098 goto utf16Error;
1102 errmsg = "illegal encoding";
1103 /* Fall through to report the error */
1105 utf16Error:
1106 if (utf16_decoding_error(&q, &p, errors, errmsg))
1107 goto onError;
1110 if (byteorder)
1111 *byteorder = bo;
1113 /* Adjust length */
1114 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1115 goto onError;
1117 return (PyObject *)unicode;
1119 onError:
1120 Py_DECREF(unicode);
1121 return NULL;
1124 #undef UTF16_ERROR
1126 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1127 int size,
1128 const char *errors,
1129 int byteorder)
1131 PyObject *v;
1132 Py_UCS2 *p;
1133 char *q;
1134 int i, pairs, doswap = 1;
1136 for (i = pairs = 0; i < size; i++)
1137 if (s[i] >= 0x10000)
1138 pairs++;
1139 v = PyString_FromStringAndSize(NULL,
1140 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
1141 if (v == NULL)
1142 return NULL;
1144 q = PyString_AS_STRING(v);
1145 p = (Py_UCS2 *)q;
1146 if (byteorder == 0)
1147 *p++ = 0xFEFF;
1148 if (size == 0)
1149 return v;
1150 if (byteorder == 0 ||
1151 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1152 byteorder == -1
1153 #else
1154 byteorder == 1
1155 #endif
1157 doswap = 0;
1158 while (size-- > 0) {
1159 Py_UNICODE ch = *s++;
1160 Py_UNICODE ch2 = 0;
1161 if (ch >= 0x10000) {
1162 ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1163 ch = 0xD800|((ch-0x10000)>>10);
1165 if (doswap){
1166 *p++ = (ch >> 8) | (ch << 8);
1167 if (ch2)
1168 *p++ = (ch2 >> 8) | (ch2 << 8);
1169 }else{
1170 *p++ = ch;
1171 if(ch2)
1172 *p++ = ch2;
1175 return v;
1178 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1180 if (!PyUnicode_Check(unicode)) {
1181 PyErr_BadArgument();
1182 return NULL;
1184 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1185 PyUnicode_GET_SIZE(unicode),
1186 NULL,
1190 /* --- Unicode Escape Codec ----------------------------------------------- */
1192 static
1193 int unicodeescape_decoding_error(const char **source,
1194 Py_UNICODE *x,
1195 const char *errors,
1196 const char *details)
1198 if ((errors == NULL) ||
1199 (strcmp(errors,"strict") == 0)) {
1200 PyErr_Format(PyExc_UnicodeError,
1201 "Unicode-Escape decoding error: %.400s",
1202 details);
1203 return -1;
1205 else if (strcmp(errors,"ignore") == 0) {
1206 return 0;
1208 else if (strcmp(errors,"replace") == 0) {
1209 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1210 return 0;
1212 else {
1213 PyErr_Format(PyExc_ValueError,
1214 "Unicode-Escape decoding error; "
1215 "unknown error handling code: %.400s",
1216 errors);
1217 return -1;
1221 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1223 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1224 int size,
1225 const char *errors)
1227 PyUnicodeObject *v;
1228 Py_UNICODE *p, *buf;
1229 const char *end;
1230 char* message;
1231 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1233 /* Escaped strings will always be longer than the resulting
1234 Unicode string, so we start with size here and then reduce the
1235 length after conversion to the true value. */
1236 v = _PyUnicode_New(size);
1237 if (v == NULL)
1238 goto onError;
1239 if (size == 0)
1240 return (PyObject *)v;
1242 p = buf = PyUnicode_AS_UNICODE(v);
1243 end = s + size;
1245 while (s < end) {
1246 unsigned char c;
1247 Py_UNICODE x;
1248 int i, digits;
1250 /* Non-escape characters are interpreted as Unicode ordinals */
1251 if (*s != '\\') {
1252 *p++ = (unsigned char) *s++;
1253 continue;
1256 /* \ - Escapes */
1257 s++;
1258 switch (*s++) {
1260 /* \x escapes */
1261 case '\n': break;
1262 case '\\': *p++ = '\\'; break;
1263 case '\'': *p++ = '\''; break;
1264 case '\"': *p++ = '\"'; break;
1265 case 'b': *p++ = '\b'; break;
1266 case 'f': *p++ = '\014'; break; /* FF */
1267 case 't': *p++ = '\t'; break;
1268 case 'n': *p++ = '\n'; break;
1269 case 'r': *p++ = '\r'; break;
1270 case 'v': *p++ = '\013'; break; /* VT */
1271 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1273 /* \OOO (octal) escapes */
1274 case '0': case '1': case '2': case '3':
1275 case '4': case '5': case '6': case '7':
1276 x = s[-1] - '0';
1277 if ('0' <= *s && *s <= '7') {
1278 x = (x<<3) + *s++ - '0';
1279 if ('0' <= *s && *s <= '7')
1280 x = (x<<3) + *s++ - '0';
1282 *p++ = x;
1283 break;
1285 /* hex escapes */
1286 /* \xXX */
1287 case 'x':
1288 digits = 2;
1289 message = "truncated \\xXX escape";
1290 goto hexescape;
1292 /* \uXXXX */
1293 case 'u':
1294 digits = 4;
1295 message = "truncated \\uXXXX escape";
1296 goto hexescape;
1298 /* \UXXXXXXXX */
1299 case 'U':
1300 digits = 8;
1301 message = "truncated \\UXXXXXXXX escape";
1302 hexescape:
1303 chr = 0;
1304 for (i = 0; i < digits; i++) {
1305 c = (unsigned char) s[i];
1306 if (!isxdigit(c)) {
1307 if (unicodeescape_decoding_error(&s, &x, errors, message))
1308 goto onError;
1309 chr = x;
1310 i++;
1311 break;
1313 chr = (chr<<4) & ~0xF;
1314 if (c >= '0' && c <= '9')
1315 chr += c - '0';
1316 else if (c >= 'a' && c <= 'f')
1317 chr += 10 + c - 'a';
1318 else
1319 chr += 10 + c - 'A';
1321 s += i;
1322 store:
1323 /* when we get here, chr is a 32-bit unicode character */
1324 if (chr <= 0xffff)
1325 /* UCS-2 character */
1326 *p++ = (Py_UNICODE) chr;
1327 else if (chr <= 0x10ffff) {
1328 /* UCS-4 character. Either store directly, or as surrogate pair. */
1329 #ifdef Py_UNICODE_WIDE
1330 *p++ = chr;
1331 #else
1332 chr -= 0x10000L;
1333 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1334 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1335 #endif
1336 } else {
1337 if (unicodeescape_decoding_error(
1338 &s, &x, errors,
1339 "illegal Unicode character")
1341 goto onError;
1342 *p++ = x; /* store replacement character */
1344 break;
1346 /* \N{name} */
1347 case 'N':
1348 message = "malformed \\N character escape";
1349 if (ucnhash_CAPI == NULL) {
1350 /* load the unicode data module */
1351 PyObject *m, *v;
1352 m = PyImport_ImportModule("unicodedata");
1353 if (m == NULL)
1354 goto ucnhashError;
1355 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1356 Py_DECREF(m);
1357 if (v == NULL)
1358 goto ucnhashError;
1359 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1360 Py_DECREF(v);
1361 if (ucnhash_CAPI == NULL)
1362 goto ucnhashError;
1364 if (*s == '{') {
1365 const char *start = s+1;
1366 /* look for the closing brace */
1367 while (*s != '}' && s < end)
1368 s++;
1369 if (s > start && s < end && *s == '}') {
1370 /* found a name. look it up in the unicode database */
1371 message = "unknown Unicode character name";
1372 s++;
1373 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1374 goto store;
1377 if (unicodeescape_decoding_error(&s, &x, errors, message))
1378 goto onError;
1379 *p++ = x;
1380 break;
1382 default:
1383 *p++ = '\\';
1384 *p++ = (unsigned char)s[-1];
1385 break;
1388 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1389 goto onError;
1390 return (PyObject *)v;
1392 ucnhashError:
1393 PyErr_SetString(
1394 PyExc_UnicodeError,
1395 "\\N escapes not supported (can't load unicodedata module)"
1397 return NULL;
1399 onError:
1400 Py_XDECREF(v);
1401 return NULL;
1404 /* Return a Unicode-Escape string version of the Unicode object.
1406 If quotes is true, the string is enclosed in u"" or u'' quotes as
1407 appropriate.
1411 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1412 int size,
1413 Py_UNICODE ch);
1415 static
1416 PyObject *unicodeescape_string(const Py_UNICODE *s,
1417 int size,
1418 int quotes)
1420 PyObject *repr;
1421 char *p;
1422 char *q;
1424 static const char *hexdigit = "0123456789abcdef";
1426 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1427 if (repr == NULL)
1428 return NULL;
1430 p = q = PyString_AS_STRING(repr);
1432 if (quotes) {
1433 *p++ = 'u';
1434 *p++ = (findchar(s, size, '\'') &&
1435 !findchar(s, size, '"')) ? '"' : '\'';
1437 while (size-- > 0) {
1438 Py_UNICODE ch = *s++;
1439 /* Escape quotes */
1440 if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
1441 *p++ = '\\';
1442 *p++ = (char) ch;
1444 /* Map 21-bit characters to '\U00xxxxxx' */
1445 else if (ch >= 0x10000) {
1446 *p++ = '\\';
1447 *p++ = 'U';
1448 *p++ = hexdigit[(ch >> 28) & 0xf];
1449 *p++ = hexdigit[(ch >> 24) & 0xf];
1450 *p++ = hexdigit[(ch >> 20) & 0xf];
1451 *p++ = hexdigit[(ch >> 16) & 0xf];
1452 *p++ = hexdigit[(ch >> 12) & 0xf];
1453 *p++ = hexdigit[(ch >> 8) & 0xf];
1454 *p++ = hexdigit[(ch >> 4) & 0xf];
1455 *p++ = hexdigit[ch & 15];
1457 /* Map 16-bit characters to '\uxxxx' */
1458 else if (ch >= 256) {
1459 *p++ = '\\';
1460 *p++ = 'u';
1461 *p++ = hexdigit[(ch >> 12) & 0xf];
1462 *p++ = hexdigit[(ch >> 8) & 0xf];
1463 *p++ = hexdigit[(ch >> 4) & 0xf];
1464 *p++ = hexdigit[ch & 15];
1466 /* Map special whitespace to '\t', \n', '\r' */
1467 else if (ch == '\t') {
1468 *p++ = '\\';
1469 *p++ = 't';
1471 else if (ch == '\n') {
1472 *p++ = '\\';
1473 *p++ = 'n';
1475 else if (ch == '\r') {
1476 *p++ = '\\';
1477 *p++ = 'r';
1479 /* Map non-printable US ASCII to '\xhh' */
1480 else if (ch < ' ' || ch >= 128) {
1481 *p++ = '\\';
1482 *p++ = 'x';
1483 *p++ = hexdigit[(ch >> 4) & 0xf];
1484 *p++ = hexdigit[ch & 15];
1486 /* Copy everything else as-is */
1487 else
1488 *p++ = (char) ch;
1490 if (quotes)
1491 *p++ = q[1];
1493 *p = '\0';
1494 if (_PyString_Resize(&repr, p - q))
1495 goto onError;
1497 return repr;
1499 onError:
1500 Py_DECREF(repr);
1501 return NULL;
1504 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1505 int size)
1507 return unicodeescape_string(s, size, 0);
1510 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1512 if (!PyUnicode_Check(unicode)) {
1513 PyErr_BadArgument();
1514 return NULL;
1516 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1517 PyUnicode_GET_SIZE(unicode));
1520 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1522 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1523 int size,
1524 const char *errors)
1526 PyUnicodeObject *v;
1527 Py_UNICODE *p, *buf;
1528 const char *end;
1529 const char *bs;
1531 /* Escaped strings will always be longer than the resulting
1532 Unicode string, so we start with size here and then reduce the
1533 length after conversion to the true value. */
1534 v = _PyUnicode_New(size);
1535 if (v == NULL)
1536 goto onError;
1537 if (size == 0)
1538 return (PyObject *)v;
1539 p = buf = PyUnicode_AS_UNICODE(v);
1540 end = s + size;
1541 while (s < end) {
1542 unsigned char c;
1543 Py_UNICODE x;
1544 int i;
1546 /* Non-escape characters are interpreted as Unicode ordinals */
1547 if (*s != '\\') {
1548 *p++ = (unsigned char)*s++;
1549 continue;
1552 /* \u-escapes are only interpreted iff the number of leading
1553 backslashes if odd */
1554 bs = s;
1555 for (;s < end;) {
1556 if (*s != '\\')
1557 break;
1558 *p++ = (unsigned char)*s++;
1560 if (((s - bs) & 1) == 0 ||
1561 s >= end ||
1562 *s != 'u') {
1563 continue;
1565 p--;
1566 s++;
1568 /* \uXXXX with 4 hex digits */
1569 for (x = 0, i = 0; i < 4; i++) {
1570 c = (unsigned char)s[i];
1571 if (!isxdigit(c)) {
1572 if (unicodeescape_decoding_error(&s, &x, errors,
1573 "truncated \\uXXXX"))
1574 goto onError;
1575 i++;
1576 break;
1578 x = (x<<4) & ~0xF;
1579 if (c >= '0' && c <= '9')
1580 x += c - '0';
1581 else if (c >= 'a' && c <= 'f')
1582 x += 10 + c - 'a';
1583 else
1584 x += 10 + c - 'A';
1586 s += i;
1587 *p++ = x;
1589 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1590 goto onError;
1591 return (PyObject *)v;
1593 onError:
1594 Py_XDECREF(v);
1595 return NULL;
1598 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1599 int size)
1601 PyObject *repr;
1602 char *p;
1603 char *q;
1605 static const char *hexdigit = "0123456789abcdef";
1607 repr = PyString_FromStringAndSize(NULL, 6 * size);
1608 if (repr == NULL)
1609 return NULL;
1610 if (size == 0)
1611 return repr;
1613 p = q = PyString_AS_STRING(repr);
1614 while (size-- > 0) {
1615 Py_UNICODE ch = *s++;
1616 /* Map 16-bit characters to '\uxxxx' */
1617 if (ch >= 256) {
1618 *p++ = '\\';
1619 *p++ = 'u';
1620 *p++ = hexdigit[(ch >> 12) & 0xf];
1621 *p++ = hexdigit[(ch >> 8) & 0xf];
1622 *p++ = hexdigit[(ch >> 4) & 0xf];
1623 *p++ = hexdigit[ch & 15];
1625 /* Copy everything else as-is */
1626 else
1627 *p++ = (char) ch;
1629 *p = '\0';
1630 if (_PyString_Resize(&repr, p - q))
1631 goto onError;
1633 return repr;
1635 onError:
1636 Py_DECREF(repr);
1637 return NULL;
1640 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1642 if (!PyUnicode_Check(unicode)) {
1643 PyErr_BadArgument();
1644 return NULL;
1646 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1647 PyUnicode_GET_SIZE(unicode));
1650 /* --- Latin-1 Codec ------------------------------------------------------ */
1652 PyObject *PyUnicode_DecodeLatin1(const char *s,
1653 int size,
1654 const char *errors)
1656 PyUnicodeObject *v;
1657 Py_UNICODE *p;
1659 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1660 if (size == 1 && *(unsigned char*)s < 256) {
1661 Py_UNICODE r = *(unsigned char*)s;
1662 return PyUnicode_FromUnicode(&r, 1);
1665 v = _PyUnicode_New(size);
1666 if (v == NULL)
1667 goto onError;
1668 if (size == 0)
1669 return (PyObject *)v;
1670 p = PyUnicode_AS_UNICODE(v);
1671 while (size-- > 0)
1672 *p++ = (unsigned char)*s++;
1673 return (PyObject *)v;
1675 onError:
1676 Py_XDECREF(v);
1677 return NULL;
1680 static
1681 int latin1_encoding_error(const Py_UNICODE **source,
1682 char **dest,
1683 const char *errors,
1684 const char *details)
1686 if ((errors == NULL) ||
1687 (strcmp(errors,"strict") == 0)) {
1688 PyErr_Format(PyExc_UnicodeError,
1689 "Latin-1 encoding error: %.400s",
1690 details);
1691 return -1;
1693 else if (strcmp(errors,"ignore") == 0) {
1694 return 0;
1696 else if (strcmp(errors,"replace") == 0) {
1697 **dest = '?';
1698 (*dest)++;
1699 return 0;
1701 else {
1702 PyErr_Format(PyExc_ValueError,
1703 "Latin-1 encoding error; "
1704 "unknown error handling code: %.400s",
1705 errors);
1706 return -1;
1710 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1711 int size,
1712 const char *errors)
1714 PyObject *repr;
1715 char *s, *start;
1717 repr = PyString_FromStringAndSize(NULL, size);
1718 if (repr == NULL)
1719 return NULL;
1720 if (size == 0)
1721 return repr;
1723 s = PyString_AS_STRING(repr);
1724 start = s;
1725 while (size-- > 0) {
1726 Py_UNICODE ch = *p++;
1727 if (ch >= 256) {
1728 if (latin1_encoding_error(&p, &s, errors,
1729 "ordinal not in range(256)"))
1730 goto onError;
1732 else
1733 *s++ = (char)ch;
1735 /* Resize if error handling skipped some characters */
1736 if (s - start < PyString_GET_SIZE(repr))
1737 if (_PyString_Resize(&repr, s - start))
1738 goto onError;
1739 return repr;
1741 onError:
1742 Py_DECREF(repr);
1743 return NULL;
1746 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1748 if (!PyUnicode_Check(unicode)) {
1749 PyErr_BadArgument();
1750 return NULL;
1752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1753 PyUnicode_GET_SIZE(unicode),
1754 NULL);
1757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1759 static
1760 int ascii_decoding_error(const char **source,
1761 Py_UNICODE **dest,
1762 const char *errors,
1763 const char *details)
1765 if ((errors == NULL) ||
1766 (strcmp(errors,"strict") == 0)) {
1767 PyErr_Format(PyExc_UnicodeError,
1768 "ASCII decoding error: %.400s",
1769 details);
1770 return -1;
1772 else if (strcmp(errors,"ignore") == 0) {
1773 return 0;
1775 else if (strcmp(errors,"replace") == 0) {
1776 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1777 (*dest)++;
1778 return 0;
1780 else {
1781 PyErr_Format(PyExc_ValueError,
1782 "ASCII decoding error; "
1783 "unknown error handling code: %.400s",
1784 errors);
1785 return -1;
1789 PyObject *PyUnicode_DecodeASCII(const char *s,
1790 int size,
1791 const char *errors)
1793 PyUnicodeObject *v;
1794 Py_UNICODE *p;
1796 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1797 if (size == 1 && *(unsigned char*)s < 128) {
1798 Py_UNICODE r = *(unsigned char*)s;
1799 return PyUnicode_FromUnicode(&r, 1);
1802 v = _PyUnicode_New(size);
1803 if (v == NULL)
1804 goto onError;
1805 if (size == 0)
1806 return (PyObject *)v;
1807 p = PyUnicode_AS_UNICODE(v);
1808 while (size-- > 0) {
1809 register unsigned char c;
1811 c = (unsigned char)*s++;
1812 if (c < 128)
1813 *p++ = c;
1814 else if (ascii_decoding_error(&s, &p, errors,
1815 "ordinal not in range(128)"))
1816 goto onError;
1818 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1819 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1820 goto onError;
1821 return (PyObject *)v;
1823 onError:
1824 Py_XDECREF(v);
1825 return NULL;
1828 static
1829 int ascii_encoding_error(const Py_UNICODE **source,
1830 char **dest,
1831 const char *errors,
1832 const char *details)
1834 if ((errors == NULL) ||
1835 (strcmp(errors,"strict") == 0)) {
1836 PyErr_Format(PyExc_UnicodeError,
1837 "ASCII encoding error: %.400s",
1838 details);
1839 return -1;
1841 else if (strcmp(errors,"ignore") == 0) {
1842 return 0;
1844 else if (strcmp(errors,"replace") == 0) {
1845 **dest = '?';
1846 (*dest)++;
1847 return 0;
1849 else {
1850 PyErr_Format(PyExc_ValueError,
1851 "ASCII encoding error; "
1852 "unknown error handling code: %.400s",
1853 errors);
1854 return -1;
1858 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1859 int size,
1860 const char *errors)
1862 PyObject *repr;
1863 char *s, *start;
1865 repr = PyString_FromStringAndSize(NULL, size);
1866 if (repr == NULL)
1867 return NULL;
1868 if (size == 0)
1869 return repr;
1871 s = PyString_AS_STRING(repr);
1872 start = s;
1873 while (size-- > 0) {
1874 Py_UNICODE ch = *p++;
1875 if (ch >= 128) {
1876 if (ascii_encoding_error(&p, &s, errors,
1877 "ordinal not in range(128)"))
1878 goto onError;
1880 else
1881 *s++ = (char)ch;
1883 /* Resize if error handling skipped some characters */
1884 if (s - start < PyString_GET_SIZE(repr))
1885 if (_PyString_Resize(&repr, s - start))
1886 goto onError;
1887 return repr;
1889 onError:
1890 Py_DECREF(repr);
1891 return NULL;
1894 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1896 if (!PyUnicode_Check(unicode)) {
1897 PyErr_BadArgument();
1898 return NULL;
1900 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1901 PyUnicode_GET_SIZE(unicode),
1902 NULL);
1905 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
1907 /* --- MBCS codecs for Windows -------------------------------------------- */
1909 PyObject *PyUnicode_DecodeMBCS(const char *s,
1910 int size,
1911 const char *errors)
1913 PyUnicodeObject *v;
1914 Py_UNICODE *p;
1916 /* First get the size of the result */
1917 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1918 if (size > 0 && usize==0)
1919 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1921 v = _PyUnicode_New(usize);
1922 if (v == NULL)
1923 return NULL;
1924 if (usize == 0)
1925 return (PyObject *)v;
1926 p = PyUnicode_AS_UNICODE(v);
1927 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1928 Py_DECREF(v);
1929 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1932 return (PyObject *)v;
1935 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1936 int size,
1937 const char *errors)
1939 PyObject *repr;
1940 char *s;
1941 DWORD mbcssize;
1943 /* If there are no characters, bail now! */
1944 if (size==0)
1945 return PyString_FromString("");
1947 /* First get the size of the result */
1948 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1949 if (mbcssize==0)
1950 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1952 repr = PyString_FromStringAndSize(NULL, mbcssize);
1953 if (repr == NULL)
1954 return NULL;
1955 if (mbcssize == 0)
1956 return repr;
1958 /* Do the conversion */
1959 s = PyString_AS_STRING(repr);
1960 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1961 Py_DECREF(repr);
1962 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1964 return repr;
1967 #endif /* MS_WIN32 */
1969 /* --- Character Mapping Codec -------------------------------------------- */
1971 static
1972 int charmap_decoding_error(const char **source,
1973 Py_UNICODE **dest,
1974 const char *errors,
1975 const char *details)
1977 if ((errors == NULL) ||
1978 (strcmp(errors,"strict") == 0)) {
1979 PyErr_Format(PyExc_UnicodeError,
1980 "charmap decoding error: %.400s",
1981 details);
1982 return -1;
1984 else if (strcmp(errors,"ignore") == 0) {
1985 return 0;
1987 else if (strcmp(errors,"replace") == 0) {
1988 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1989 (*dest)++;
1990 return 0;
1992 else {
1993 PyErr_Format(PyExc_ValueError,
1994 "charmap decoding error; "
1995 "unknown error handling code: %.400s",
1996 errors);
1997 return -1;
2001 PyObject *PyUnicode_DecodeCharmap(const char *s,
2002 int size,
2003 PyObject *mapping,
2004 const char *errors)
2006 PyUnicodeObject *v;
2007 Py_UNICODE *p;
2008 int extrachars = 0;
2010 /* Default to Latin-1 */
2011 if (mapping == NULL)
2012 return PyUnicode_DecodeLatin1(s, size, errors);
2014 v = _PyUnicode_New(size);
2015 if (v == NULL)
2016 goto onError;
2017 if (size == 0)
2018 return (PyObject *)v;
2019 p = PyUnicode_AS_UNICODE(v);
2020 while (size-- > 0) {
2021 unsigned char ch = *s++;
2022 PyObject *w, *x;
2024 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2025 w = PyInt_FromLong((long)ch);
2026 if (w == NULL)
2027 goto onError;
2028 x = PyObject_GetItem(mapping, w);
2029 Py_DECREF(w);
2030 if (x == NULL) {
2031 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2032 /* No mapping found means: mapping is undefined. */
2033 PyErr_Clear();
2034 x = Py_None;
2035 Py_INCREF(x);
2036 } else
2037 goto onError;
2040 /* Apply mapping */
2041 if (PyInt_Check(x)) {
2042 long value = PyInt_AS_LONG(x);
2043 if (value < 0 || value > 65535) {
2044 PyErr_SetString(PyExc_TypeError,
2045 "character mapping must be in range(65536)");
2046 Py_DECREF(x);
2047 goto onError;
2049 *p++ = (Py_UNICODE)value;
2051 else if (x == Py_None) {
2052 /* undefined mapping */
2053 if (charmap_decoding_error(&s, &p, errors,
2054 "character maps to <undefined>")) {
2055 Py_DECREF(x);
2056 goto onError;
2059 else if (PyUnicode_Check(x)) {
2060 int targetsize = PyUnicode_GET_SIZE(x);
2062 if (targetsize == 1)
2063 /* 1-1 mapping */
2064 *p++ = *PyUnicode_AS_UNICODE(x);
2066 else if (targetsize > 1) {
2067 /* 1-n mapping */
2068 if (targetsize > extrachars) {
2069 /* resize first */
2070 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2071 int needed = (targetsize - extrachars) + \
2072 (targetsize << 2);
2073 extrachars += needed;
2074 if (_PyUnicode_Resize(&v,
2075 PyUnicode_GET_SIZE(v) + needed)) {
2076 Py_DECREF(x);
2077 goto onError;
2079 p = PyUnicode_AS_UNICODE(v) + oldpos;
2081 Py_UNICODE_COPY(p,
2082 PyUnicode_AS_UNICODE(x),
2083 targetsize);
2084 p += targetsize;
2085 extrachars -= targetsize;
2087 /* 1-0 mapping: skip the character */
2089 else {
2090 /* wrong return value */
2091 PyErr_SetString(PyExc_TypeError,
2092 "character mapping must return integer, None or unicode");
2093 Py_DECREF(x);
2094 goto onError;
2096 Py_DECREF(x);
2098 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2099 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2100 goto onError;
2101 return (PyObject *)v;
2103 onError:
2104 Py_XDECREF(v);
2105 return NULL;
2108 static
2109 int charmap_encoding_error(const Py_UNICODE **source,
2110 char **dest,
2111 const char *errors,
2112 const char *details)
2114 if ((errors == NULL) ||
2115 (strcmp(errors,"strict") == 0)) {
2116 PyErr_Format(PyExc_UnicodeError,
2117 "charmap encoding error: %.400s",
2118 details);
2119 return -1;
2121 else if (strcmp(errors,"ignore") == 0) {
2122 return 0;
2124 else if (strcmp(errors,"replace") == 0) {
2125 **dest = '?';
2126 (*dest)++;
2127 return 0;
2129 else {
2130 PyErr_Format(PyExc_ValueError,
2131 "charmap encoding error; "
2132 "unknown error handling code: %.400s",
2133 errors);
2134 return -1;
2138 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2139 int size,
2140 PyObject *mapping,
2141 const char *errors)
2143 PyObject *v;
2144 char *s;
2145 int extrachars = 0;
2147 /* Default to Latin-1 */
2148 if (mapping == NULL)
2149 return PyUnicode_EncodeLatin1(p, size, errors);
2151 v = PyString_FromStringAndSize(NULL, size);
2152 if (v == NULL)
2153 return NULL;
2154 if (size == 0)
2155 return v;
2156 s = PyString_AS_STRING(v);
2157 while (size-- > 0) {
2158 Py_UNICODE ch = *p++;
2159 PyObject *w, *x;
2161 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2162 w = PyInt_FromLong((long)ch);
2163 if (w == NULL)
2164 goto onError;
2165 x = PyObject_GetItem(mapping, w);
2166 Py_DECREF(w);
2167 if (x == NULL) {
2168 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2169 /* No mapping found means: mapping is undefined. */
2170 PyErr_Clear();
2171 x = Py_None;
2172 Py_INCREF(x);
2173 } else
2174 goto onError;
2177 /* Apply mapping */
2178 if (PyInt_Check(x)) {
2179 long value = PyInt_AS_LONG(x);
2180 if (value < 0 || value > 255) {
2181 PyErr_SetString(PyExc_TypeError,
2182 "character mapping must be in range(256)");
2183 Py_DECREF(x);
2184 goto onError;
2186 *s++ = (char)value;
2188 else if (x == Py_None) {
2189 /* undefined mapping */
2190 if (charmap_encoding_error(&p, &s, errors,
2191 "character maps to <undefined>")) {
2192 Py_DECREF(x);
2193 goto onError;
2196 else if (PyString_Check(x)) {
2197 int targetsize = PyString_GET_SIZE(x);
2199 if (targetsize == 1)
2200 /* 1-1 mapping */
2201 *s++ = *PyString_AS_STRING(x);
2203 else if (targetsize > 1) {
2204 /* 1-n mapping */
2205 if (targetsize > extrachars) {
2206 /* resize first */
2207 int oldpos = (int)(s - PyString_AS_STRING(v));
2208 int needed = (targetsize - extrachars) + \
2209 (targetsize << 2);
2210 extrachars += needed;
2211 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2212 Py_DECREF(x);
2213 goto onError;
2215 s = PyString_AS_STRING(v) + oldpos;
2217 memcpy(s, PyString_AS_STRING(x), targetsize);
2218 s += targetsize;
2219 extrachars -= targetsize;
2221 /* 1-0 mapping: skip the character */
2223 else {
2224 /* wrong return value */
2225 PyErr_SetString(PyExc_TypeError,
2226 "character mapping must return integer, None or unicode");
2227 Py_DECREF(x);
2228 goto onError;
2230 Py_DECREF(x);
2232 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2233 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2234 goto onError;
2235 return v;
2237 onError:
2238 Py_DECREF(v);
2239 return NULL;
2242 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2243 PyObject *mapping)
2245 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2246 PyErr_BadArgument();
2247 return NULL;
2249 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2250 PyUnicode_GET_SIZE(unicode),
2251 mapping,
2252 NULL);
2255 static
2256 int translate_error(const Py_UNICODE **source,
2257 Py_UNICODE **dest,
2258 const char *errors,
2259 const char *details)
2261 if ((errors == NULL) ||
2262 (strcmp(errors,"strict") == 0)) {
2263 PyErr_Format(PyExc_UnicodeError,
2264 "translate error: %.400s",
2265 details);
2266 return -1;
2268 else if (strcmp(errors,"ignore") == 0) {
2269 return 0;
2271 else if (strcmp(errors,"replace") == 0) {
2272 **dest = '?';
2273 (*dest)++;
2274 return 0;
2276 else {
2277 PyErr_Format(PyExc_ValueError,
2278 "translate error; "
2279 "unknown error handling code: %.400s",
2280 errors);
2281 return -1;
2285 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2286 int size,
2287 PyObject *mapping,
2288 const char *errors)
2290 PyUnicodeObject *v;
2291 Py_UNICODE *p;
2293 if (mapping == NULL) {
2294 PyErr_BadArgument();
2295 return NULL;
2298 /* Output will never be longer than input */
2299 v = _PyUnicode_New(size);
2300 if (v == NULL)
2301 goto onError;
2302 if (size == 0)
2303 goto done;
2304 p = PyUnicode_AS_UNICODE(v);
2305 while (size-- > 0) {
2306 Py_UNICODE ch = *s++;
2307 PyObject *w, *x;
2309 /* Get mapping */
2310 w = PyInt_FromLong(ch);
2311 if (w == NULL)
2312 goto onError;
2313 x = PyObject_GetItem(mapping, w);
2314 Py_DECREF(w);
2315 if (x == NULL) {
2316 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2317 /* No mapping found: default to 1-1 mapping */
2318 PyErr_Clear();
2319 *p++ = ch;
2320 continue;
2322 goto onError;
2325 /* Apply mapping */
2326 if (PyInt_Check(x))
2327 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2328 else if (x == Py_None) {
2329 /* undefined mapping */
2330 if (translate_error(&s, &p, errors,
2331 "character maps to <undefined>")) {
2332 Py_DECREF(x);
2333 goto onError;
2336 else if (PyUnicode_Check(x)) {
2337 if (PyUnicode_GET_SIZE(x) != 1) {
2338 /* 1-n mapping */
2339 PyErr_SetString(PyExc_NotImplementedError,
2340 "1-n mappings are currently not implemented");
2341 Py_DECREF(x);
2342 goto onError;
2344 *p++ = *PyUnicode_AS_UNICODE(x);
2346 else {
2347 /* wrong return value */
2348 PyErr_SetString(PyExc_TypeError,
2349 "translate mapping must return integer, None or unicode");
2350 Py_DECREF(x);
2351 goto onError;
2353 Py_DECREF(x);
2355 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2356 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2357 goto onError;
2359 done:
2360 return (PyObject *)v;
2362 onError:
2363 Py_XDECREF(v);
2364 return NULL;
2367 PyObject *PyUnicode_Translate(PyObject *str,
2368 PyObject *mapping,
2369 const char *errors)
2371 PyObject *result;
2373 str = PyUnicode_FromObject(str);
2374 if (str == NULL)
2375 goto onError;
2376 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2377 PyUnicode_GET_SIZE(str),
2378 mapping,
2379 errors);
2380 Py_DECREF(str);
2381 return result;
2383 onError:
2384 Py_XDECREF(str);
2385 return NULL;
2388 /* --- Decimal Encoder ---------------------------------------------------- */
2390 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2391 int length,
2392 char *output,
2393 const char *errors)
2395 Py_UNICODE *p, *end;
2397 if (output == NULL) {
2398 PyErr_BadArgument();
2399 return -1;
2402 p = s;
2403 end = s + length;
2404 while (p < end) {
2405 register Py_UNICODE ch = *p++;
2406 int decimal;
2408 if (Py_UNICODE_ISSPACE(ch)) {
2409 *output++ = ' ';
2410 continue;
2412 decimal = Py_UNICODE_TODECIMAL(ch);
2413 if (decimal >= 0) {
2414 *output++ = '0' + decimal;
2415 continue;
2417 if (0 < ch && ch < 256) {
2418 *output++ = (char)ch;
2419 continue;
2421 /* All other characters are considered invalid */
2422 if (errors == NULL || strcmp(errors, "strict") == 0) {
2423 PyErr_SetString(PyExc_ValueError,
2424 "invalid decimal Unicode string");
2425 goto onError;
2427 else if (strcmp(errors, "ignore") == 0)
2428 continue;
2429 else if (strcmp(errors, "replace") == 0) {
2430 *output++ = '?';
2431 continue;
2434 /* 0-terminate the output string */
2435 *output++ = '\0';
2436 return 0;
2438 onError:
2439 return -1;
2442 /* --- Helpers ------------------------------------------------------------ */
2444 static
2445 int count(PyUnicodeObject *self,
2446 int start,
2447 int end,
2448 PyUnicodeObject *substring)
2450 int count = 0;
2452 if (start < 0)
2453 start += self->length;
2454 if (start < 0)
2455 start = 0;
2456 if (end > self->length)
2457 end = self->length;
2458 if (end < 0)
2459 end += self->length;
2460 if (end < 0)
2461 end = 0;
2463 if (substring->length == 0)
2464 return (end - start + 1);
2466 end -= substring->length;
2468 while (start <= end)
2469 if (Py_UNICODE_MATCH(self, start, substring)) {
2470 count++;
2471 start += substring->length;
2472 } else
2473 start++;
2475 return count;
2478 int PyUnicode_Count(PyObject *str,
2479 PyObject *substr,
2480 int start,
2481 int end)
2483 int result;
2485 str = PyUnicode_FromObject(str);
2486 if (str == NULL)
2487 return -1;
2488 substr = PyUnicode_FromObject(substr);
2489 if (substr == NULL) {
2490 Py_DECREF(str);
2491 return -1;
2494 result = count((PyUnicodeObject *)str,
2495 start, end,
2496 (PyUnicodeObject *)substr);
2498 Py_DECREF(str);
2499 Py_DECREF(substr);
2500 return result;
2503 static
2504 int findstring(PyUnicodeObject *self,
2505 PyUnicodeObject *substring,
2506 int start,
2507 int end,
2508 int direction)
2510 if (start < 0)
2511 start += self->length;
2512 if (start < 0)
2513 start = 0;
2515 if (substring->length == 0)
2516 return start;
2518 if (end > self->length)
2519 end = self->length;
2520 if (end < 0)
2521 end += self->length;
2522 if (end < 0)
2523 end = 0;
2525 end -= substring->length;
2527 if (direction < 0) {
2528 for (; end >= start; end--)
2529 if (Py_UNICODE_MATCH(self, end, substring))
2530 return end;
2531 } else {
2532 for (; start <= end; start++)
2533 if (Py_UNICODE_MATCH(self, start, substring))
2534 return start;
2537 return -1;
2540 int PyUnicode_Find(PyObject *str,
2541 PyObject *substr,
2542 int start,
2543 int end,
2544 int direction)
2546 int result;
2548 str = PyUnicode_FromObject(str);
2549 if (str == NULL)
2550 return -1;
2551 substr = PyUnicode_FromObject(substr);
2552 if (substr == NULL) {
2553 Py_DECREF(substr);
2554 return -1;
2557 result = findstring((PyUnicodeObject *)str,
2558 (PyUnicodeObject *)substr,
2559 start, end, direction);
2560 Py_DECREF(str);
2561 Py_DECREF(substr);
2562 return result;
2565 static
2566 int tailmatch(PyUnicodeObject *self,
2567 PyUnicodeObject *substring,
2568 int start,
2569 int end,
2570 int direction)
2572 if (start < 0)
2573 start += self->length;
2574 if (start < 0)
2575 start = 0;
2577 if (substring->length == 0)
2578 return 1;
2580 if (end > self->length)
2581 end = self->length;
2582 if (end < 0)
2583 end += self->length;
2584 if (end < 0)
2585 end = 0;
2587 end -= substring->length;
2588 if (end < start)
2589 return 0;
2591 if (direction > 0) {
2592 if (Py_UNICODE_MATCH(self, end, substring))
2593 return 1;
2594 } else {
2595 if (Py_UNICODE_MATCH(self, start, substring))
2596 return 1;
2599 return 0;
2602 int PyUnicode_Tailmatch(PyObject *str,
2603 PyObject *substr,
2604 int start,
2605 int end,
2606 int direction)
2608 int result;
2610 str = PyUnicode_FromObject(str);
2611 if (str == NULL)
2612 return -1;
2613 substr = PyUnicode_FromObject(substr);
2614 if (substr == NULL) {
2615 Py_DECREF(substr);
2616 return -1;
2619 result = tailmatch((PyUnicodeObject *)str,
2620 (PyUnicodeObject *)substr,
2621 start, end, direction);
2622 Py_DECREF(str);
2623 Py_DECREF(substr);
2624 return result;
2627 static
2628 const Py_UNICODE *findchar(const Py_UNICODE *s,
2629 int size,
2630 Py_UNICODE ch)
2632 /* like wcschr, but doesn't stop at NULL characters */
2634 while (size-- > 0) {
2635 if (*s == ch)
2636 return s;
2637 s++;
2640 return NULL;
2643 /* Apply fixfct filter to the Unicode object self and return a
2644 reference to the modified object */
2646 static
2647 PyObject *fixup(PyUnicodeObject *self,
2648 int (*fixfct)(PyUnicodeObject *s))
2651 PyUnicodeObject *u;
2653 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2654 if (u == NULL)
2655 return NULL;
2657 Py_UNICODE_COPY(u->str, self->str, self->length);
2659 if (!fixfct(u)) {
2660 /* fixfct should return TRUE if it modified the buffer. If
2661 FALSE, return a reference to the original buffer instead
2662 (to save space, not time) */
2663 Py_INCREF(self);
2664 Py_DECREF(u);
2665 return (PyObject*) self;
2667 return (PyObject*) u;
2670 static
2671 int fixupper(PyUnicodeObject *self)
2673 int len = self->length;
2674 Py_UNICODE *s = self->str;
2675 int status = 0;
2677 while (len-- > 0) {
2678 register Py_UNICODE ch;
2680 ch = Py_UNICODE_TOUPPER(*s);
2681 if (ch != *s) {
2682 status = 1;
2683 *s = ch;
2685 s++;
2688 return status;
2691 static
2692 int fixlower(PyUnicodeObject *self)
2694 int len = self->length;
2695 Py_UNICODE *s = self->str;
2696 int status = 0;
2698 while (len-- > 0) {
2699 register Py_UNICODE ch;
2701 ch = Py_UNICODE_TOLOWER(*s);
2702 if (ch != *s) {
2703 status = 1;
2704 *s = ch;
2706 s++;
2709 return status;
2712 static
2713 int fixswapcase(PyUnicodeObject *self)
2715 int len = self->length;
2716 Py_UNICODE *s = self->str;
2717 int status = 0;
2719 while (len-- > 0) {
2720 if (Py_UNICODE_ISUPPER(*s)) {
2721 *s = Py_UNICODE_TOLOWER(*s);
2722 status = 1;
2723 } else if (Py_UNICODE_ISLOWER(*s)) {
2724 *s = Py_UNICODE_TOUPPER(*s);
2725 status = 1;
2727 s++;
2730 return status;
2733 static
2734 int fixcapitalize(PyUnicodeObject *self)
2736 int len = self->length;
2737 Py_UNICODE *s = self->str;
2738 int status = 0;
2740 if (len == 0)
2741 return 0;
2742 if (Py_UNICODE_ISLOWER(*s)) {
2743 *s = Py_UNICODE_TOUPPER(*s);
2744 status = 1;
2746 s++;
2747 while (--len > 0) {
2748 if (Py_UNICODE_ISUPPER(*s)) {
2749 *s = Py_UNICODE_TOLOWER(*s);
2750 status = 1;
2752 s++;
2754 return status;
2757 static
2758 int fixtitle(PyUnicodeObject *self)
2760 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2761 register Py_UNICODE *e;
2762 int previous_is_cased;
2764 /* Shortcut for single character strings */
2765 if (PyUnicode_GET_SIZE(self) == 1) {
2766 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2767 if (*p != ch) {
2768 *p = ch;
2769 return 1;
2771 else
2772 return 0;
2775 e = p + PyUnicode_GET_SIZE(self);
2776 previous_is_cased = 0;
2777 for (; p < e; p++) {
2778 register const Py_UNICODE ch = *p;
2780 if (previous_is_cased)
2781 *p = Py_UNICODE_TOLOWER(ch);
2782 else
2783 *p = Py_UNICODE_TOTITLE(ch);
2785 if (Py_UNICODE_ISLOWER(ch) ||
2786 Py_UNICODE_ISUPPER(ch) ||
2787 Py_UNICODE_ISTITLE(ch))
2788 previous_is_cased = 1;
2789 else
2790 previous_is_cased = 0;
2792 return 1;
2795 PyObject *PyUnicode_Join(PyObject *separator,
2796 PyObject *seq)
2798 Py_UNICODE *sep;
2799 int seplen;
2800 PyUnicodeObject *res = NULL;
2801 int reslen = 0;
2802 Py_UNICODE *p;
2803 int sz = 100;
2804 int i;
2805 PyObject *it;
2807 it = PyObject_GetIter(seq);
2808 if (it == NULL)
2809 return NULL;
2811 if (separator == NULL) {
2812 Py_UNICODE blank = ' ';
2813 sep = &blank;
2814 seplen = 1;
2816 else {
2817 separator = PyUnicode_FromObject(separator);
2818 if (separator == NULL)
2819 goto onError;
2820 sep = PyUnicode_AS_UNICODE(separator);
2821 seplen = PyUnicode_GET_SIZE(separator);
2824 res = _PyUnicode_New(sz);
2825 if (res == NULL)
2826 goto onError;
2827 p = PyUnicode_AS_UNICODE(res);
2828 reslen = 0;
2830 for (i = 0; ; ++i) {
2831 int itemlen;
2832 PyObject *item = PyIter_Next(it);
2833 if (item == NULL) {
2834 if (PyErr_Occurred())
2835 goto onError;
2836 break;
2838 if (!PyUnicode_Check(item)) {
2839 PyObject *v;
2840 v = PyUnicode_FromObject(item);
2841 Py_DECREF(item);
2842 item = v;
2843 if (item == NULL)
2844 goto onError;
2846 itemlen = PyUnicode_GET_SIZE(item);
2847 while (reslen + itemlen + seplen >= sz) {
2848 if (_PyUnicode_Resize(&res, sz*2))
2849 goto onError;
2850 sz *= 2;
2851 p = PyUnicode_AS_UNICODE(res) + reslen;
2853 if (i > 0) {
2854 Py_UNICODE_COPY(p, sep, seplen);
2855 p += seplen;
2856 reslen += seplen;
2858 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
2859 p += itemlen;
2860 reslen += itemlen;
2861 Py_DECREF(item);
2863 if (_PyUnicode_Resize(&res, reslen))
2864 goto onError;
2866 Py_XDECREF(separator);
2867 Py_DECREF(it);
2868 return (PyObject *)res;
2870 onError:
2871 Py_XDECREF(separator);
2872 Py_XDECREF(res);
2873 Py_DECREF(it);
2874 return NULL;
2877 static
2878 PyUnicodeObject *pad(PyUnicodeObject *self,
2879 int left,
2880 int right,
2881 Py_UNICODE fill)
2883 PyUnicodeObject *u;
2885 if (left < 0)
2886 left = 0;
2887 if (right < 0)
2888 right = 0;
2890 if (left == 0 && right == 0) {
2891 Py_INCREF(self);
2892 return self;
2895 u = _PyUnicode_New(left + self->length + right);
2896 if (u) {
2897 if (left)
2898 Py_UNICODE_FILL(u->str, fill, left);
2899 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2900 if (right)
2901 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2904 return u;
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch variable,
   a `PyObject *list` and an `onError` label, which is jumped to when the
   slice cannot be created or appended.

   Arguments are parenthesized and the body is wrapped in do/while(0) so
   the macro behaves like a single statement at every call site. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2918 static
2919 PyObject *split_whitespace(PyUnicodeObject *self,
2920 PyObject *list,
2921 int maxcount)
2923 register int i;
2924 register int j;
2925 int len = self->length;
2926 PyObject *str;
2928 for (i = j = 0; i < len; ) {
2929 /* find a token */
2930 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2931 i++;
2932 j = i;
2933 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2934 i++;
2935 if (j < i) {
2936 if (maxcount-- <= 0)
2937 break;
2938 SPLIT_APPEND(self->str, j, i);
2939 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2940 i++;
2941 j = i;
2944 if (j < len) {
2945 SPLIT_APPEND(self->str, j, len);
2947 return list;
2949 onError:
2950 Py_DECREF(list);
2951 return NULL;
2954 PyObject *PyUnicode_Splitlines(PyObject *string,
2955 int keepends)
2957 register int i;
2958 register int j;
2959 int len;
2960 PyObject *list;
2961 PyObject *str;
2962 Py_UNICODE *data;
2964 string = PyUnicode_FromObject(string);
2965 if (string == NULL)
2966 return NULL;
2967 data = PyUnicode_AS_UNICODE(string);
2968 len = PyUnicode_GET_SIZE(string);
2970 list = PyList_New(0);
2971 if (!list)
2972 goto onError;
2974 for (i = j = 0; i < len; ) {
2975 int eol;
2977 /* Find a line and append it */
2978 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2979 i++;
2981 /* Skip the line break reading CRLF as one line break */
2982 eol = i;
2983 if (i < len) {
2984 if (data[i] == '\r' && i + 1 < len &&
2985 data[i+1] == '\n')
2986 i += 2;
2987 else
2988 i++;
2989 if (keepends)
2990 eol = i;
2992 SPLIT_APPEND(data, j, eol);
2993 j = i;
2995 if (j < len) {
2996 SPLIT_APPEND(data, j, len);
2999 Py_DECREF(string);
3000 return list;
3002 onError:
3003 Py_DECREF(list);
3004 Py_DECREF(string);
3005 return NULL;
3008 static
3009 PyObject *split_char(PyUnicodeObject *self,
3010 PyObject *list,
3011 Py_UNICODE ch,
3012 int maxcount)
3014 register int i;
3015 register int j;
3016 int len = self->length;
3017 PyObject *str;
3019 for (i = j = 0; i < len; ) {
3020 if (self->str[i] == ch) {
3021 if (maxcount-- <= 0)
3022 break;
3023 SPLIT_APPEND(self->str, j, i);
3024 i = j = i + 1;
3025 } else
3026 i++;
3028 if (j <= len) {
3029 SPLIT_APPEND(self->str, j, len);
3031 return list;
3033 onError:
3034 Py_DECREF(list);
3035 return NULL;
3038 static
3039 PyObject *split_substring(PyUnicodeObject *self,
3040 PyObject *list,
3041 PyUnicodeObject *substring,
3042 int maxcount)
3044 register int i;
3045 register int j;
3046 int len = self->length;
3047 int sublen = substring->length;
3048 PyObject *str;
3050 for (i = j = 0; i <= len - sublen; ) {
3051 if (Py_UNICODE_MATCH(self, i, substring)) {
3052 if (maxcount-- <= 0)
3053 break;
3054 SPLIT_APPEND(self->str, j, i);
3055 i = j = i + sublen;
3056 } else
3057 i++;
3059 if (j <= len) {
3060 SPLIT_APPEND(self->str, j, len);
3062 return list;
3064 onError:
3065 Py_DECREF(list);
3066 return NULL;
3069 #undef SPLIT_APPEND
3071 static
3072 PyObject *split(PyUnicodeObject *self,
3073 PyUnicodeObject *substring,
3074 int maxcount)
3076 PyObject *list;
3078 if (maxcount < 0)
3079 maxcount = INT_MAX;
3081 list = PyList_New(0);
3082 if (!list)
3083 return NULL;
3085 if (substring == NULL)
3086 return split_whitespace(self,list,maxcount);
3088 else if (substring->length == 1)
3089 return split_char(self,list,substring->str[0],maxcount);
3091 else if (substring->length == 0) {
3092 Py_DECREF(list);
3093 PyErr_SetString(PyExc_ValueError, "empty separator");
3094 return NULL;
3096 else
3097 return split_substring(self,list,substring,maxcount);
3100 static
3101 PyObject *strip(PyUnicodeObject *self,
3102 int left,
3103 int right)
3105 Py_UNICODE *p = self->str;
3106 int start = 0;
3107 int end = self->length;
3109 if (left)
3110 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3111 start++;
3113 if (right)
3114 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3115 end--;
3117 if (start == 0 && end == self->length) {
3118 /* couldn't strip anything off, return original string */
3119 Py_INCREF(self);
3120 return (PyObject*) self;
3123 return (PyObject*) PyUnicode_FromUnicode(
3124 self->str + start,
3125 end - start
3129 static
3130 PyObject *replace(PyUnicodeObject *self,
3131 PyUnicodeObject *str1,
3132 PyUnicodeObject *str2,
3133 int maxcount)
3135 PyUnicodeObject *u;
3137 if (maxcount < 0)
3138 maxcount = INT_MAX;
3140 if (str1->length == 1 && str2->length == 1) {
3141 int i;
3143 /* replace characters */
3144 if (!findchar(self->str, self->length, str1->str[0])) {
3145 /* nothing to replace, return original string */
3146 Py_INCREF(self);
3147 u = self;
3148 } else {
3149 Py_UNICODE u1 = str1->str[0];
3150 Py_UNICODE u2 = str2->str[0];
3152 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3153 NULL,
3154 self->length
3156 if (u != NULL) {
3157 Py_UNICODE_COPY(u->str, self->str,
3158 self->length);
3159 for (i = 0; i < u->length; i++)
3160 if (u->str[i] == u1) {
3161 if (--maxcount < 0)
3162 break;
3163 u->str[i] = u2;
3168 } else {
3169 int n, i;
3170 Py_UNICODE *p;
3172 /* replace strings */
3173 n = count(self, 0, self->length, str1);
3174 if (n > maxcount)
3175 n = maxcount;
3176 if (n == 0) {
3177 /* nothing to replace, return original string */
3178 Py_INCREF(self);
3179 u = self;
3180 } else {
3181 u = _PyUnicode_New(
3182 self->length + n * (str2->length - str1->length));
3183 if (u) {
3184 i = 0;
3185 p = u->str;
3186 while (i <= self->length - str1->length)
3187 if (Py_UNICODE_MATCH(self, i, str1)) {
3188 /* replace string segment */
3189 Py_UNICODE_COPY(p, str2->str, str2->length);
3190 p += str2->length;
3191 i += str1->length;
3192 if (--n <= 0) {
3193 /* copy remaining part */
3194 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3195 break;
3197 } else
3198 *p++ = self->str[i++];
3203 return (PyObject *) u;
3206 /* --- Unicode Object Methods --------------------------------------------- */
3208 static char title__doc__[] =
3209 "S.title() -> unicode\n\
3211 Return a titlecased version of S, i.e. words start with title case\n\
3212 characters, all remaining cased characters have lower case.";
3214 static PyObject*
3215 unicode_title(PyUnicodeObject *self, PyObject *args)
3217 if (!PyArg_NoArgs(args))
3218 return NULL;
3219 return fixup(self, fixtitle);
3222 static char capitalize__doc__[] =
3223 "S.capitalize() -> unicode\n\
3225 Return a capitalized version of S, i.e. make the first character\n\
3226 have upper case.";
3228 static PyObject*
3229 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3231 if (!PyArg_NoArgs(args))
3232 return NULL;
3233 return fixup(self, fixcapitalize);
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method: split on whitespace, capitalize every word and join
   the words again with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *item;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                     fixcapitalize);
        if (item == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

 done:
    /* item is NULL here when capitalizing or joining failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3277 static char center__doc__[] =
3278 "S.center(width) -> unicode\n\
3280 Return S centered in a Unicode string of length width. Padding is done\n\
3281 using spaces.";
3283 static PyObject *
3284 unicode_center(PyUnicodeObject *self, PyObject *args)
3286 int marg, left;
3287 int width;
3289 if (!PyArg_ParseTuple(args, "i:center", &width))
3290 return NULL;
3292 if (self->length >= width) {
3293 Py_INCREF(self);
3294 return (PyObject*) self;
3297 marg = width - self->length;
3298 left = marg / 2 + (marg & width & 1);
3300 return (PyObject*) pad(self, left, marg - left, ' ');
3303 #if 0
3305 /* This code should go into some future Unicode collation support
3306 module. The basic comparison should compare ordinals on a naive
3307 basis (this is what Java does and thus JPython too). */
3309 /* speedy UTF-16 code point order comparison */
3310 /* gleaned from: */
3311 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3313 static short utf16Fixup[32] =
3315 0, 0, 0, 0, 0, 0, 0, 0,
3316 0, 0, 0, 0, 0, 0, 0, 0,
3317 0, 0, 0, 0, 0, 0, 0, 0,
3318 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3321 static int
3322 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3324 int len1, len2;
3326 Py_UNICODE *s1 = str1->str;
3327 Py_UNICODE *s2 = str2->str;
3329 len1 = str1->length;
3330 len2 = str2->length;
3332 while (len1 > 0 && len2 > 0) {
3333 Py_UNICODE c1, c2;
3335 c1 = *s1++;
3336 c2 = *s2++;
3338 if (c1 > (1<<11) * 26)
3339 c1 += utf16Fixup[c1>>11];
3340 if (c2 > (1<<11) * 26)
3341 c2 += utf16Fixup[c2>>11];
3342 /* now c1 and c2 are in UTF-32-compatible order */
3344 if (c1 != c2)
3345 return (c1 < c2) ? -1 : 1;
3347 len1--; len2--;
3350 return (len1 < len2) ? -1 : (len1 != len2);
3353 #else
3355 static int
3356 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3358 register int len1, len2;
3360 Py_UNICODE *s1 = str1->str;
3361 Py_UNICODE *s2 = str2->str;
3363 len1 = str1->length;
3364 len2 = str2->length;
3366 while (len1 > 0 && len2 > 0) {
3367 Py_UNICODE c1, c2;
3369 c1 = *s1++;
3370 c2 = *s2++;
3372 if (c1 != c2)
3373 return (c1 < c2) ? -1 : 1;
3375 len1--; len2--;
3378 return (len1 < len2) ? -1 : (len1 != len2);
3381 #endif
3383 int PyUnicode_Compare(PyObject *left,
3384 PyObject *right)
3386 PyUnicodeObject *u = NULL, *v = NULL;
3387 int result;
3389 /* Coerce the two arguments */
3390 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3391 if (u == NULL)
3392 goto onError;
3393 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3394 if (v == NULL)
3395 goto onError;
3397 /* Shortcut for empty or interned objects */
3398 if (v == u) {
3399 Py_DECREF(u);
3400 Py_DECREF(v);
3401 return 0;
3404 result = unicode_compare(u, v);
3406 Py_DECREF(u);
3407 Py_DECREF(v);
3408 return result;
3410 onError:
3411 Py_XDECREF(u);
3412 Py_XDECREF(v);
3413 return -1;
3416 int PyUnicode_Contains(PyObject *container,
3417 PyObject *element)
3419 PyUnicodeObject *u = NULL, *v = NULL;
3420 int result;
3421 register const Py_UNICODE *p, *e;
3422 register Py_UNICODE ch;
3424 /* Coerce the two arguments */
3425 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3426 if (v == NULL) {
3427 PyErr_SetString(PyExc_TypeError,
3428 "'in <string>' requires character as left operand");
3429 goto onError;
3431 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3432 if (u == NULL) {
3433 Py_DECREF(v);
3434 goto onError;
3437 /* Check v in u */
3438 if (PyUnicode_GET_SIZE(v) != 1) {
3439 PyErr_SetString(PyExc_TypeError,
3440 "'in <string>' requires character as left operand");
3441 goto onError;
3443 ch = *PyUnicode_AS_UNICODE(v);
3444 p = PyUnicode_AS_UNICODE(u);
3445 e = p + PyUnicode_GET_SIZE(u);
3446 result = 0;
3447 while (p < e) {
3448 if (*p++ == ch) {
3449 result = 1;
3450 break;
3454 Py_DECREF(u);
3455 Py_DECREF(v);
3456 return result;
3458 onError:
3459 Py_XDECREF(u);
3460 Py_XDECREF(v);
3461 return -1;
3464 /* Concat to string or Unicode object giving a new Unicode object. */
3466 PyObject *PyUnicode_Concat(PyObject *left,
3467 PyObject *right)
3469 PyUnicodeObject *u = NULL, *v = NULL, *w;
3471 /* Coerce the two arguments */
3472 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3473 if (u == NULL)
3474 goto onError;
3475 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3476 if (v == NULL)
3477 goto onError;
3479 /* Shortcuts */
3480 if (v == unicode_empty) {
3481 Py_DECREF(v);
3482 return (PyObject *)u;
3484 if (u == unicode_empty) {
3485 Py_DECREF(u);
3486 return (PyObject *)v;
3489 /* Concat the two Unicode strings */
3490 w = _PyUnicode_New(u->length + v->length);
3491 if (w == NULL)
3492 goto onError;
3493 Py_UNICODE_COPY(w->str, u->str, u->length);
3494 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3496 Py_DECREF(u);
3497 Py_DECREF(v);
3498 return (PyObject *)w;
3500 onError:
3501 Py_XDECREF(u);
3502 Py_XDECREF(v);
3503 return NULL;
3506 static char count__doc__[] =
3507 "S.count(sub[, start[, end]]) -> int\n\
3509 Return the number of occurrences of substring sub in Unicode string\n\
3510 S[start:end]. Optional arguments start and end are\n\
3511 interpreted as in slice notation.";
3513 static PyObject *
3514 unicode_count(PyUnicodeObject *self, PyObject *args)
3516 PyUnicodeObject *substring;
3517 int start = 0;
3518 int end = INT_MAX;
3519 PyObject *result;
3521 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3522 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3523 return NULL;
3525 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3526 (PyObject *)substring);
3527 if (substring == NULL)
3528 return NULL;
3530 if (start < 0)
3531 start += self->length;
3532 if (start < 0)
3533 start = 0;
3534 if (end > self->length)
3535 end = self->length;
3536 if (end < 0)
3537 end += self->length;
3538 if (end < 0)
3539 end = 0;
3541 result = PyInt_FromLong((long) count(self, start, end, substring));
3543 Py_DECREF(substring);
3544 return result;
3547 static char encode__doc__[] =
3548 "S.encode([encoding[,errors]]) -> string\n\
3550 Return an encoded string version of S. Default encoding is the current\n\
3551 default string encoding. errors may be given to set a different error\n\
3552 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3553 a ValueError. Other possible values are 'ignore' and 'replace'.";
3555 static PyObject *
3556 unicode_encode(PyUnicodeObject *self, PyObject *args)
3558 char *encoding = NULL;
3559 char *errors = NULL;
3560 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3561 return NULL;
3562 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3565 static char expandtabs__doc__[] =
3566 "S.expandtabs([tabsize]) -> unicode\n\
3568 Return a copy of S where all tab characters are expanded using spaces.\n\
3569 If tabsize is not given, a tab size of 8 characters is assumed.";
3571 static PyObject*
3572 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3574 Py_UNICODE *e;
3575 Py_UNICODE *p;
3576 Py_UNICODE *q;
3577 int i, j;
3578 PyUnicodeObject *u;
3579 int tabsize = 8;
3581 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3582 return NULL;
3584 /* First pass: determine size of output string */
3585 i = j = 0;
3586 e = self->str + self->length;
3587 for (p = self->str; p < e; p++)
3588 if (*p == '\t') {
3589 if (tabsize > 0)
3590 j += tabsize - (j % tabsize);
3592 else {
3593 j++;
3594 if (*p == '\n' || *p == '\r') {
3595 i += j;
3596 j = 0;
3600 /* Second pass: create output string and fill it */
3601 u = _PyUnicode_New(i + j);
3602 if (!u)
3603 return NULL;
3605 j = 0;
3606 q = u->str;
3608 for (p = self->str; p < e; p++)
3609 if (*p == '\t') {
3610 if (tabsize > 0) {
3611 i = tabsize - (j % tabsize);
3612 j += i;
3613 while (i--)
3614 *q++ = ' ';
3617 else {
3618 j++;
3619 *q++ = *p;
3620 if (*p == '\n' || *p == '\r')
3621 j = 0;
3624 return (PyObject*) u;
3627 static char find__doc__[] =
3628 "S.find(sub [,start [,end]]) -> int\n\
3630 Return the lowest index in S where substring sub is found,\n\
3631 such that sub is contained within s[start,end]. Optional\n\
3632 arguments start and end are interpreted as in slice notation.\n\
3634 Return -1 on failure.";
3636 static PyObject *
3637 unicode_find(PyUnicodeObject *self, PyObject *args)
3639 PyUnicodeObject *substring;
3640 int start = 0;
3641 int end = INT_MAX;
3642 PyObject *result;
3644 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3645 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3646 return NULL;
3647 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3648 (PyObject *)substring);
3649 if (substring == NULL)
3650 return NULL;
3652 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3654 Py_DECREF(substring);
3655 return result;
3658 static PyObject *
3659 unicode_getitem(PyUnicodeObject *self, int index)
3661 if (index < 0 || index >= self->length) {
3662 PyErr_SetString(PyExc_IndexError, "string index out of range");
3663 return NULL;
3666 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3669 static long
3670 unicode_hash(PyUnicodeObject *self)
3672 /* Since Unicode objects compare equal to their ASCII string
3673 counterparts, they should use the individual character values
3674 as basis for their hash value. This is needed to assure that
3675 strings and Unicode objects behave in the same way as
3676 dictionary keys. */
3678 register int len;
3679 register Py_UNICODE *p;
3680 register long x;
3682 if (self->hash != -1)
3683 return self->hash;
3684 len = PyUnicode_GET_SIZE(self);
3685 p = PyUnicode_AS_UNICODE(self);
3686 x = *p << 7;
3687 while (--len >= 0)
3688 x = (1000003*x) ^ *p++;
3689 x ^= PyUnicode_GET_SIZE(self);
3690 if (x == -1)
3691 x = -2;
3692 self->hash = x;
3693 return x;
3696 static char index__doc__[] =
3697 "S.index(sub [,start [,end]]) -> int\n\
3699 Like S.find() but raise ValueError when the substring is not found.";
3701 static PyObject *
3702 unicode_index(PyUnicodeObject *self, PyObject *args)
3704 int result;
3705 PyUnicodeObject *substring;
3706 int start = 0;
3707 int end = INT_MAX;
3709 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3710 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3711 return NULL;
3713 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3714 (PyObject *)substring);
3715 if (substring == NULL)
3716 return NULL;
3718 result = findstring(self, substring, start, end, 1);
3720 Py_DECREF(substring);
3721 if (result < 0) {
3722 PyErr_SetString(PyExc_ValueError, "substring not found");
3723 return NULL;
3725 return PyInt_FromLong(result);
3728 static char islower__doc__[] =
3729 "S.islower() -> int\n\
3731 Return 1 if all cased characters in S are lowercase and there is\n\
3732 at least one cased character in S, 0 otherwise.";
3734 static PyObject*
3735 unicode_islower(PyUnicodeObject *self, PyObject *args)
3737 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3738 register const Py_UNICODE *e;
3739 int cased;
3741 if (!PyArg_NoArgs(args))
3742 return NULL;
3744 /* Shortcut for single character strings */
3745 if (PyUnicode_GET_SIZE(self) == 1)
3746 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3748 /* Special case for empty strings */
3749 if (PyString_GET_SIZE(self) == 0)
3750 return PyInt_FromLong(0);
3752 e = p + PyUnicode_GET_SIZE(self);
3753 cased = 0;
3754 for (; p < e; p++) {
3755 register const Py_UNICODE ch = *p;
3757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3758 return PyInt_FromLong(0);
3759 else if (!cased && Py_UNICODE_ISLOWER(ch))
3760 cased = 1;
3762 return PyInt_FromLong(cased);
3765 static char isupper__doc__[] =
3766 "S.isupper() -> int\n\
3768 Return 1 if all cased characters in S are uppercase and there is\n\
3769 at least one cased character in S, 0 otherwise.";
3771 static PyObject*
3772 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3774 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3775 register const Py_UNICODE *e;
3776 int cased;
3778 if (!PyArg_NoArgs(args))
3779 return NULL;
3781 /* Shortcut for single character strings */
3782 if (PyUnicode_GET_SIZE(self) == 1)
3783 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3785 /* Special case for empty strings */
3786 if (PyString_GET_SIZE(self) == 0)
3787 return PyInt_FromLong(0);
3789 e = p + PyUnicode_GET_SIZE(self);
3790 cased = 0;
3791 for (; p < e; p++) {
3792 register const Py_UNICODE ch = *p;
3794 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3795 return PyInt_FromLong(0);
3796 else if (!cased && Py_UNICODE_ISUPPER(ch))
3797 cased = 1;
3799 return PyInt_FromLong(cased);
3802 static char istitle__doc__[] =
3803 "S.istitle() -> int\n\
3805 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3806 may only follow uncased characters and lowercase characters only cased\n\
3807 ones. Return 0 otherwise.";
3809 static PyObject*
3810 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813 register const Py_UNICODE *e;
3814 int cased, previous_is_cased;
3816 if (!PyArg_NoArgs(args))
3817 return NULL;
3819 /* Shortcut for single character strings */
3820 if (PyUnicode_GET_SIZE(self) == 1)
3821 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3822 (Py_UNICODE_ISUPPER(*p) != 0));
3824 /* Special case for empty strings */
3825 if (PyString_GET_SIZE(self) == 0)
3826 return PyInt_FromLong(0);
3828 e = p + PyUnicode_GET_SIZE(self);
3829 cased = 0;
3830 previous_is_cased = 0;
3831 for (; p < e; p++) {
3832 register const Py_UNICODE ch = *p;
3834 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3835 if (previous_is_cased)
3836 return PyInt_FromLong(0);
3837 previous_is_cased = 1;
3838 cased = 1;
3840 else if (Py_UNICODE_ISLOWER(ch)) {
3841 if (!previous_is_cased)
3842 return PyInt_FromLong(0);
3843 previous_is_cased = 1;
3844 cased = 1;
3846 else
3847 previous_is_cased = 0;
3849 return PyInt_FromLong(cased);
3852 static char isspace__doc__[] =
3853 "S.isspace() -> int\n\
3855 Return 1 if there are only whitespace characters in S,\n\
3856 0 otherwise.";
3858 static PyObject*
3859 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862 register const Py_UNICODE *e;
3864 if (!PyArg_NoArgs(args))
3865 return NULL;
3867 /* Shortcut for single character strings */
3868 if (PyUnicode_GET_SIZE(self) == 1 &&
3869 Py_UNICODE_ISSPACE(*p))
3870 return PyInt_FromLong(1);
3872 /* Special case for empty strings */
3873 if (PyString_GET_SIZE(self) == 0)
3874 return PyInt_FromLong(0);
3876 e = p + PyUnicode_GET_SIZE(self);
3877 for (; p < e; p++) {
3878 if (!Py_UNICODE_ISSPACE(*p))
3879 return PyInt_FromLong(0);
3881 return PyInt_FromLong(1);
3884 static char isalpha__doc__[] =
3885 "S.isalpha() -> int\n\
3887 Return 1 if all characters in S are alphabetic\n\
3888 and there is at least one character in S, 0 otherwise.";
3890 static PyObject*
3891 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894 register const Py_UNICODE *e;
3896 if (!PyArg_NoArgs(args))
3897 return NULL;
3899 /* Shortcut for single character strings */
3900 if (PyUnicode_GET_SIZE(self) == 1 &&
3901 Py_UNICODE_ISALPHA(*p))
3902 return PyInt_FromLong(1);
3904 /* Special case for empty strings */
3905 if (PyString_GET_SIZE(self) == 0)
3906 return PyInt_FromLong(0);
3908 e = p + PyUnicode_GET_SIZE(self);
3909 for (; p < e; p++) {
3910 if (!Py_UNICODE_ISALPHA(*p))
3911 return PyInt_FromLong(0);
3913 return PyInt_FromLong(1);
3916 static char isalnum__doc__[] =
3917 "S.isalnum() -> int\n\
3919 Return 1 if all characters in S are alphanumeric\n\
3920 and there is at least one character in S, 0 otherwise.";
3922 static PyObject*
3923 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3925 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3926 register const Py_UNICODE *e;
3928 if (!PyArg_NoArgs(args))
3929 return NULL;
3931 /* Shortcut for single character strings */
3932 if (PyUnicode_GET_SIZE(self) == 1 &&
3933 Py_UNICODE_ISALNUM(*p))
3934 return PyInt_FromLong(1);
3936 /* Special case for empty strings */
3937 if (PyString_GET_SIZE(self) == 0)
3938 return PyInt_FromLong(0);
3940 e = p + PyUnicode_GET_SIZE(self);
3941 for (; p < e; p++) {
3942 if (!Py_UNICODE_ISALNUM(*p))
3943 return PyInt_FromLong(0);
3945 return PyInt_FromLong(1);
3948 static char isdecimal__doc__[] =
3949 "S.isdecimal() -> int\n\
3951 Return 1 if there are only decimal characters in S,\n\
3952 0 otherwise.";
3954 static PyObject*
3955 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3958 register const Py_UNICODE *e;
3960 if (!PyArg_NoArgs(args))
3961 return NULL;
3963 /* Shortcut for single character strings */
3964 if (PyUnicode_GET_SIZE(self) == 1 &&
3965 Py_UNICODE_ISDECIMAL(*p))
3966 return PyInt_FromLong(1);
3968 /* Special case for empty strings */
3969 if (PyString_GET_SIZE(self) == 0)
3970 return PyInt_FromLong(0);
3972 e = p + PyUnicode_GET_SIZE(self);
3973 for (; p < e; p++) {
3974 if (!Py_UNICODE_ISDECIMAL(*p))
3975 return PyInt_FromLong(0);
3977 return PyInt_FromLong(1);
3980 static char isdigit__doc__[] =
3981 "S.isdigit() -> int\n\
3983 Return 1 if there are only digit characters in S,\n\
3984 0 otherwise.";
3986 static PyObject*
3987 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3989 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3990 register const Py_UNICODE *e;
3992 if (!PyArg_NoArgs(args))
3993 return NULL;
3995 /* Shortcut for single character strings */
3996 if (PyUnicode_GET_SIZE(self) == 1 &&
3997 Py_UNICODE_ISDIGIT(*p))
3998 return PyInt_FromLong(1);
4000 /* Special case for empty strings */
4001 if (PyString_GET_SIZE(self) == 0)
4002 return PyInt_FromLong(0);
4004 e = p + PyUnicode_GET_SIZE(self);
4005 for (; p < e; p++) {
4006 if (!Py_UNICODE_ISDIGIT(*p))
4007 return PyInt_FromLong(0);
4009 return PyInt_FromLong(1);
4012 static char isnumeric__doc__[] =
4013 "S.isnumeric() -> int\n\
4015 Return 1 if there are only numeric characters in S,\n\
4016 0 otherwise.";
4018 static PyObject*
4019 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4021 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4022 register const Py_UNICODE *e;
4024 if (!PyArg_NoArgs(args))
4025 return NULL;
4027 /* Shortcut for single character strings */
4028 if (PyUnicode_GET_SIZE(self) == 1 &&
4029 Py_UNICODE_ISNUMERIC(*p))
4030 return PyInt_FromLong(1);
4032 /* Special case for empty strings */
4033 if (PyString_GET_SIZE(self) == 0)
4034 return PyInt_FromLong(0);
4036 e = p + PyUnicode_GET_SIZE(self);
4037 for (; p < e; p++) {
4038 if (!Py_UNICODE_ISNUMERIC(*p))
4039 return PyInt_FromLong(0);
4041 return PyInt_FromLong(1);
4044 static char join__doc__[] =
4045 "S.join(sequence) -> unicode\n\
4047 Return a string which is the concatenation of the strings in the\n\
4048 sequence. The separator between elements is S.";
4050 static PyObject*
4051 unicode_join(PyUnicodeObject *self, PyObject *args)
4053 PyObject *data;
4054 if (!PyArg_ParseTuple(args, "O:join", &data))
4055 return NULL;
4057 return PyUnicode_Join((PyObject *)self, data);
4060 static int
4061 unicode_length(PyUnicodeObject *self)
4063 return self->length;
4066 static char ljust__doc__[] =
4067 "S.ljust(width) -> unicode\n\
4069 Return S left justified in a Unicode string of length width. Padding is\n\
4070 done using spaces.";
4072 static PyObject *
4073 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4075 int width;
4076 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4077 return NULL;
4079 if (self->length >= width) {
4080 Py_INCREF(self);
4081 return (PyObject*) self;
4084 return (PyObject*) pad(self, 0, width - self->length, ' ');
4087 static char lower__doc__[] =
4088 "S.lower() -> unicode\n\
4090 Return a copy of the string S converted to lowercase.";
4092 static PyObject*
4093 unicode_lower(PyUnicodeObject *self, PyObject *args)
4095 if (!PyArg_NoArgs(args))
4096 return NULL;
4097 return fixup(self, fixlower);
4100 static char lstrip__doc__[] =
4101 "S.lstrip() -> unicode\n\
4103 Return a copy of the string S with leading whitespace removed.";
4105 static PyObject *
4106 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4108 if (!PyArg_NoArgs(args))
4109 return NULL;
4110 return strip(self, 1, 0);
4113 static PyObject*
4114 unicode_repeat(PyUnicodeObject *str, int len)
4116 PyUnicodeObject *u;
4117 Py_UNICODE *p;
4118 int nchars;
4119 size_t nbytes;
4121 if (len < 0)
4122 len = 0;
4124 if (len == 1) {
4125 /* no repeat, return original string */
4126 Py_INCREF(str);
4127 return (PyObject*) str;
4130 /* ensure # of chars needed doesn't overflow int and # of bytes
4131 * needed doesn't overflow size_t
4133 nchars = len * str->length;
4134 if (len && nchars / len != str->length) {
4135 PyErr_SetString(PyExc_OverflowError,
4136 "repeated string is too long");
4137 return NULL;
4139 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4140 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4141 PyErr_SetString(PyExc_OverflowError,
4142 "repeated string is too long");
4143 return NULL;
4145 u = _PyUnicode_New(nchars);
4146 if (!u)
4147 return NULL;
4149 p = u->str;
4151 while (len-- > 0) {
4152 Py_UNICODE_COPY(p, str->str, str->length);
4153 p += str->length;
4156 return (PyObject*) u;
4159 PyObject *PyUnicode_Replace(PyObject *obj,
4160 PyObject *subobj,
4161 PyObject *replobj,
4162 int maxcount)
4164 PyObject *self;
4165 PyObject *str1;
4166 PyObject *str2;
4167 PyObject *result;
4169 self = PyUnicode_FromObject(obj);
4170 if (self == NULL)
4171 return NULL;
4172 str1 = PyUnicode_FromObject(subobj);
4173 if (str1 == NULL) {
4174 Py_DECREF(self);
4175 return NULL;
4177 str2 = PyUnicode_FromObject(replobj);
4178 if (str2 == NULL) {
4179 Py_DECREF(self);
4180 Py_DECREF(str1);
4181 return NULL;
4183 result = replace((PyUnicodeObject *)self,
4184 (PyUnicodeObject *)str1,
4185 (PyUnicodeObject *)str2,
4186 maxcount);
4187 Py_DECREF(self);
4188 Py_DECREF(str1);
4189 Py_DECREF(str2);
4190 return result;
4193 static char replace__doc__[] =
4194 "S.replace (old, new[, maxsplit]) -> unicode\n\
4196 Return a copy of S with all occurrences of substring\n\
4197 old replaced by new. If the optional argument maxsplit is\n\
4198 given, only the first maxsplit occurrences are replaced.";
4200 static PyObject*
4201 unicode_replace(PyUnicodeObject *self, PyObject *args)
4203 PyUnicodeObject *str1;
4204 PyUnicodeObject *str2;
4205 int maxcount = -1;
4206 PyObject *result;
4208 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4209 return NULL;
4210 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4211 if (str1 == NULL)
4212 return NULL;
4213 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4214 if (str2 == NULL)
4215 return NULL;
4217 result = replace(self, str1, str2, maxcount);
4219 Py_DECREF(str1);
4220 Py_DECREF(str2);
4221 return result;
4224 static
4225 PyObject *unicode_repr(PyObject *unicode)
4227 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4228 PyUnicode_GET_SIZE(unicode),
4232 static char rfind__doc__[] =
4233 "S.rfind(sub [,start [,end]]) -> int\n\
4235 Return the highest index in S where substring sub is found,\n\
4236 such that sub is contained within s[start,end]. Optional\n\
4237 arguments start and end are interpreted as in slice notation.\n\
4239 Return -1 on failure.";
4241 static PyObject *
4242 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4244 PyUnicodeObject *substring;
4245 int start = 0;
4246 int end = INT_MAX;
4247 PyObject *result;
4249 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4250 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4251 return NULL;
4252 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4253 (PyObject *)substring);
4254 if (substring == NULL)
4255 return NULL;
4257 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4259 Py_DECREF(substring);
4260 return result;
4263 static char rindex__doc__[] =
4264 "S.rindex(sub [,start [,end]]) -> int\n\
4266 Like S.rfind() but raise ValueError when the substring is not found.";
4268 static PyObject *
4269 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4271 int result;
4272 PyUnicodeObject *substring;
4273 int start = 0;
4274 int end = INT_MAX;
4276 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4277 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4278 return NULL;
4279 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4280 (PyObject *)substring);
4281 if (substring == NULL)
4282 return NULL;
4284 result = findstring(self, substring, start, end, -1);
4286 Py_DECREF(substring);
4287 if (result < 0) {
4288 PyErr_SetString(PyExc_ValueError, "substring not found");
4289 return NULL;
4291 return PyInt_FromLong(result);
4294 static char rjust__doc__[] =
4295 "S.rjust(width) -> unicode\n\
4297 Return S right justified in a Unicode string of length width. Padding is\n\
4298 done using spaces.";
4300 static PyObject *
4301 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4303 int width;
4304 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4305 return NULL;
4307 if (self->length >= width) {
4308 Py_INCREF(self);
4309 return (PyObject*) self;
4312 return (PyObject*) pad(self, width - self->length, 0, ' ');
4315 static char rstrip__doc__[] =
4316 "S.rstrip() -> unicode\n\
4318 Return a copy of the string S with trailing whitespace removed.";
4320 static PyObject *
4321 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4323 if (!PyArg_NoArgs(args))
4324 return NULL;
4325 return strip(self, 0, 1);
4328 static PyObject*
4329 unicode_slice(PyUnicodeObject *self, int start, int end)
4331 /* standard clamping */
4332 if (start < 0)
4333 start = 0;
4334 if (end < 0)
4335 end = 0;
4336 if (end > self->length)
4337 end = self->length;
4338 if (start == 0 && end == self->length) {
4339 /* full slice, return original string */
4340 Py_INCREF(self);
4341 return (PyObject*) self;
4343 if (start > end)
4344 start = end;
4345 /* copy slice */
4346 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4347 end - start);
4350 PyObject *PyUnicode_Split(PyObject *s,
4351 PyObject *sep,
4352 int maxsplit)
4354 PyObject *result;
4356 s = PyUnicode_FromObject(s);
4357 if (s == NULL)
4358 return NULL;
4359 if (sep != NULL) {
4360 sep = PyUnicode_FromObject(sep);
4361 if (sep == NULL) {
4362 Py_DECREF(s);
4363 return NULL;
4367 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4369 Py_DECREF(s);
4370 Py_XDECREF(sep);
4371 return result;
4374 static char split__doc__[] =
4375 "S.split([sep [,maxsplit]]) -> list of strings\n\
4377 Return a list of the words in S, using sep as the\n\
4378 delimiter string. If maxsplit is given, at most maxsplit\n\
4379 splits are done. If sep is not specified, any whitespace string\n\
4380 is a separator.";
4382 static PyObject*
4383 unicode_split(PyUnicodeObject *self, PyObject *args)
4385 PyObject *substring = Py_None;
4386 int maxcount = -1;
4388 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4389 return NULL;
4391 if (substring == Py_None)
4392 return split(self, NULL, maxcount);
4393 else if (PyUnicode_Check(substring))
4394 return split(self, (PyUnicodeObject *)substring, maxcount);
4395 else
4396 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4399 static char splitlines__doc__[] =
4400 "S.splitlines([keepends]]) -> list of strings\n\
4402 Return a list of the lines in S, breaking at line boundaries.\n\
4403 Line breaks are not included in the resulting list unless keepends\n\
4404 is given and true.";
4406 static PyObject*
4407 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4409 int keepends = 0;
4411 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4412 return NULL;
4414 return PyUnicode_Splitlines((PyObject *)self, keepends);
4417 static
4418 PyObject *unicode_str(PyUnicodeObject *self)
4420 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4423 static char strip__doc__[] =
4424 "S.strip() -> unicode\n\
4426 Return a copy of S with leading and trailing whitespace removed.";
4428 static PyObject *
4429 unicode_strip(PyUnicodeObject *self, PyObject *args)
4431 if (!PyArg_NoArgs(args))
4432 return NULL;
4433 return strip(self, 1, 1);
4436 static char swapcase__doc__[] =
4437 "S.swapcase() -> unicode\n\
4439 Return a copy of S with uppercase characters converted to lowercase\n\
4440 and vice versa.";
4442 static PyObject*
4443 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4445 if (!PyArg_NoArgs(args))
4446 return NULL;
4447 return fixup(self, fixswapcase);
4450 static char translate__doc__[] =
4451 "S.translate(table) -> unicode\n\
4453 Return a copy of the string S, where all characters have been mapped\n\
4454 through the given translation table, which must be a mapping of\n\
4455 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4456 are left untouched. Characters mapped to None are deleted.";
4458 static PyObject*
4459 unicode_translate(PyUnicodeObject *self, PyObject *args)
4461 PyObject *table;
4463 if (!PyArg_ParseTuple(args, "O:translate", &table))
4464 return NULL;
4465 return PyUnicode_TranslateCharmap(self->str,
4466 self->length,
4467 table,
4468 "ignore");
4471 static char upper__doc__[] =
4472 "S.upper() -> unicode\n\
4474 Return a copy of S converted to uppercase.";
4476 static PyObject*
4477 unicode_upper(PyUnicodeObject *self, PyObject *args)
4479 if (!PyArg_NoArgs(args))
4480 return NULL;
4481 return fixup(self, fixupper);
4484 #if 0
4485 static char zfill__doc__[] =
4486 "S.zfill(width) -> unicode\n\
4488 Pad a numeric string x with zeros on the left, to fill a field\n\
4489 of the specified width. The string x is never truncated.";
4491 static PyObject *
4492 unicode_zfill(PyUnicodeObject *self, PyObject *args)
4494 int fill;
4495 PyUnicodeObject *u;
4497 int width;
4498 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4499 return NULL;
4501 if (self->length >= width) {
4502 Py_INCREF(self);
4503 return (PyObject*) self;
4506 fill = width - self->length;
4508 u = pad(self, fill, 0, '0');
4510 if (u->str[fill] == '+' || u->str[fill] == '-') {
4511 /* move sign to beginning of string */
4512 u->str[0] = u->str[fill];
4513 u->str[fill] = '0';
4516 return (PyObject*) u;
4518 #endif
4520 #if 0
4521 static PyObject*
4522 unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
4524 if (!PyArg_NoArgs(args))
4525 return NULL;
4526 return PyInt_FromLong(unicode_freelist_size);
4528 #endif
4530 static char startswith__doc__[] =
4531 "S.startswith(prefix[, start[, end]]) -> int\n\
4533 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4534 optional start, test S beginning at that position. With optional end, stop\n\
4535 comparing S at that position.";
4537 static PyObject *
4538 unicode_startswith(PyUnicodeObject *self,
4539 PyObject *args)
4541 PyUnicodeObject *substring;
4542 int start = 0;
4543 int end = INT_MAX;
4544 PyObject *result;
4546 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4547 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4548 return NULL;
4549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4550 (PyObject *)substring);
4551 if (substring == NULL)
4552 return NULL;
4554 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4556 Py_DECREF(substring);
4557 return result;
4561 static char endswith__doc__[] =
4562 "S.endswith(suffix[, start[, end]]) -> int\n\
4564 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4565 optional start, test S beginning at that position. With optional end, stop\n\
4566 comparing S at that position.";
4568 static PyObject *
4569 unicode_endswith(PyUnicodeObject *self,
4570 PyObject *args)
4572 PyUnicodeObject *substring;
4573 int start = 0;
4574 int end = INT_MAX;
4575 PyObject *result;
4577 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4579 return NULL;
4580 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4581 (PyObject *)substring);
4582 if (substring == NULL)
4583 return NULL;
4585 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4587 Py_DECREF(substring);
4588 return result;
4592 static PyMethodDef unicode_methods[] = {
4594 /* Order is according to common usage: often used methods should
4595 appear first, since lookup is done sequentially. */
4597 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4598 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4599 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4600 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4601 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4602 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4603 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4604 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4605 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4606 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4607 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4608 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4609 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4610 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4611 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4612 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4613 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4614 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4615 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4616 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4617 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4618 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4619 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4620 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4621 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4622 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4623 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4624 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4625 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4626 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4627 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4628 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4629 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4630 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4631 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4632 #if 0
4633 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4634 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4635 #endif
4637 #if 0
4638 /* This one is just used for debugging the implementation. */
4639 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4640 #endif
4642 {NULL, NULL}
4645 static PyObject *
4646 unicode_getattr(PyUnicodeObject *self, char *name)
4648 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4651 static PySequenceMethods unicode_as_sequence = {
4652 (inquiry) unicode_length, /* sq_length */
4653 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4654 (intargfunc) unicode_repeat, /* sq_repeat */
4655 (intargfunc) unicode_getitem, /* sq_item */
4656 (intintargfunc) unicode_slice, /* sq_slice */
4657 0, /* sq_ass_item */
4658 0, /* sq_ass_slice */
4659 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4662 static int
4663 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4664 int index,
4665 const void **ptr)
4667 if (index != 0) {
4668 PyErr_SetString(PyExc_SystemError,
4669 "accessing non-existent unicode segment");
4670 return -1;
4672 *ptr = (void *) self->str;
4673 return PyUnicode_GET_DATA_SIZE(self);
4676 static int
4677 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4678 const void **ptr)
4680 PyErr_SetString(PyExc_TypeError,
4681 "cannot use unicode as modifyable buffer");
4682 return -1;
4685 static int
4686 unicode_buffer_getsegcount(PyUnicodeObject *self,
4687 int *lenp)
4689 if (lenp)
4690 *lenp = PyUnicode_GET_DATA_SIZE(self);
4691 return 1;
4694 static int
4695 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4696 int index,
4697 const void **ptr)
4699 PyObject *str;
4701 if (index != 0) {
4702 PyErr_SetString(PyExc_SystemError,
4703 "accessing non-existent unicode segment");
4704 return -1;
4706 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4707 if (str == NULL)
4708 return -1;
4709 *ptr = (void *) PyString_AS_STRING(str);
4710 return PyString_GET_SIZE(str);
4713 /* Helpers for PyUnicode_Format() */
4715 static PyObject *
4716 getnextarg(PyObject *args, int arglen, int *p_argidx)
4718 int argidx = *p_argidx;
4719 if (argidx < arglen) {
4720 (*p_argidx)++;
4721 if (arglen < 0)
4722 return args;
4723 else
4724 return PyTuple_GetItem(args, argidx);
4726 PyErr_SetString(PyExc_TypeError,
4727 "not enough arguments for format string");
4728 return NULL;
4731 #define F_LJUST (1<<0)
4732 #define F_SIGN (1<<1)
4733 #define F_BLANK (1<<2)
4734 #define F_ALT (1<<3)
4735 #define F_ZERO (1<<4)
4737 static
4738 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4740 register int i;
4741 int len;
4742 va_list va;
4743 char *charbuffer;
4744 va_start(va, format);
4746 /* First, format the string as char array, then expand to Py_UNICODE
4747 array. */
4748 charbuffer = (char *)buffer;
4749 len = vsprintf(charbuffer, format, va);
4750 for (i = len - 1; i >= 0; i--)
4751 buffer[i] = (Py_UNICODE) charbuffer[i];
4753 va_end(va);
4754 return len;
4757 static int
4758 formatfloat(Py_UNICODE *buf,
4759 size_t buflen,
4760 int flags,
4761 int prec,
4762 int type,
4763 PyObject *v)
4765 /* fmt = '%#.' + `prec` + `type`
4766 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4767 char fmt[20];
4768 double x;
4770 x = PyFloat_AsDouble(v);
4771 if (x == -1.0 && PyErr_Occurred())
4772 return -1;
4773 if (prec < 0)
4774 prec = 6;
4775 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4776 type = 'g';
4777 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4778 /* worst case length calc to ensure no buffer overrun:
4779 fmt = %#.<prec>g
4780 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4781 for any double rep.)
4782 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4783 If prec=0 the effective precision is 1 (the leading digit is
4784 always given), therefore increase by one to 10+prec. */
4785 if (buflen <= (size_t)10 + (size_t)prec) {
4786 PyErr_SetString(PyExc_OverflowError,
4787 "formatted float is too long (precision too long?)");
4788 return -1;
4790 return usprintf(buf, fmt, x);
4793 static PyObject*
4794 formatlong(PyObject *val, int flags, int prec, int type)
4796 char *buf;
4797 int i, len;
4798 PyObject *str; /* temporary string object. */
4799 PyUnicodeObject *result;
4801 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4802 if (!str)
4803 return NULL;
4804 result = _PyUnicode_New(len);
4805 for (i = 0; i < len; i++)
4806 result->str[i] = buf[i];
4807 result->str[len] = 0;
4808 Py_DECREF(str);
4809 return (PyObject*)result;
4812 static int
4813 formatint(Py_UNICODE *buf,
4814 size_t buflen,
4815 int flags,
4816 int prec,
4817 int type,
4818 PyObject *v)
4820 /* fmt = '%#.' + `prec` + 'l' + `type`
4821 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4822 + 1 + 1 = 24*/
4823 char fmt[64]; /* plenty big enough! */
4824 long x;
4825 int use_native_c_format = 1;
4827 x = PyInt_AsLong(v);
4828 if (x == -1 && PyErr_Occurred())
4829 return -1;
4830 if (prec < 0)
4831 prec = 1;
4832 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4833 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4834 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4835 PyErr_SetString(PyExc_OverflowError,
4836 "formatted integer is too long (precision too long?)");
4837 return -1;
4839 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4840 * but we want it (for consistency with other %#x conversions, and
4841 * for consistency with Python's hex() function).
4842 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4843 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4844 * So add it only if the platform doesn't already.
4846 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4847 /* Only way to know what the platform does is to try it. */
4848 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4849 if (fmt[1] != (char)type) {
4850 /* Supply our own leading 0x/0X -- needed under std C */
4851 use_native_c_format = 0;
4852 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4855 if (use_native_c_format)
4856 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4857 return usprintf(buf, fmt, x);
4860 static int
4861 formatchar(Py_UNICODE *buf,
4862 size_t buflen,
4863 PyObject *v)
4865 /* presume that the buffer is at least 2 characters long */
4866 if (PyUnicode_Check(v)) {
4867 if (PyUnicode_GET_SIZE(v) != 1)
4868 goto onError;
4869 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4872 else if (PyString_Check(v)) {
4873 if (PyString_GET_SIZE(v) != 1)
4874 goto onError;
4875 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4878 else {
4879 /* Integer input truncated to a character */
4880 long x;
4881 x = PyInt_AsLong(v);
4882 if (x == -1 && PyErr_Occurred())
4883 goto onError;
4884 buf[0] = (char) x;
4886 buf[1] = '\0';
4887 return 1;
4889 onError:
4890 PyErr_SetString(PyExc_TypeError,
4891 "%c requires int or char");
4892 return -1;
4895 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4897 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4898 chars are formatted. XXX This is a magic number. Each formatting
4899 routine does bounds checking to ensure no overflow, but a better
4900 solution may be to malloc a buffer of appropriate size for each
4901 format. For now, the current solution is sufficient.
4903 #define FORMATBUFLEN (size_t)120
4905 PyObject *PyUnicode_Format(PyObject *format,
4906 PyObject *args)
4908 Py_UNICODE *fmt, *res;
4909 int fmtcnt, rescnt, reslen, arglen, argidx;
4910 int args_owned = 0;
4911 PyUnicodeObject *result = NULL;
4912 PyObject *dict = NULL;
4913 PyObject *uformat;
4915 if (format == NULL || args == NULL) {
4916 PyErr_BadInternalCall();
4917 return NULL;
4919 uformat = PyUnicode_FromObject(format);
4920 if (uformat == NULL)
4921 return NULL;
4922 fmt = PyUnicode_AS_UNICODE(uformat);
4923 fmtcnt = PyUnicode_GET_SIZE(uformat);
4925 reslen = rescnt = fmtcnt + 100;
4926 result = _PyUnicode_New(reslen);
4927 if (result == NULL)
4928 goto onError;
4929 res = PyUnicode_AS_UNICODE(result);
4931 if (PyTuple_Check(args)) {
4932 arglen = PyTuple_Size(args);
4933 argidx = 0;
4935 else {
4936 arglen = -1;
4937 argidx = -2;
4939 if (args->ob_type->tp_as_mapping)
4940 dict = args;
4942 while (--fmtcnt >= 0) {
4943 if (*fmt != '%') {
4944 if (--rescnt < 0) {
4945 rescnt = fmtcnt + 100;
4946 reslen += rescnt;
4947 if (_PyUnicode_Resize(&result, reslen) < 0)
4948 return NULL;
4949 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4950 --rescnt;
4952 *res++ = *fmt++;
4954 else {
4955 /* Got a format specifier */
4956 int flags = 0;
4957 int width = -1;
4958 int prec = -1;
4959 Py_UNICODE c = '\0';
4960 Py_UNICODE fill;
4961 PyObject *v = NULL;
4962 PyObject *temp = NULL;
4963 Py_UNICODE *pbuf;
4964 Py_UNICODE sign;
4965 int len;
4966 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4968 fmt++;
4969 if (*fmt == '(') {
4970 Py_UNICODE *keystart;
4971 int keylen;
4972 PyObject *key;
4973 int pcount = 1;
4975 if (dict == NULL) {
4976 PyErr_SetString(PyExc_TypeError,
4977 "format requires a mapping");
4978 goto onError;
4980 ++fmt;
4981 --fmtcnt;
4982 keystart = fmt;
4983 /* Skip over balanced parentheses */
4984 while (pcount > 0 && --fmtcnt >= 0) {
4985 if (*fmt == ')')
4986 --pcount;
4987 else if (*fmt == '(')
4988 ++pcount;
4989 fmt++;
4991 keylen = fmt - keystart - 1;
4992 if (fmtcnt < 0 || pcount > 0) {
4993 PyErr_SetString(PyExc_ValueError,
4994 "incomplete format key");
4995 goto onError;
4997 /* keys are converted to strings using UTF-8 and
4998 then looked up since Python uses strings to hold
4999 variables names etc. in its namespaces and we
5000 wouldn't want to break common idioms. */
5001 key = PyUnicode_EncodeUTF8(keystart,
5002 keylen,
5003 NULL);
5004 if (key == NULL)
5005 goto onError;
5006 if (args_owned) {
5007 Py_DECREF(args);
5008 args_owned = 0;
5010 args = PyObject_GetItem(dict, key);
5011 Py_DECREF(key);
5012 if (args == NULL) {
5013 goto onError;
5015 args_owned = 1;
5016 arglen = -1;
5017 argidx = -2;
5019 while (--fmtcnt >= 0) {
5020 switch (c = *fmt++) {
5021 case '-': flags |= F_LJUST; continue;
5022 case '+': flags |= F_SIGN; continue;
5023 case ' ': flags |= F_BLANK; continue;
5024 case '#': flags |= F_ALT; continue;
5025 case '0': flags |= F_ZERO; continue;
5027 break;
5029 if (c == '*') {
5030 v = getnextarg(args, arglen, &argidx);
5031 if (v == NULL)
5032 goto onError;
5033 if (!PyInt_Check(v)) {
5034 PyErr_SetString(PyExc_TypeError,
5035 "* wants int");
5036 goto onError;
5038 width = PyInt_AsLong(v);
5039 if (width < 0) {
5040 flags |= F_LJUST;
5041 width = -width;
5043 if (--fmtcnt >= 0)
5044 c = *fmt++;
5046 else if (c >= '0' && c <= '9') {
5047 width = c - '0';
5048 while (--fmtcnt >= 0) {
5049 c = *fmt++;
5050 if (c < '0' || c > '9')
5051 break;
5052 if ((width*10) / 10 != width) {
5053 PyErr_SetString(PyExc_ValueError,
5054 "width too big");
5055 goto onError;
5057 width = width*10 + (c - '0');
5060 if (c == '.') {
5061 prec = 0;
5062 if (--fmtcnt >= 0)
5063 c = *fmt++;
5064 if (c == '*') {
5065 v = getnextarg(args, arglen, &argidx);
5066 if (v == NULL)
5067 goto onError;
5068 if (!PyInt_Check(v)) {
5069 PyErr_SetString(PyExc_TypeError,
5070 "* wants int");
5071 goto onError;
5073 prec = PyInt_AsLong(v);
5074 if (prec < 0)
5075 prec = 0;
5076 if (--fmtcnt >= 0)
5077 c = *fmt++;
5079 else if (c >= '0' && c <= '9') {
5080 prec = c - '0';
5081 while (--fmtcnt >= 0) {
5082 c = Py_CHARMASK(*fmt++);
5083 if (c < '0' || c > '9')
5084 break;
5085 if ((prec*10) / 10 != prec) {
5086 PyErr_SetString(PyExc_ValueError,
5087 "prec too big");
5088 goto onError;
5090 prec = prec*10 + (c - '0');
5093 } /* prec */
5094 if (fmtcnt >= 0) {
5095 if (c == 'h' || c == 'l' || c == 'L') {
5096 if (--fmtcnt >= 0)
5097 c = *fmt++;
5100 if (fmtcnt < 0) {
5101 PyErr_SetString(PyExc_ValueError,
5102 "incomplete format");
5103 goto onError;
5105 if (c != '%') {
5106 v = getnextarg(args, arglen, &argidx);
5107 if (v == NULL)
5108 goto onError;
5110 sign = 0;
5111 fill = ' ';
5112 switch (c) {
5114 case '%':
5115 pbuf = formatbuf;
5116 /* presume that buffer length is at least 1 */
5117 pbuf[0] = '%';
5118 len = 1;
5119 break;
5121 case 's':
5122 case 'r':
5123 if (PyUnicode_Check(v) && c == 's') {
5124 temp = v;
5125 Py_INCREF(temp);
5127 else {
5128 PyObject *unicode;
5129 if (c == 's')
5130 temp = PyObject_Str(v);
5131 else
5132 temp = PyObject_Repr(v);
5133 if (temp == NULL)
5134 goto onError;
5135 if (!PyString_Check(temp)) {
5136 /* XXX Note: this should never happen, since
5137 PyObject_Repr() and PyObject_Str() assure
5138 this */
5139 Py_DECREF(temp);
5140 PyErr_SetString(PyExc_TypeError,
5141 "%s argument has non-string str()");
5142 goto onError;
5144 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5145 PyString_GET_SIZE(temp),
5146 NULL,
5147 "strict");
5148 Py_DECREF(temp);
5149 temp = unicode;
5150 if (temp == NULL)
5151 goto onError;
5153 pbuf = PyUnicode_AS_UNICODE(temp);
5154 len = PyUnicode_GET_SIZE(temp);
5155 if (prec >= 0 && len > prec)
5156 len = prec;
5157 break;
5159 case 'i':
5160 case 'd':
5161 case 'u':
5162 case 'o':
5163 case 'x':
5164 case 'X':
5165 if (c == 'i')
5166 c = 'd';
5167 if (PyLong_Check(v)) {
5168 temp = formatlong(v, flags, prec, c);
5169 if (!temp)
5170 goto onError;
5171 pbuf = PyUnicode_AS_UNICODE(temp);
5172 len = PyUnicode_GET_SIZE(temp);
5173 /* unbounded ints can always produce
5174 a sign character! */
5175 sign = 1;
5177 else {
5178 pbuf = formatbuf;
5179 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5180 flags, prec, c, v);
5181 if (len < 0)
5182 goto onError;
5183 /* only d conversion is signed */
5184 sign = c == 'd';
5186 if (flags & F_ZERO)
5187 fill = '0';
5188 break;
5190 case 'e':
5191 case 'E':
5192 case 'f':
5193 case 'g':
5194 case 'G':
5195 pbuf = formatbuf;
5196 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5197 flags, prec, c, v);
5198 if (len < 0)
5199 goto onError;
5200 sign = 1;
5201 if (flags & F_ZERO)
5202 fill = '0';
5203 break;
5205 case 'c':
5206 pbuf = formatbuf;
5207 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5208 if (len < 0)
5209 goto onError;
5210 break;
5212 default:
5213 PyErr_Format(PyExc_ValueError,
5214 "unsupported format character '%c' (0x%x) "
5215 "at index %i",
5216 (31<=c && c<=126) ? c : '?',
5217 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5218 goto onError;
5220 if (sign) {
5221 if (*pbuf == '-' || *pbuf == '+') {
5222 sign = *pbuf++;
5223 len--;
5225 else if (flags & F_SIGN)
5226 sign = '+';
5227 else if (flags & F_BLANK)
5228 sign = ' ';
5229 else
5230 sign = 0;
5232 if (width < len)
5233 width = len;
5234 if (rescnt < width + (sign != 0)) {
5235 reslen -= rescnt;
5236 rescnt = width + fmtcnt + 100;
5237 reslen += rescnt;
5238 if (_PyUnicode_Resize(&result, reslen) < 0)
5239 return NULL;
5240 res = PyUnicode_AS_UNICODE(result)
5241 + reslen - rescnt;
5243 if (sign) {
5244 if (fill != ' ')
5245 *res++ = sign;
5246 rescnt--;
5247 if (width > len)
5248 width--;
5250 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5251 assert(pbuf[0] == '0');
5252 assert(pbuf[1] == c);
5253 if (fill != ' ') {
5254 *res++ = *pbuf++;
5255 *res++ = *pbuf++;
5257 rescnt -= 2;
5258 width -= 2;
5259 if (width < 0)
5260 width = 0;
5261 len -= 2;
5263 if (width > len && !(flags & F_LJUST)) {
5264 do {
5265 --rescnt;
5266 *res++ = fill;
5267 } while (--width > len);
5269 if (fill == ' ') {
5270 if (sign)
5271 *res++ = sign;
5272 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5273 assert(pbuf[0] == '0');
5274 assert(pbuf[1] == c);
5275 *res++ = *pbuf++;
5276 *res++ = *pbuf++;
5279 Py_UNICODE_COPY(res, pbuf, len);
5280 res += len;
5281 rescnt -= len;
5282 while (--width >= len) {
5283 --rescnt;
5284 *res++ = ' ';
5286 if (dict && (argidx < arglen) && c != '%') {
5287 PyErr_SetString(PyExc_TypeError,
5288 "not all arguments converted");
5289 goto onError;
5291 Py_XDECREF(temp);
5292 } /* '%' */
5293 } /* until end */
5294 if (argidx < arglen && !dict) {
5295 PyErr_SetString(PyExc_TypeError,
5296 "not all arguments converted");
5297 goto onError;
5300 if (args_owned) {
5301 Py_DECREF(args);
5303 Py_DECREF(uformat);
5304 if (_PyUnicode_Resize(&result, reslen - rescnt))
5305 goto onError;
5306 return (PyObject *)result;
5308 onError:
5309 Py_XDECREF(result);
5310 Py_DECREF(uformat);
5311 if (args_owned) {
5312 Py_DECREF(args);
5314 return NULL;
5317 static PyBufferProcs unicode_as_buffer = {
5318 (getreadbufferproc) unicode_buffer_getreadbuf,
5319 (getwritebufferproc) unicode_buffer_getwritebuf,
5320 (getsegcountproc) unicode_buffer_getsegcount,
5321 (getcharbufferproc) unicode_buffer_getcharbuf,
5324 PyTypeObject PyUnicode_Type = {
5325 PyObject_HEAD_INIT(&PyType_Type)
5326 0, /* ob_size */
5327 "unicode", /* tp_name */
5328 sizeof(PyUnicodeObject), /* tp_size */
5329 0, /* tp_itemsize */
5330 /* Slots */
5331 (destructor)_PyUnicode_Free, /* tp_dealloc */
5332 0, /* tp_print */
5333 (getattrfunc)unicode_getattr, /* tp_getattr */
5334 0, /* tp_setattr */
5335 (cmpfunc) unicode_compare, /* tp_compare */
5336 (reprfunc) unicode_repr, /* tp_repr */
5337 0, /* tp_as_number */
5338 &unicode_as_sequence, /* tp_as_sequence */
5339 0, /* tp_as_mapping */
5340 (hashfunc) unicode_hash, /* tp_hash*/
5341 0, /* tp_call*/
5342 (reprfunc) unicode_str, /* tp_str */
5343 (getattrofunc) NULL, /* tp_getattro */
5344 (setattrofunc) NULL, /* tp_setattro */
5345 &unicode_as_buffer, /* tp_as_buffer */
5346 Py_TPFLAGS_DEFAULT, /* tp_flags */
5349 /* Initialize the Unicode implementation */
5351 void _PyUnicode_Init(void)
5353 int i;
5355 /* Init the implementation */
5356 unicode_freelist = NULL;
5357 unicode_freelist_size = 0;
5358 unicode_empty = _PyUnicode_New(0);
5359 strcpy(unicode_default_encoding, "ascii");
5360 for (i = 0; i < 256; i++)
5361 unicode_latin1[i] = NULL;
5364 /* Finalize the Unicode implementation */
5366 void
5367 _PyUnicode_Fini(void)
5369 PyUnicodeObject *u;
5370 int i;
5372 Py_XDECREF(unicode_empty);
5373 unicode_empty = NULL;
5375 for (i = 0; i < 256; i++) {
5376 if (unicode_latin1[i]) {
5377 Py_DECREF(unicode_latin1[i]);
5378 unicode_latin1[i] = NULL;
5382 for (u = unicode_freelist; u != NULL;) {
5383 PyUnicodeObject *v = u;
5384 u = *(PyUnicodeObject **)u;
5385 if (v->str)
5386 PyMem_DEL(v->str);
5387 Py_XDECREF(v->defenc);
5388 PyObject_DEL(v);
5390 unicode_freelist = NULL;
5391 unicode_freelist_size = 0;