Updated for 2.1b2 distribution.
[python/dscho.git] / Objects / unicodeobject.c
blobc237789a79edfef77e4ecb224b24e7781dc2f263
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WIN32
45 #include <windows.h>
46 #endif
/* Limit for the Unicode object free list: at most this many released
   objects are kept around for reuse. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* The empty Unicode object */
87 static PyUnicodeObject *unicode_empty;
89 /* Free list for Unicode objects */
90 static PyUnicodeObject *unicode_freelist;
91 static int unicode_freelist_size;
93 /* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
101 static char unicode_default_encoding[100];
103 /* --- Unicode Object ----------------------------------------------------- */
105 static
106 int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
109 void *oldstr;
111 /* Shortcut if there's nothing much to do. */
112 if (unicode->length == length)
113 goto reset;
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
131 unicode->str[length] = 0;
132 unicode->length = length;
134 reset:
135 /* Reset the object caches */
136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
140 unicode->hash = -1;
142 return 0;
145 int PyUnicode_Resize(PyObject **unicode,
146 int length)
148 PyUnicodeObject *v;
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
159 return _PyUnicode_Resize(v, length);
162 /* We allocate one more byte to make sure the string is
163 Ux0000 terminated -- XXX is this needed ?
165 XXX This allocator could further be enhanced by assuring that the
166 free list never reduces its size below 1.
170 static
171 PyUnicodeObject *_PyUnicode_New(int length)
173 register PyUnicodeObject *unicode;
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
184 unicode_freelist = *(PyUnicodeObject **)unicode;
185 unicode_freelist_size--;
186 if (unicode->str) {
187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
190 _PyUnicode_Resize(unicode, length)) {
191 PyMem_DEL(unicode->str);
192 goto onError;
195 else {
196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
198 PyObject_INIT(unicode, &PyUnicode_Type);
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
207 if (!unicode->str) {
208 PyErr_NoMemory();
209 goto onError;
211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
214 unicode->defenc = NULL;
215 return unicode;
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
219 PyObject_DEL(unicode);
220 return NULL;
223 static
224 void _PyUnicode_Free(register PyUnicodeObject *unicode)
226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
229 PyMem_DEL(unicode->str);
230 unicode->str = NULL;
231 unicode->length = 0;
233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
237 /* Add to free list */
238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
242 else {
243 PyMem_DEL(unicode->str);
244 Py_XDECREF(unicode->defenc);
245 PyObject_DEL(unicode);
249 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
252 PyUnicodeObject *unicode;
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
262 return (PyObject *)unicode;
265 #ifdef HAVE_WCHAR_H
267 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
270 PyUnicodeObject *unicode;
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
281 /* Copy the wchar_t data into the new object */
282 #ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284 #else
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
292 #endif
294 return (PyObject *)unicode;
297 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307 #ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309 #else
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
317 #endif
319 return size;
322 #endif
324 PyObject *PyUnicode_FromObject(register PyObject *obj)
326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
329 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
333 const char *s;
334 int len;
335 int owned = 0;
336 PyObject *v;
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
358 if (PyUnicode_Check(obj)) {
359 Py_INCREF(obj);
360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
366 goto done;
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
380 goto onError;
383 /* Convert to Unicode */
384 if (len == 0) {
385 Py_INCREF(unicode_empty);
386 v = (PyObject *)unicode_empty;
388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
391 done:
392 if (owned) {
393 Py_DECREF(obj);
395 return v;
397 onError:
398 if (owned) {
399 Py_DECREF(obj);
401 return NULL;
404 PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
409 PyObject *buffer = NULL, *unicode;
411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
416 return PyUnicode_DecodeUTF8(s, size, errors);
417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
431 "decoder did not return an unicode object (type=%.400s)",
432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
436 Py_DECREF(buffer);
437 return unicode;
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
444 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
449 PyObject *v, *unicode;
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
459 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
463 PyObject *v;
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
476 return PyUnicode_AsUTF8String(unicode);
477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
490 "encoder did not return a string object (type=%.400s)",
491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
495 return v;
497 onError:
498 return NULL;
501 /* Return a Python string holding the default encoded value of the
502 Unicode object.
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
509 The refcount of the string is *not* incremented.
511 *** Exported for internal use by the interpreter only !!! ***
515 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
528 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
534 return PyUnicode_AS_UNICODE(unicode);
536 onError:
537 return NULL;
540 int PyUnicode_GetSize(PyObject *unicode)
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
546 return PyUnicode_GET_SIZE(unicode);
548 onError:
549 return -1;
552 const char *PyUnicode_GetDefaultEncoding(void)
554 return unicode_default_encoding;
557 int PyUnicode_SetDefaultEncoding(const char *encoding)
559 PyObject *v;
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
572 onError:
573 return -1;
576 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero means the byte is not a legal prefix (continuation bytes
   0x80..0xBF and the values 0xFE/0xFF).  See RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFF: four, five, six bytes; 0xFE and 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
600 static
601 int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
609 "UTF-8 decoding error: %.400s",
610 details);
611 return -1;
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
623 else {
624 PyErr_Format(PyExc_ValueError,
625 "UTF-8 decoding error; unknown error handling code: %.400s",
626 errors);
627 return -1;
631 PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
639 const char *errmsg = "";
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
653 while (s < e) {
654 Py_UCS4 ch = (unsigned char)*s;
656 if (ch < 0x80) {
657 *p++ = (Py_UNICODE)ch;
658 s++;
659 continue;
662 n = utf8_code_length[ch];
664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
669 switch (n) {
671 case 0:
672 errmsg = "unexpected code byte";
673 goto utf8Error;
674 break;
676 case 1:
677 errmsg = "internal error";
678 goto utf8Error;
679 break;
681 case 2:
682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
691 else
692 *p++ = (Py_UNICODE)ch;
693 break;
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
706 else
707 *p++ = (Py_UNICODE)ch;
708 break;
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
727 /* compute and append the two surrogates: */
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
737 break;
739 default:
740 /* Other sizes are only needed for UCS-4 */
741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
745 s += n;
746 continue;
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
757 return (PyObject *)unicode;
759 onError:
760 Py_DECREF(unicode);
761 return NULL;
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept compiled out for reference. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem: strict (or
   NULL) raises UnicodeError, "ignore" does nothing, "replace" appends
   '?' to the output, anything else raises ValueError. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
798 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
802 PyObject *v;
803 char *p;
804 char *q;
805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
810 v = PyString_FromStringAndSize(NULL, cbAllocated);
811 if (v == NULL)
812 return NULL;
813 if (size == 0)
814 return v;
816 p = q = PyString_AS_STRING(v);
817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
820 *p++ = (char) ch;
821 cbWritten++;
823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
826 cbWritten += 2;
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
840 goto onError;
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
846 *p++ = (char)((ch >> 18) | 0xf0);
847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
848 i++;
849 cbWritten += 4;
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
861 *p = '\0';
862 if (_PyString_Resize(&v, p - q))
863 goto onError;
864 return v;
866 onError:
867 Py_DECREF(v);
868 return NULL;
871 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
882 /* --- UTF-16 Codec ------------------------------------------------------- */
884 static
885 int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
893 "UTF-16 decoding error: %.400s",
894 details);
895 return -1;
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
905 return 0;
907 else {
908 PyErr_Format(PyExc_ValueError,
909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
911 errors);
912 return -1;
916 PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
925 const char *errmsg = "";
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
948 if (byteorder)
949 bo = *byteorder;
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968 #else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978 #endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
984 /* UTF-16 code pair: */
985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
991 if (0xD800 <= *q && *q <= 0xDBFF) {
992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
996 errmsg = "code pairs are not supported";
997 goto utf16Error;
999 else
1000 continue;
1002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
1010 if (byteorder)
1011 *byteorder = bo;
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1017 return (PyObject *)unicode;
1019 onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1024 #undef UTF16_ERROR
1026 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
1043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
1045 if (size == 0)
1046 return v;
1047 if (byteorder == 0 ||
1048 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050 #else
1051 byteorder == 1
1052 #endif
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1060 return v;
1063 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1075 /* --- Unicode Escape Codec ----------------------------------------------- */
1077 static
1078 int unicodeescape_decoding_error(const char **source,
1079 Py_UNICODE *x,
1080 const char *errors,
1081 const char *details)
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
1086 "Unicode-Escape decoding error: %.400s",
1087 details);
1088 return -1;
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1093 else if (strcmp(errors,"replace") == 0) {
1094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1095 return 0;
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
1100 "unknown error handling code: %.400s",
1101 errors);
1102 return -1;
1106 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1108 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1112 PyUnicodeObject *v;
1113 Py_UNICODE *p, *buf;
1114 const char *end;
1115 char* message;
1116 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1118 /* Escaped strings will always be longer than the resulting
1119 Unicode string, so we start with size here and then reduce the
1120 length after conversion to the true value. */
1121 v = _PyUnicode_New(size);
1122 if (v == NULL)
1123 goto onError;
1124 if (size == 0)
1125 return (PyObject *)v;
1127 p = buf = PyUnicode_AS_UNICODE(v);
1128 end = s + size;
1130 while (s < end) {
1131 unsigned char c;
1132 Py_UNICODE x;
1133 int i, digits;
1135 /* Non-escape characters are interpreted as Unicode ordinals */
1136 if (*s != '\\') {
1137 *p++ = (unsigned char) *s++;
1138 continue;
1141 /* \ - Escapes */
1142 s++;
1143 switch (*s++) {
1145 /* \x escapes */
1146 case '\n': break;
1147 case '\\': *p++ = '\\'; break;
1148 case '\'': *p++ = '\''; break;
1149 case '\"': *p++ = '\"'; break;
1150 case 'b': *p++ = '\b'; break;
1151 case 'f': *p++ = '\014'; break; /* FF */
1152 case 't': *p++ = '\t'; break;
1153 case 'n': *p++ = '\n'; break;
1154 case 'r': *p++ = '\r'; break;
1155 case 'v': *p++ = '\013'; break; /* VT */
1156 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1158 /* \OOO (octal) escapes */
1159 case '0': case '1': case '2': case '3':
1160 case '4': case '5': case '6': case '7':
1161 x = s[-1] - '0';
1162 if ('0' <= *s && *s <= '7') {
1163 x = (x<<3) + *s++ - '0';
1164 if ('0' <= *s && *s <= '7')
1165 x = (x<<3) + *s++ - '0';
1167 *p++ = x;
1168 break;
1170 /* hex escapes */
1171 /* \xXX */
1172 case 'x':
1173 digits = 2;
1174 message = "truncated \\xXX escape";
1175 goto hexescape;
1177 /* \uXXXX */
1178 case 'u':
1179 digits = 4;
1180 message = "truncated \\uXXXX escape";
1181 goto hexescape;
1183 /* \UXXXXXXXX */
1184 case 'U':
1185 digits = 8;
1186 message = "truncated \\UXXXXXXXX escape";
1187 hexescape:
1188 chr = 0;
1189 for (i = 0; i < digits; i++) {
1190 c = (unsigned char) s[i];
1191 if (!isxdigit(c)) {
1192 if (unicodeescape_decoding_error(&s, &x, errors, message))
1193 goto onError;
1194 chr = x;
1195 i++;
1196 break;
1198 chr = (chr<<4) & ~0xF;
1199 if (c >= '0' && c <= '9')
1200 chr += c - '0';
1201 else if (c >= 'a' && c <= 'f')
1202 chr += 10 + c - 'a';
1203 else
1204 chr += 10 + c - 'A';
1206 s += i;
1207 store:
1208 /* when we get here, chr is a 32-bit unicode character */
1209 if (chr <= 0xffff)
1210 /* UCS-2 character */
1211 *p++ = (Py_UNICODE) chr;
1212 else if (chr <= 0x10ffff) {
1213 /* UCS-4 character. store as two surrogate characters */
1214 chr -= 0x10000L;
1215 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1216 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1217 } else {
1218 if (unicodeescape_decoding_error(
1219 &s, &x, errors,
1220 "illegal Unicode character")
1222 goto onError;
1223 *p++ = x; /* store replacement character */
1225 break;
1227 /* \N{name} */
1228 case 'N':
1229 message = "malformed \\N character escape";
1230 if (ucnhash_CAPI == NULL) {
1231 /* load the unicode data module */
1232 PyObject *m, *v;
1233 m = PyImport_ImportModule("unicodedata");
1234 if (m == NULL)
1235 goto ucnhashError;
1236 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1237 Py_DECREF(m);
1238 if (v == NULL)
1239 goto ucnhashError;
1240 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1241 Py_DECREF(v);
1242 if (ucnhash_CAPI == NULL)
1243 goto ucnhashError;
1245 if (*s == '{') {
1246 const char *start = s+1;
1247 /* look for the closing brace */
1248 while (*s != '}' && s < end)
1249 s++;
1250 if (s > start && s < end && *s == '}') {
1251 /* found a name. look it up in the unicode database */
1252 message = "unknown Unicode character name";
1253 s++;
1254 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1255 goto store;
1258 if (unicodeescape_decoding_error(&s, &x, errors, message))
1259 goto onError;
1260 *p++ = x;
1261 break;
1263 default:
1264 *p++ = '\\';
1265 *p++ = (unsigned char)s[-1];
1266 break;
1269 if (_PyUnicode_Resize(v, (int)(p - buf)))
1270 goto onError;
1271 return (PyObject *)v;
1273 ucnhashError:
1274 PyErr_SetString(
1275 PyExc_UnicodeError,
1276 "\\N escapes not supported (can't load unicodedata module)"
1278 return NULL;
1280 onError:
1281 Py_XDECREF(v);
1282 return NULL;
1285 /* Return a Unicode-Escape string version of the Unicode object.
1287 If quotes is true, the string is enclosed in u"" or u'' quotes as
1288 appropriate.
1292 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1293 int size,
1294 Py_UNICODE ch);
1296 static
1297 PyObject *unicodeescape_string(const Py_UNICODE *s,
1298 int size,
1299 int quotes)
1301 PyObject *repr;
1302 char *p;
1303 char *q;
1305 static const char *hexdigit = "0123456789abcdef";
1307 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1308 if (repr == NULL)
1309 return NULL;
1311 p = q = PyString_AS_STRING(repr);
1313 if (quotes) {
1314 *p++ = 'u';
1315 *p++ = (findchar(s, size, '\'') &&
1316 !findchar(s, size, '"')) ? '"' : '\'';
1318 while (size-- > 0) {
1319 Py_UNICODE ch = *s++;
1320 /* Escape quotes */
1321 if (quotes && (ch == q[1] || ch == '\\')) {
1322 *p++ = '\\';
1323 *p++ = (char) ch;
1325 /* Map 16-bit characters to '\uxxxx' */
1326 else if (ch >= 256) {
1327 *p++ = '\\';
1328 *p++ = 'u';
1329 *p++ = hexdigit[(ch >> 12) & 0xf];
1330 *p++ = hexdigit[(ch >> 8) & 0xf];
1331 *p++ = hexdigit[(ch >> 4) & 0xf];
1332 *p++ = hexdigit[ch & 15];
1334 /* Map special whitespace to '\t', \n', '\r' */
1335 else if (ch == '\t') {
1336 *p++ = '\\';
1337 *p++ = 't';
1339 else if (ch == '\n') {
1340 *p++ = '\\';
1341 *p++ = 'n';
1343 else if (ch == '\r') {
1344 *p++ = '\\';
1345 *p++ = 'r';
1347 /* Map non-printable US ASCII to '\xhh' */
1348 else if (ch < ' ' || ch >= 128) {
1349 *p++ = '\\';
1350 *p++ = 'x';
1351 *p++ = hexdigit[(ch >> 4) & 0xf];
1352 *p++ = hexdigit[ch & 15];
1354 /* Copy everything else as-is */
1355 else
1356 *p++ = (char) ch;
1358 if (quotes)
1359 *p++ = q[1];
1361 *p = '\0';
1362 if (_PyString_Resize(&repr, p - q))
1363 goto onError;
1365 return repr;
1367 onError:
1368 Py_DECREF(repr);
1369 return NULL;
1372 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1373 int size)
1375 return unicodeescape_string(s, size, 0);
1378 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1380 if (!PyUnicode_Check(unicode)) {
1381 PyErr_BadArgument();
1382 return NULL;
1384 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1385 PyUnicode_GET_SIZE(unicode));
1388 /* --- Raw Unicode Escape Codec ------------------------------------------- */
/* Decode size bytes starting at s with the Raw-Unicode-Escape codec.

   Bytes other than backslash are copied through as ordinals 0-255.
   A run of backslashes is copied verbatim; only when the run has odd
   length and is followed by 'u' is the last copied backslash dropped
   and the following four hex digits turned into one code unit.
   Malformed \uXXXX sequences are passed, together with errors, to
   unicodeescape_decoding_error() (defined outside this block).

   Returns a new Unicode object or NULL on error. */
1390 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1391 int size,
1392 const char *errors)
1394 PyUnicodeObject *v;
1395 Py_UNICODE *p, *buf;
1396 const char *end;
1397 const char *bs;
1399 /* Escaped strings will always be longer than the resulting
1400 Unicode string, so we start with size here and then reduce the
1401 length after conversion to the true value. */
1402 v = _PyUnicode_New(size);
1403 if (v == NULL)
1404 goto onError;
1405 if (size == 0)
1406 return (PyObject *)v;
1407 p = buf = PyUnicode_AS_UNICODE(v);
1408 end = s + size;
1409 while (s < end) {
1410 unsigned char c;
1411 Py_UNICODE x;
1412 int i;
1414 /* Non-escape characters are interpreted as Unicode ordinals */
1415 if (*s != '\\') {
1416 *p++ = (unsigned char)*s++;
1417 continue;
1420 /* \u-escapes are only interpreted iff the number of leading
1421 backslashes is odd */
/* bs marks the start of the backslash run so its length can be
   computed as s - bs once the run has been copied. */
1422 bs = s;
1423 for (;s < end;) {
1424 if (*s != '\\')
1425 break;
1426 *p++ = (unsigned char)*s++;
1428 if (((s - bs) & 1) == 0 ||
1429 s >= end ||
1430 *s != 'u') {
1431 continue;
/* Un-write the last copied backslash and step over the 'u'. */
1433 p--;
1434 s++;
1436 /* \uXXXX with 4 hex digits */
/* NOTE(review): s[i] is read without comparing against end; this
   appears to rely on the input being followed by a non-hex byte
   (e.g. a terminating NUL) -- confirm for all callers. */
1437 for (x = 0, i = 0; i < 4; i++) {
1438 c = (unsigned char)s[i];
1439 if (!isxdigit(c)) {
/* A non-zero result from the handler aborts decoding; otherwise
   whatever value x holds afterwards is stored below. */
1440 if (unicodeescape_decoding_error(&s, &x, errors,
1441 "truncated \\uXXXX"))
1442 goto onError;
1443 i++;
1444 break;
1446 x = (x<<4) & ~0xF;
1447 if (c >= '0' && c <= '9')
1448 x += c - '0';
1449 else if (c >= 'a' && c <= 'f')
1450 x += 10 + c - 'a';
1451 else
1452 x += 10 + c - 'A';
/* Advance past the bytes examined and emit the decoded code unit. */
1454 s += i;
1455 *p++ = x;
/* Shrink the result to the number of code units actually written. */
1457 if (_PyUnicode_Resize(v, (int)(p - buf)))
1458 goto onError;
1459 return (PyObject *)v;
1461 onError:
1462 Py_XDECREF(v);
1463 return NULL;
1466 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1467 int size)
1469 PyObject *repr;
1470 char *p;
1471 char *q;
1473 static const char *hexdigit = "0123456789abcdef";
1475 repr = PyString_FromStringAndSize(NULL, 6 * size);
1476 if (repr == NULL)
1477 return NULL;
1478 if (size == 0)
1479 return repr;
1481 p = q = PyString_AS_STRING(repr);
1482 while (size-- > 0) {
1483 Py_UNICODE ch = *s++;
1484 /* Map 16-bit characters to '\uxxxx' */
1485 if (ch >= 256) {
1486 *p++ = '\\';
1487 *p++ = 'u';
1488 *p++ = hexdigit[(ch >> 12) & 0xf];
1489 *p++ = hexdigit[(ch >> 8) & 0xf];
1490 *p++ = hexdigit[(ch >> 4) & 0xf];
1491 *p++ = hexdigit[ch & 15];
1493 /* Copy everything else as-is */
1494 else
1495 *p++ = (char) ch;
1497 *p = '\0';
1498 if (_PyString_Resize(&repr, p - q))
1499 goto onError;
1501 return repr;
1503 onError:
1504 Py_DECREF(repr);
1505 return NULL;
1508 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1510 if (!PyUnicode_Check(unicode)) {
1511 PyErr_BadArgument();
1512 return NULL;
1514 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode));
1518 /* --- Latin-1 Codec ------------------------------------------------------ */
1520 PyObject *PyUnicode_DecodeLatin1(const char *s,
1521 int size,
1522 const char *errors)
1524 PyUnicodeObject *v;
1525 Py_UNICODE *p;
1527 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1528 v = _PyUnicode_New(size);
1529 if (v == NULL)
1530 goto onError;
1531 if (size == 0)
1532 return (PyObject *)v;
1533 p = PyUnicode_AS_UNICODE(v);
1534 while (size-- > 0)
1535 *p++ = (unsigned char)*s++;
1536 return (PyObject *)v;
1538 onError:
1539 Py_XDECREF(v);
1540 return NULL;
1543 static
1544 int latin1_encoding_error(const Py_UNICODE **source,
1545 char **dest,
1546 const char *errors,
1547 const char *details)
1549 if ((errors == NULL) ||
1550 (strcmp(errors,"strict") == 0)) {
1551 PyErr_Format(PyExc_UnicodeError,
1552 "Latin-1 encoding error: %.400s",
1553 details);
1554 return -1;
1556 else if (strcmp(errors,"ignore") == 0) {
1557 return 0;
1559 else if (strcmp(errors,"replace") == 0) {
1560 **dest = '?';
1561 (*dest)++;
1562 return 0;
1564 else {
1565 PyErr_Format(PyExc_ValueError,
1566 "Latin-1 encoding error; "
1567 "unknown error handling code: %.400s",
1568 errors);
1569 return -1;
1573 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1574 int size,
1575 const char *errors)
1577 PyObject *repr;
1578 char *s, *start;
1580 repr = PyString_FromStringAndSize(NULL, size);
1581 if (repr == NULL)
1582 return NULL;
1583 if (size == 0)
1584 return repr;
1586 s = PyString_AS_STRING(repr);
1587 start = s;
1588 while (size-- > 0) {
1589 Py_UNICODE ch = *p++;
1590 if (ch >= 256) {
1591 if (latin1_encoding_error(&p, &s, errors,
1592 "ordinal not in range(256)"))
1593 goto onError;
1595 else
1596 *s++ = (char)ch;
1598 /* Resize if error handling skipped some characters */
1599 if (s - start < PyString_GET_SIZE(repr))
1600 if (_PyString_Resize(&repr, s - start))
1601 goto onError;
1602 return repr;
1604 onError:
1605 Py_DECREF(repr);
1606 return NULL;
1609 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1611 if (!PyUnicode_Check(unicode)) {
1612 PyErr_BadArgument();
1613 return NULL;
1615 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1616 PyUnicode_GET_SIZE(unicode),
1617 NULL);
1620 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1622 static
1623 int ascii_decoding_error(const char **source,
1624 Py_UNICODE **dest,
1625 const char *errors,
1626 const char *details)
1628 if ((errors == NULL) ||
1629 (strcmp(errors,"strict") == 0)) {
1630 PyErr_Format(PyExc_UnicodeError,
1631 "ASCII decoding error: %.400s",
1632 details);
1633 return -1;
1635 else if (strcmp(errors,"ignore") == 0) {
1636 return 0;
1638 else if (strcmp(errors,"replace") == 0) {
1639 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1640 (*dest)++;
1641 return 0;
1643 else {
1644 PyErr_Format(PyExc_ValueError,
1645 "ASCII decoding error; "
1646 "unknown error handling code: %.400s",
1647 errors);
1648 return -1;
1652 PyObject *PyUnicode_DecodeASCII(const char *s,
1653 int size,
1654 const char *errors)
1656 PyUnicodeObject *v;
1657 Py_UNICODE *p;
1659 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1660 v = _PyUnicode_New(size);
1661 if (v == NULL)
1662 goto onError;
1663 if (size == 0)
1664 return (PyObject *)v;
1665 p = PyUnicode_AS_UNICODE(v);
1666 while (size-- > 0) {
1667 register unsigned char c;
1669 c = (unsigned char)*s++;
1670 if (c < 128)
1671 *p++ = c;
1672 else if (ascii_decoding_error(&s, &p, errors,
1673 "ordinal not in range(128)"))
1674 goto onError;
1676 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1677 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1678 goto onError;
1679 return (PyObject *)v;
1681 onError:
1682 Py_XDECREF(v);
1683 return NULL;
1686 static
1687 int ascii_encoding_error(const Py_UNICODE **source,
1688 char **dest,
1689 const char *errors,
1690 const char *details)
1692 if ((errors == NULL) ||
1693 (strcmp(errors,"strict") == 0)) {
1694 PyErr_Format(PyExc_UnicodeError,
1695 "ASCII encoding error: %.400s",
1696 details);
1697 return -1;
1699 else if (strcmp(errors,"ignore") == 0) {
1700 return 0;
1702 else if (strcmp(errors,"replace") == 0) {
1703 **dest = '?';
1704 (*dest)++;
1705 return 0;
1707 else {
1708 PyErr_Format(PyExc_ValueError,
1709 "ASCII encoding error; "
1710 "unknown error handling code: %.400s",
1711 errors);
1712 return -1;
1716 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1717 int size,
1718 const char *errors)
1720 PyObject *repr;
1721 char *s, *start;
1723 repr = PyString_FromStringAndSize(NULL, size);
1724 if (repr == NULL)
1725 return NULL;
1726 if (size == 0)
1727 return repr;
1729 s = PyString_AS_STRING(repr);
1730 start = s;
1731 while (size-- > 0) {
1732 Py_UNICODE ch = *p++;
1733 if (ch >= 128) {
1734 if (ascii_encoding_error(&p, &s, errors,
1735 "ordinal not in range(128)"))
1736 goto onError;
1738 else
1739 *s++ = (char)ch;
1741 /* Resize if error handling skipped some characters */
1742 if (s - start < PyString_GET_SIZE(repr))
1743 if (_PyString_Resize(&repr, s - start))
1744 goto onError;
1745 return repr;
1747 onError:
1748 Py_DECREF(repr);
1749 return NULL;
1752 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1754 if (!PyUnicode_Check(unicode)) {
1755 PyErr_BadArgument();
1756 return NULL;
1758 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1759 PyUnicode_GET_SIZE(unicode),
1760 NULL);
1763 #ifdef MS_WIN32
1765 /* --- MBCS codecs for Windows -------------------------------------------- */
1767 PyObject *PyUnicode_DecodeMBCS(const char *s,
1768 int size,
1769 const char *errors)
1771 PyUnicodeObject *v;
1772 Py_UNICODE *p;
1774 /* First get the size of the result */
1775 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1776 if (size > 0 && usize==0)
1777 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1779 v = _PyUnicode_New(usize);
1780 if (v == NULL)
1781 return NULL;
1782 if (usize == 0)
1783 return (PyObject *)v;
1784 p = PyUnicode_AS_UNICODE(v);
1785 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1786 Py_DECREF(v);
1787 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1790 return (PyObject *)v;
1793 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1794 int size,
1795 const char *errors)
1797 PyObject *repr;
1798 char *s;
1799 DWORD mbcssize;
1801 /* If there are no characters, bail now! */
1802 if (size==0)
1803 return PyString_FromString("");
1805 /* First get the size of the result */
1806 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1807 if (mbcssize==0)
1808 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1810 repr = PyString_FromStringAndSize(NULL, mbcssize);
1811 if (repr == NULL)
1812 return NULL;
1813 if (mbcssize == 0)
1814 return repr;
1816 /* Do the conversion */
1817 s = PyString_AS_STRING(repr);
1818 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1819 Py_DECREF(repr);
1820 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1822 return repr;
1825 #endif /* MS_WIN32 */
1827 /* --- Character Mapping Codec -------------------------------------------- */
1829 static
1830 int charmap_decoding_error(const char **source,
1831 Py_UNICODE **dest,
1832 const char *errors,
1833 const char *details)
1835 if ((errors == NULL) ||
1836 (strcmp(errors,"strict") == 0)) {
1837 PyErr_Format(PyExc_UnicodeError,
1838 "charmap decoding error: %.400s",
1839 details);
1840 return -1;
1842 else if (strcmp(errors,"ignore") == 0) {
1843 return 0;
1845 else if (strcmp(errors,"replace") == 0) {
1846 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1847 (*dest)++;
1848 return 0;
1850 else {
1851 PyErr_Format(PyExc_ValueError,
1852 "charmap decoding error; "
1853 "unknown error handling code: %.400s",
1854 errors);
1855 return -1;
/* Decode size bytes starting at s through the given mapping object.

   Each byte value is looked up in mapping with PyObject_GetItem().
   An integer result in range(65536) is stored as one code unit, a
   Unicode result is copied in full (growing the output as needed),
   and None or a LookupError is treated as an undefined mapping and
   handed to charmap_decoding_error() together with errors.  Any
   other result type raises TypeError.  A NULL mapping falls back to
   PyUnicode_DecodeLatin1().

   Returns a new Unicode object or NULL on error. */
1859 PyObject *PyUnicode_DecodeCharmap(const char *s,
1860 int size,
1861 PyObject *mapping,
1862 const char *errors)
1864 PyUnicodeObject *v;
1865 Py_UNICODE *p;
/* Spare output slots left over from earlier growth for 1-n mappings. */
1866 int extrachars = 0;
1868 /* Default to Latin-1 */
1869 if (mapping == NULL)
1870 return PyUnicode_DecodeLatin1(s, size, errors);
1872 v = _PyUnicode_New(size);
1873 if (v == NULL)
1874 goto onError;
1875 if (size == 0)
1876 return (PyObject *)v;
1877 p = PyUnicode_AS_UNICODE(v);
1878 while (size-- > 0) {
1879 unsigned char ch = *s++;
1880 PyObject *w, *x;
1882 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1883 w = PyInt_FromLong((long)ch);
1884 if (w == NULL)
1885 goto onError;
1886 x = PyObject_GetItem(mapping, w);
1887 Py_DECREF(w);
1888 if (x == NULL) {
1889 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1890 /* No mapping found means: mapping is undefined. */
1891 PyErr_Clear();
1892 x = Py_None;
1893 Py_INCREF(x);
1894 } else
1895 goto onError;
1898 /* Apply mapping */
1899 if (PyInt_Check(x)) {
1900 long value = PyInt_AS_LONG(x);
1901 if (value < 0 || value > 65535) {
1902 PyErr_SetString(PyExc_TypeError,
1903 "character mapping must be in range(65536)");
1904 Py_DECREF(x);
1905 goto onError;
1907 *p++ = (Py_UNICODE)value;
1909 else if (x == Py_None) {
1910 /* undefined mapping */
1911 if (charmap_decoding_error(&s, &p, errors,
1912 "character maps to <undefined>")) {
1913 Py_DECREF(x);
1914 goto onError;
1917 else if (PyUnicode_Check(x)) {
1918 int targetsize = PyUnicode_GET_SIZE(x);
1920 if (targetsize == 1)
1921 /* 1-1 mapping */
1922 *p++ = *PyUnicode_AS_UNICODE(x);
1924 else if (targetsize > 1) {
1925 /* 1-n mapping */
1926 if (targetsize > extrachars) {
1927 /* resize first */
/* The write position is saved as an offset and p is recomputed
   from the object's buffer after the resize.  The growth amount
   covers the shortfall plus four times targetsize of headroom. */
1928 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1929 int needed = (targetsize - extrachars) + \
1930 (targetsize << 2);
1931 extrachars += needed;
1932 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
1933 Py_DECREF(x);
1934 goto onError;
1936 p = PyUnicode_AS_UNICODE(v) + oldpos;
1938 Py_UNICODE_COPY(p,
1939 PyUnicode_AS_UNICODE(x),
1940 targetsize);
1941 p += targetsize;
1942 extrachars -= targetsize;
1944 /* 1-0 mapping: skip the character */
1946 else {
1947 /* wrong return value */
1948 PyErr_SetString(PyExc_TypeError,
1949 "character mapping must return integer, None or unicode");
1950 Py_DECREF(x);
1951 goto onError;
1953 Py_DECREF(x);
/* Trim the result to the number of code units actually written. */
1955 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1956 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1957 goto onError;
1958 return (PyObject *)v;
1960 onError:
1961 Py_XDECREF(v);
1962 return NULL;
1965 static
1966 int charmap_encoding_error(const Py_UNICODE **source,
1967 char **dest,
1968 const char *errors,
1969 const char *details)
1971 if ((errors == NULL) ||
1972 (strcmp(errors,"strict") == 0)) {
1973 PyErr_Format(PyExc_UnicodeError,
1974 "charmap encoding error: %.400s",
1975 details);
1976 return -1;
1978 else if (strcmp(errors,"ignore") == 0) {
1979 return 0;
1981 else if (strcmp(errors,"replace") == 0) {
1982 **dest = '?';
1983 (*dest)++;
1984 return 0;
1986 else {
1987 PyErr_Format(PyExc_ValueError,
1988 "charmap encoding error; "
1989 "unknown error handling code: %.400s",
1990 errors);
1991 return -1;
1995 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1996 int size,
1997 PyObject *mapping,
1998 const char *errors)
2000 PyObject *v;
2001 char *s;
2002 int extrachars = 0;
2004 /* Default to Latin-1 */
2005 if (mapping == NULL)
2006 return PyUnicode_EncodeLatin1(p, size, errors);
2008 v = PyString_FromStringAndSize(NULL, size);
2009 if (v == NULL)
2010 return NULL;
2011 if (size == 0)
2012 return v;
2013 s = PyString_AS_STRING(v);
2014 while (size-- > 0) {
2015 Py_UNICODE ch = *p++;
2016 PyObject *w, *x;
2018 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2019 w = PyInt_FromLong((long)ch);
2020 if (w == NULL)
2021 goto onError;
2022 x = PyObject_GetItem(mapping, w);
2023 Py_DECREF(w);
2024 if (x == NULL) {
2025 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2026 /* No mapping found means: mapping is undefined. */
2027 PyErr_Clear();
2028 x = Py_None;
2029 Py_INCREF(x);
2030 } else
2031 goto onError;
2034 /* Apply mapping */
2035 if (PyInt_Check(x)) {
2036 long value = PyInt_AS_LONG(x);
2037 if (value < 0 || value > 255) {
2038 PyErr_SetString(PyExc_TypeError,
2039 "character mapping must be in range(256)");
2040 Py_DECREF(x);
2041 goto onError;
2043 *s++ = (char)value;
2045 else if (x == Py_None) {
2046 /* undefined mapping */
2047 if (charmap_encoding_error(&p, &s, errors,
2048 "character maps to <undefined>")) {
2049 Py_DECREF(x);
2050 goto onError;
2053 else if (PyString_Check(x)) {
2054 int targetsize = PyString_GET_SIZE(x);
2056 if (targetsize == 1)
2057 /* 1-1 mapping */
2058 *s++ = *PyString_AS_STRING(x);
2060 else if (targetsize > 1) {
2061 /* 1-n mapping */
2062 if (targetsize > extrachars) {
2063 /* resize first */
2064 int oldpos = (int)(s - PyString_AS_STRING(v));
2065 int needed = (targetsize - extrachars) + \
2066 (targetsize << 2);
2067 extrachars += needed;
2068 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2069 Py_DECREF(x);
2070 goto onError;
2072 s = PyString_AS_STRING(v) + oldpos;
2074 memcpy(s,
2075 PyString_AS_STRING(x),
2076 targetsize);
2077 s += targetsize;
2078 extrachars -= targetsize;
2080 /* 1-0 mapping: skip the character */
2082 else {
2083 /* wrong return value */
2084 PyErr_SetString(PyExc_TypeError,
2085 "character mapping must return integer, None or unicode");
2086 Py_DECREF(x);
2087 goto onError;
2089 Py_DECREF(x);
2091 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2092 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2093 goto onError;
2094 return v;
2096 onError:
2097 Py_DECREF(v);
2098 return NULL;
2101 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2102 PyObject *mapping)
2104 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2105 PyErr_BadArgument();
2106 return NULL;
2108 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2109 PyUnicode_GET_SIZE(unicode),
2110 mapping,
2111 NULL);
2114 static
2115 int translate_error(const Py_UNICODE **source,
2116 Py_UNICODE **dest,
2117 const char *errors,
2118 const char *details)
2120 if ((errors == NULL) ||
2121 (strcmp(errors,"strict") == 0)) {
2122 PyErr_Format(PyExc_UnicodeError,
2123 "translate error: %.400s",
2124 details);
2125 return -1;
2127 else if (strcmp(errors,"ignore") == 0) {
2128 return 0;
2130 else if (strcmp(errors,"replace") == 0) {
2131 **dest = '?';
2132 (*dest)++;
2133 return 0;
2135 else {
2136 PyErr_Format(PyExc_ValueError,
2137 "translate error; "
2138 "unknown error handling code: %.400s",
2139 errors);
2140 return -1;
2144 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2145 int size,
2146 PyObject *mapping,
2147 const char *errors)
2149 PyUnicodeObject *v;
2150 Py_UNICODE *p;
2152 if (mapping == NULL) {
2153 PyErr_BadArgument();
2154 return NULL;
2157 /* Output will never be longer than input */
2158 v = _PyUnicode_New(size);
2159 if (v == NULL)
2160 goto onError;
2161 if (size == 0)
2162 goto done;
2163 p = PyUnicode_AS_UNICODE(v);
2164 while (size-- > 0) {
2165 Py_UNICODE ch = *s++;
2166 PyObject *w, *x;
2168 /* Get mapping */
2169 w = PyInt_FromLong(ch);
2170 if (w == NULL)
2171 goto onError;
2172 x = PyObject_GetItem(mapping, w);
2173 Py_DECREF(w);
2174 if (x == NULL) {
2175 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2176 /* No mapping found: default to 1-1 mapping */
2177 PyErr_Clear();
2178 *p++ = ch;
2179 continue;
2181 goto onError;
2184 /* Apply mapping */
2185 if (PyInt_Check(x))
2186 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2187 else if (x == Py_None) {
2188 /* undefined mapping */
2189 if (translate_error(&s, &p, errors,
2190 "character maps to <undefined>")) {
2191 Py_DECREF(x);
2192 goto onError;
2195 else if (PyUnicode_Check(x)) {
2196 if (PyUnicode_GET_SIZE(x) != 1) {
2197 /* 1-n mapping */
2198 PyErr_SetString(PyExc_NotImplementedError,
2199 "1-n mappings are currently not implemented");
2200 Py_DECREF(x);
2201 goto onError;
2203 *p++ = *PyUnicode_AS_UNICODE(x);
2205 else {
2206 /* wrong return value */
2207 PyErr_SetString(PyExc_TypeError,
2208 "translate mapping must return integer, None or unicode");
2209 Py_DECREF(x);
2210 goto onError;
2212 Py_DECREF(x);
2214 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2215 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2216 goto onError;
2218 done:
2219 return (PyObject *)v;
2221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2226 PyObject *PyUnicode_Translate(PyObject *str,
2227 PyObject *mapping,
2228 const char *errors)
2230 PyObject *result;
2232 str = PyUnicode_FromObject(str);
2233 if (str == NULL)
2234 goto onError;
2235 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2236 PyUnicode_GET_SIZE(str),
2237 mapping,
2238 errors);
2239 Py_DECREF(str);
2240 return result;
2242 onError:
2243 Py_XDECREF(str);
2244 return NULL;
2247 /* --- Decimal Encoder ---------------------------------------------------- */
2249 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2250 int length,
2251 char *output,
2252 const char *errors)
2254 Py_UNICODE *p, *end;
2256 if (output == NULL) {
2257 PyErr_BadArgument();
2258 return -1;
2261 p = s;
2262 end = s + length;
2263 while (p < end) {
2264 register Py_UNICODE ch = *p++;
2265 int decimal;
2267 if (Py_UNICODE_ISSPACE(ch)) {
2268 *output++ = ' ';
2269 continue;
2271 decimal = Py_UNICODE_TODECIMAL(ch);
2272 if (decimal >= 0) {
2273 *output++ = '0' + decimal;
2274 continue;
2276 if (0 < ch && ch < 256) {
2277 *output++ = (char)ch;
2278 continue;
2280 /* All other characters are considered invalid */
2281 if (errors == NULL || strcmp(errors, "strict") == 0) {
2282 PyErr_SetString(PyExc_ValueError,
2283 "invalid decimal Unicode string");
2284 goto onError;
2286 else if (strcmp(errors, "ignore") == 0)
2287 continue;
2288 else if (strcmp(errors, "replace") == 0) {
2289 *output++ = '?';
2290 continue;
2293 /* 0-terminate the output string */
2294 *output++ = '\0';
2295 return 0;
2297 onError:
2298 return -1;
2301 /* --- Helpers ------------------------------------------------------------ */
2303 static
2304 int count(PyUnicodeObject *self,
2305 int start,
2306 int end,
2307 PyUnicodeObject *substring)
2309 int count = 0;
2311 if (start < 0)
2312 start += self->length;
2313 if (start < 0)
2314 start = 0;
2315 if (end > self->length)
2316 end = self->length;
2317 if (end < 0)
2318 end += self->length;
2319 if (end < 0)
2320 end = 0;
2322 if (substring->length == 0)
2323 return (end - start + 1);
2325 end -= substring->length;
2327 while (start <= end)
2328 if (Py_UNICODE_MATCH(self, start, substring)) {
2329 count++;
2330 start += substring->length;
2331 } else
2332 start++;
2334 return count;
2337 int PyUnicode_Count(PyObject *str,
2338 PyObject *substr,
2339 int start,
2340 int end)
2342 int result;
2344 str = PyUnicode_FromObject(str);
2345 if (str == NULL)
2346 return -1;
2347 substr = PyUnicode_FromObject(substr);
2348 if (substr == NULL) {
2349 Py_DECREF(str);
2350 return -1;
2353 result = count((PyUnicodeObject *)str,
2354 start, end,
2355 (PyUnicodeObject *)substr);
2357 Py_DECREF(str);
2358 Py_DECREF(substr);
2359 return result;
2362 static
2363 int findstring(PyUnicodeObject *self,
2364 PyUnicodeObject *substring,
2365 int start,
2366 int end,
2367 int direction)
2369 if (start < 0)
2370 start += self->length;
2371 if (start < 0)
2372 start = 0;
2374 if (substring->length == 0)
2375 return start;
2377 if (end > self->length)
2378 end = self->length;
2379 if (end < 0)
2380 end += self->length;
2381 if (end < 0)
2382 end = 0;
2384 end -= substring->length;
2386 if (direction < 0) {
2387 for (; end >= start; end--)
2388 if (Py_UNICODE_MATCH(self, end, substring))
2389 return end;
2390 } else {
2391 for (; start <= end; start++)
2392 if (Py_UNICODE_MATCH(self, start, substring))
2393 return start;
2396 return -1;
2399 int PyUnicode_Find(PyObject *str,
2400 PyObject *substr,
2401 int start,
2402 int end,
2403 int direction)
2405 int result;
2407 str = PyUnicode_FromObject(str);
2408 if (str == NULL)
2409 return -1;
2410 substr = PyUnicode_FromObject(substr);
2411 if (substr == NULL) {
2412 Py_DECREF(substr);
2413 return -1;
2416 result = findstring((PyUnicodeObject *)str,
2417 (PyUnicodeObject *)substr,
2418 start, end, direction);
2419 Py_DECREF(str);
2420 Py_DECREF(substr);
2421 return result;
2424 static
2425 int tailmatch(PyUnicodeObject *self,
2426 PyUnicodeObject *substring,
2427 int start,
2428 int end,
2429 int direction)
2431 if (start < 0)
2432 start += self->length;
2433 if (start < 0)
2434 start = 0;
2436 if (substring->length == 0)
2437 return 1;
2439 if (end > self->length)
2440 end = self->length;
2441 if (end < 0)
2442 end += self->length;
2443 if (end < 0)
2444 end = 0;
2446 end -= substring->length;
2447 if (end < start)
2448 return 0;
2450 if (direction > 0) {
2451 if (Py_UNICODE_MATCH(self, end, substring))
2452 return 1;
2453 } else {
2454 if (Py_UNICODE_MATCH(self, start, substring))
2455 return 1;
2458 return 0;
2461 int PyUnicode_Tailmatch(PyObject *str,
2462 PyObject *substr,
2463 int start,
2464 int end,
2465 int direction)
2467 int result;
2469 str = PyUnicode_FromObject(str);
2470 if (str == NULL)
2471 return -1;
2472 substr = PyUnicode_FromObject(substr);
2473 if (substr == NULL) {
2474 Py_DECREF(substr);
2475 return -1;
2478 result = tailmatch((PyUnicodeObject *)str,
2479 (PyUnicodeObject *)substr,
2480 start, end, direction);
2481 Py_DECREF(str);
2482 Py_DECREF(substr);
2483 return result;
2486 static
2487 const Py_UNICODE *findchar(const Py_UNICODE *s,
2488 int size,
2489 Py_UNICODE ch)
2491 /* like wcschr, but doesn't stop at NULL characters */
2493 while (size-- > 0) {
2494 if (*s == ch)
2495 return s;
2496 s++;
2499 return NULL;
2502 /* Apply fixfct filter to the Unicode object self and return a
2503 reference to the modified object */
2505 static
2506 PyObject *fixup(PyUnicodeObject *self,
2507 int (*fixfct)(PyUnicodeObject *s))
2510 PyUnicodeObject *u;
2512 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2513 self->length);
2514 if (u == NULL)
2515 return NULL;
2516 if (!fixfct(u)) {
2517 /* fixfct should return TRUE if it modified the buffer. If
2518 FALSE, return a reference to the original buffer instead
2519 (to save space, not time) */
2520 Py_INCREF(self);
2521 Py_DECREF(u);
2522 return (PyObject*) self;
2524 return (PyObject*) u;
2527 static
2528 int fixupper(PyUnicodeObject *self)
2530 int len = self->length;
2531 Py_UNICODE *s = self->str;
2532 int status = 0;
2534 while (len-- > 0) {
2535 register Py_UNICODE ch;
2537 ch = Py_UNICODE_TOUPPER(*s);
2538 if (ch != *s) {
2539 status = 1;
2540 *s = ch;
2542 s++;
2545 return status;
2548 static
2549 int fixlower(PyUnicodeObject *self)
2551 int len = self->length;
2552 Py_UNICODE *s = self->str;
2553 int status = 0;
2555 while (len-- > 0) {
2556 register Py_UNICODE ch;
2558 ch = Py_UNICODE_TOLOWER(*s);
2559 if (ch != *s) {
2560 status = 1;
2561 *s = ch;
2563 s++;
2566 return status;
2569 static
2570 int fixswapcase(PyUnicodeObject *self)
2572 int len = self->length;
2573 Py_UNICODE *s = self->str;
2574 int status = 0;
2576 while (len-- > 0) {
2577 if (Py_UNICODE_ISUPPER(*s)) {
2578 *s = Py_UNICODE_TOLOWER(*s);
2579 status = 1;
2580 } else if (Py_UNICODE_ISLOWER(*s)) {
2581 *s = Py_UNICODE_TOUPPER(*s);
2582 status = 1;
2584 s++;
2587 return status;
2590 static
2591 int fixcapitalize(PyUnicodeObject *self)
2593 int len = self->length;
2594 Py_UNICODE *s = self->str;
2595 int status = 0;
2597 if (len == 0)
2598 return 0;
2599 if (Py_UNICODE_ISLOWER(*s)) {
2600 *s = Py_UNICODE_TOUPPER(*s);
2601 status = 1;
2603 s++;
2604 while (--len > 0) {
2605 if (Py_UNICODE_ISUPPER(*s)) {
2606 *s = Py_UNICODE_TOLOWER(*s);
2607 status = 1;
2609 s++;
2611 return status;
2614 static
2615 int fixtitle(PyUnicodeObject *self)
2617 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2618 register Py_UNICODE *e;
2619 int previous_is_cased;
2621 /* Shortcut for single character strings */
2622 if (PyUnicode_GET_SIZE(self) == 1) {
2623 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2624 if (*p != ch) {
2625 *p = ch;
2626 return 1;
2628 else
2629 return 0;
2632 e = p + PyUnicode_GET_SIZE(self);
2633 previous_is_cased = 0;
2634 for (; p < e; p++) {
2635 register const Py_UNICODE ch = *p;
2637 if (previous_is_cased)
2638 *p = Py_UNICODE_TOLOWER(ch);
2639 else
2640 *p = Py_UNICODE_TOTITLE(ch);
2642 if (Py_UNICODE_ISLOWER(ch) ||
2643 Py_UNICODE_ISUPPER(ch) ||
2644 Py_UNICODE_ISTITLE(ch))
2645 previous_is_cased = 1;
2646 else
2647 previous_is_cased = 0;
2649 return 1;
2652 PyObject *PyUnicode_Join(PyObject *separator,
2653 PyObject *seq)
2655 Py_UNICODE *sep;
2656 int seplen;
2657 PyUnicodeObject *res = NULL;
2658 int reslen = 0;
2659 Py_UNICODE *p;
2660 int seqlen = 0;
2661 int sz = 100;
2662 int i;
2664 seqlen = PySequence_Size(seq);
2665 if (seqlen < 0 && PyErr_Occurred())
2666 return NULL;
2668 if (separator == NULL) {
2669 Py_UNICODE blank = ' ';
2670 sep = &blank;
2671 seplen = 1;
2673 else {
2674 separator = PyUnicode_FromObject(separator);
2675 if (separator == NULL)
2676 return NULL;
2677 sep = PyUnicode_AS_UNICODE(separator);
2678 seplen = PyUnicode_GET_SIZE(separator);
2681 res = _PyUnicode_New(sz);
2682 if (res == NULL)
2683 goto onError;
2684 p = PyUnicode_AS_UNICODE(res);
2685 reslen = 0;
2687 for (i = 0; i < seqlen; i++) {
2688 int itemlen;
2689 PyObject *item;
2691 item = PySequence_GetItem(seq, i);
2692 if (item == NULL)
2693 goto onError;
2694 if (!PyUnicode_Check(item)) {
2695 PyObject *v;
2696 v = PyUnicode_FromObject(item);
2697 Py_DECREF(item);
2698 item = v;
2699 if (item == NULL)
2700 goto onError;
2702 itemlen = PyUnicode_GET_SIZE(item);
2703 while (reslen + itemlen + seplen >= sz) {
2704 if (_PyUnicode_Resize(res, sz*2))
2705 goto onError;
2706 sz *= 2;
2707 p = PyUnicode_AS_UNICODE(res) + reslen;
2709 if (i > 0) {
2710 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2711 p += seplen;
2712 reslen += seplen;
2714 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2715 p += itemlen;
2716 reslen += itemlen;
2717 Py_DECREF(item);
2719 if (_PyUnicode_Resize(res, reslen))
2720 goto onError;
2722 Py_XDECREF(separator);
2723 return (PyObject *)res;
2725 onError:
2726 Py_XDECREF(separator);
2727 Py_DECREF(res);
2728 return NULL;
2731 static
2732 PyUnicodeObject *pad(PyUnicodeObject *self,
2733 int left,
2734 int right,
2735 Py_UNICODE fill)
2737 PyUnicodeObject *u;
2739 if (left < 0)
2740 left = 0;
2741 if (right < 0)
2742 right = 0;
2744 if (left == 0 && right == 0) {
2745 Py_INCREF(self);
2746 return self;
2749 u = _PyUnicode_New(left + self->length + right);
2750 if (u) {
2751 if (left)
2752 Py_UNICODE_FILL(u->str, fill, left);
2753 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2754 if (right)
2755 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2758 return u;
/* Append the slice data[left:right] to list as a new Unicode object.

   The expanding function must provide the locals `str` and `list`
   and an `onError` label, which is jumped to when either the slice
   cannot be created or the append fails.  Arguments are parenthesised
   and the body is wrapped in do/while(0) so the macro behaves like a
   single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2772 static
2773 PyObject *split_whitespace(PyUnicodeObject *self,
2774 PyObject *list,
2775 int maxcount)
2777 register int i;
2778 register int j;
2779 int len = self->length;
2780 PyObject *str;
2782 for (i = j = 0; i < len; ) {
2783 /* find a token */
2784 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2785 i++;
2786 j = i;
2787 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2788 i++;
2789 if (j < i) {
2790 if (maxcount-- <= 0)
2791 break;
2792 SPLIT_APPEND(self->str, j, i);
2793 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2794 i++;
2795 j = i;
2798 if (j < len) {
2799 SPLIT_APPEND(self->str, j, len);
2801 return list;
2803 onError:
2804 Py_DECREF(list);
2805 return NULL;
2808 PyObject *PyUnicode_Splitlines(PyObject *string,
2809 int keepends)
2811 register int i;
2812 register int j;
2813 int len;
2814 PyObject *list;
2815 PyObject *str;
2816 Py_UNICODE *data;
2818 string = PyUnicode_FromObject(string);
2819 if (string == NULL)
2820 return NULL;
2821 data = PyUnicode_AS_UNICODE(string);
2822 len = PyUnicode_GET_SIZE(string);
2824 list = PyList_New(0);
2825 if (!list)
2826 goto onError;
2828 for (i = j = 0; i < len; ) {
2829 int eol;
2831 /* Find a line and append it */
2832 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2833 i++;
2835 /* Skip the line break reading CRLF as one line break */
2836 eol = i;
2837 if (i < len) {
2838 if (data[i] == '\r' && i + 1 < len &&
2839 data[i+1] == '\n')
2840 i += 2;
2841 else
2842 i++;
2843 if (keepends)
2844 eol = i;
2846 SPLIT_APPEND(data, j, eol);
2847 j = i;
2849 if (j < len) {
2850 SPLIT_APPEND(data, j, len);
2853 Py_DECREF(string);
2854 return list;
2856 onError:
2857 Py_DECREF(list);
2858 Py_DECREF(string);
2859 return NULL;
2862 static
2863 PyObject *split_char(PyUnicodeObject *self,
2864 PyObject *list,
2865 Py_UNICODE ch,
2866 int maxcount)
2868 register int i;
2869 register int j;
2870 int len = self->length;
2871 PyObject *str;
2873 for (i = j = 0; i < len; ) {
2874 if (self->str[i] == ch) {
2875 if (maxcount-- <= 0)
2876 break;
2877 SPLIT_APPEND(self->str, j, i);
2878 i = j = i + 1;
2879 } else
2880 i++;
2882 if (j <= len) {
2883 SPLIT_APPEND(self->str, j, len);
2885 return list;
2887 onError:
2888 Py_DECREF(list);
2889 return NULL;
2892 static
2893 PyObject *split_substring(PyUnicodeObject *self,
2894 PyObject *list,
2895 PyUnicodeObject *substring,
2896 int maxcount)
2898 register int i;
2899 register int j;
2900 int len = self->length;
2901 int sublen = substring->length;
2902 PyObject *str;
2904 for (i = j = 0; i <= len - sublen; ) {
2905 if (Py_UNICODE_MATCH(self, i, substring)) {
2906 if (maxcount-- <= 0)
2907 break;
2908 SPLIT_APPEND(self->str, j, i);
2909 i = j = i + sublen;
2910 } else
2911 i++;
2913 if (j <= len) {
2914 SPLIT_APPEND(self->str, j, len);
2916 return list;
2918 onError:
2919 Py_DECREF(list);
2920 return NULL;
2923 #undef SPLIT_APPEND
2925 static
2926 PyObject *split(PyUnicodeObject *self,
2927 PyUnicodeObject *substring,
2928 int maxcount)
2930 PyObject *list;
2932 if (maxcount < 0)
2933 maxcount = INT_MAX;
2935 list = PyList_New(0);
2936 if (!list)
2937 return NULL;
2939 if (substring == NULL)
2940 return split_whitespace(self,list,maxcount);
2942 else if (substring->length == 1)
2943 return split_char(self,list,substring->str[0],maxcount);
2945 else if (substring->length == 0) {
2946 Py_DECREF(list);
2947 PyErr_SetString(PyExc_ValueError, "empty separator");
2948 return NULL;
2950 else
2951 return split_substring(self,list,substring,maxcount);
2954 static
2955 PyObject *strip(PyUnicodeObject *self,
2956 int left,
2957 int right)
2959 Py_UNICODE *p = self->str;
2960 int start = 0;
2961 int end = self->length;
2963 if (left)
2964 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2965 start++;
2967 if (right)
2968 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2969 end--;
2971 if (start == 0 && end == self->length) {
2972 /* couldn't strip anything off, return original string */
2973 Py_INCREF(self);
2974 return (PyObject*) self;
2977 return (PyObject*) PyUnicode_FromUnicode(
2978 self->str + start,
2979 end - start
/* Return an object in which occurrences of str1 in self are replaced by
   str2, performing at most maxcount replacements (a negative maxcount
   is turned into INT_MAX below).  When there is nothing to replace,
   self itself is returned with a new reference.  The result may be
   NULL: the values of PyUnicode_FromUnicode() / _PyUnicode_New() are
   only guarded by `if (u)` and otherwise passed straight through.
   NOTE(review): brace-only lines and the `);` closing the
   PyUnicode_FromUnicode() call are absent from this listing. */
2983 static
2984 PyObject *replace(PyUnicodeObject *self,
2985 PyUnicodeObject *str1,
2986 PyUnicodeObject *str2,
2987 int maxcount)
2989 PyUnicodeObject *u;
2991 if (maxcount < 0)
2992 maxcount = INT_MAX;
/* Fast path: both strings are one character long, so the result has
   the same length as self and can be patched in place in a copy. */
2994 if (str1->length == 1 && str2->length == 1) {
2995 int i;
2997 /* replace characters */
2998 if (!findchar(self->str, self->length, str1->str[0])) {
2999 /* nothing to replace, return original string */
3000 Py_INCREF(self);
3001 u = self;
3002 } else {
3003 Py_UNICODE u1 = str1->str[0];
3004 Py_UNICODE u2 = str2->str[0];
3006 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3007 self->str,
3008 self->length
3010 if (u)
3011 for (i = 0; i < u->length; i++)
3012 if (u->str[i] == u1) {
/* stop once the replacement budget is used up */
3013 if (--maxcount < 0)
3014 break;
3015 u->str[i] = u2;
3019 } else {
3020 int n, i;
3021 Py_UNICODE *p;
3023 /* replace strings */
/* n = value reported by count() for str1 in self, capped at maxcount */
3024 n = count(self, 0, self->length, str1);
3025 if (n > maxcount)
3026 n = maxcount;
3027 if (n == 0) {
3028 /* nothing to replace, return original string */
3029 Py_INCREF(self);
3030 u = self;
3031 } else {
/* each of the n replacements changes the length by
   str2->length - str1->length */
3032 u = _PyUnicode_New(
3033 self->length + n * (str2->length - str1->length));
3034 if (u) {
3035 i = 0;
3036 p = u->str;
/* i indexes self, p is the write position in the result */
3037 while (i <= self->length - str1->length)
3038 if (Py_UNICODE_MATCH(self, i, str1)) {
3039 /* replace string segment */
3040 Py_UNICODE_COPY(p, str2->str, str2->length);
3041 p += str2->length;
3042 i += str1->length;
3043 if (--n <= 0) {
3044 /* copy remaining part */
3045 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3046 break;
3048 } else
3049 *p++ = self->str[i++];
3054 return (PyObject *) u;
3057 /* --- Unicode Object Methods --------------------------------------------- */
3059 static char title__doc__[] =
3060 "S.title() -> unicode\n\
3062 Return a titlecased version of S, i.e. words start with title case\n\
3063 characters, all remaining cased characters have lower case.";
3065 static PyObject*
3066 unicode_title(PyUnicodeObject *self, PyObject *args)
3068 if (!PyArg_NoArgs(args))
3069 return NULL;
3070 return fixup(self, fixtitle);
3073 static char capitalize__doc__[] =
3074 "S.capitalize() -> unicode\n\
3076 Return a capitalized version of S, i.e. make the first character\n\
3077 have upper case.";
3079 static PyObject*
3080 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3082 if (!PyArg_NoArgs(args))
3083 return NULL;
3084 return fixup(self, fixcapitalize);
3087 #if 0
3088 static char capwords__doc__[] =
3089 "S.capwords() -> unicode\n\
3091 Apply .capitalize() to all words in S and return the result with\n\
3092 normalized whitespace (all whitespace strings are replaced by ' ').";
3094 static PyObject*
3095 unicode_capwords(PyUnicodeObject *self, PyObject *args)
3097 PyObject *list;
3098 PyObject *item;
3099 int i;
3101 if (!PyArg_NoArgs(args))
3102 return NULL;
3104 /* Split into words */
3105 list = split(self, NULL, -1);
3106 if (!list)
3107 return NULL;
3109 /* Capitalize each word */
3110 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3111 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3112 fixcapitalize);
3113 if (item == NULL)
3114 goto onError;
3115 Py_DECREF(PyList_GET_ITEM(list, i));
3116 PyList_SET_ITEM(list, i, item);
3119 /* Join the words to form a new string */
3120 item = PyUnicode_Join(NULL, list);
3122 onError:
3123 Py_DECREF(list);
3124 return (PyObject *)item;
3126 #endif
3128 static char center__doc__[] =
3129 "S.center(width) -> unicode\n\
3131 Return S centered in a Unicode string of length width. Padding is done\n\
3132 using spaces.";
3134 static PyObject *
3135 unicode_center(PyUnicodeObject *self, PyObject *args)
3137 int marg, left;
3138 int width;
3140 if (!PyArg_ParseTuple(args, "i:center", &width))
3141 return NULL;
3143 if (self->length >= width) {
3144 Py_INCREF(self);
3145 return (PyObject*) self;
3148 marg = width - self->length;
3149 left = marg / 2 + (marg & width & 1);
3151 return (PyObject*) pad(self, left, marg - left, ' ');
3154 #if 0
3156 /* This code should go into some future Unicode collation support
3157 module. The basic comparison should compare ordinals on a naive
3158 basis (this is what Java does and thus JPython too). */
3160 /* speedy UTF-16 code point order comparison */
3161 /* gleaned from: */
3162 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3164 static short utf16Fixup[32] =
3166 0, 0, 0, 0, 0, 0, 0, 0,
3167 0, 0, 0, 0, 0, 0, 0, 0,
3168 0, 0, 0, 0, 0, 0, 0, 0,
3169 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3172 static int
3173 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3175 int len1, len2;
3177 Py_UNICODE *s1 = str1->str;
3178 Py_UNICODE *s2 = str2->str;
3180 len1 = str1->length;
3181 len2 = str2->length;
3183 while (len1 > 0 && len2 > 0) {
3184 Py_UNICODE c1, c2;
3185 long diff;
3187 c1 = *s1++;
3188 c2 = *s2++;
3189 if (c1 > (1<<11) * 26)
3190 c1 += utf16Fixup[c1>>11];
3191 if (c2 > (1<<11) * 26)
3192 c2 += utf16Fixup[c2>>11];
3194 /* now c1 and c2 are in UTF-32-compatible order */
3195 diff = (long)c1 - (long)c2;
3196 if (diff)
3197 return (diff < 0) ? -1 : (diff != 0);
3198 len1--; len2--;
3201 return (len1 < len2) ? -1 : (len1 != len2);
3204 #else
3206 static int
3207 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3209 register int len1, len2;
3211 Py_UNICODE *s1 = str1->str;
3212 Py_UNICODE *s2 = str2->str;
3214 len1 = str1->length;
3215 len2 = str2->length;
3217 while (len1 > 0 && len2 > 0) {
3218 register long diff;
3220 diff = (long)*s1++ - (long)*s2++;
3221 if (diff)
3222 return (diff < 0) ? -1 : (diff != 0);
3223 len1--; len2--;
3226 return (len1 < len2) ? -1 : (len1 != len2);
3229 #endif
3231 int PyUnicode_Compare(PyObject *left,
3232 PyObject *right)
3234 PyUnicodeObject *u = NULL, *v = NULL;
3235 int result;
3237 /* Coerce the two arguments */
3238 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3239 if (u == NULL)
3240 goto onError;
3241 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3242 if (v == NULL)
3243 goto onError;
3245 /* Shortcut for empty or interned objects */
3246 if (v == u) {
3247 Py_DECREF(u);
3248 Py_DECREF(v);
3249 return 0;
3252 result = unicode_compare(u, v);
3254 Py_DECREF(u);
3255 Py_DECREF(v);
3256 return result;
3258 onError:
3259 Py_XDECREF(u);
3260 Py_XDECREF(v);
3261 return -1;
3264 int PyUnicode_Contains(PyObject *container,
3265 PyObject *element)
3267 PyUnicodeObject *u = NULL, *v = NULL;
3268 int result;
3269 register const Py_UNICODE *p, *e;
3270 register Py_UNICODE ch;
3272 /* Coerce the two arguments */
3273 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3274 if (v == NULL) {
3275 PyErr_SetString(PyExc_TypeError,
3276 "'in <string>' requires character as left operand");
3277 goto onError;
3279 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3280 if (u == NULL) {
3281 Py_DECREF(v);
3282 goto onError;
3285 /* Check v in u */
3286 if (PyUnicode_GET_SIZE(v) != 1) {
3287 PyErr_SetString(PyExc_TypeError,
3288 "'in <string>' requires character as left operand");
3289 goto onError;
3291 ch = *PyUnicode_AS_UNICODE(v);
3292 p = PyUnicode_AS_UNICODE(u);
3293 e = p + PyUnicode_GET_SIZE(u);
3294 result = 0;
3295 while (p < e) {
3296 if (*p++ == ch) {
3297 result = 1;
3298 break;
3302 Py_DECREF(u);
3303 Py_DECREF(v);
3304 return result;
3306 onError:
3307 Py_XDECREF(u);
3308 Py_XDECREF(v);
3309 return -1;
3312 /* Concat to string or Unicode object giving a new Unicode object. */
3314 PyObject *PyUnicode_Concat(PyObject *left,
3315 PyObject *right)
3317 PyUnicodeObject *u = NULL, *v = NULL, *w;
3319 /* Coerce the two arguments */
3320 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3321 if (u == NULL)
3322 goto onError;
3323 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3324 if (v == NULL)
3325 goto onError;
3327 /* Shortcuts */
3328 if (v == unicode_empty) {
3329 Py_DECREF(v);
3330 return (PyObject *)u;
3332 if (u == unicode_empty) {
3333 Py_DECREF(u);
3334 return (PyObject *)v;
3337 /* Concat the two Unicode strings */
3338 w = _PyUnicode_New(u->length + v->length);
3339 if (w == NULL)
3340 goto onError;
3341 Py_UNICODE_COPY(w->str, u->str, u->length);
3342 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3344 Py_DECREF(u);
3345 Py_DECREF(v);
3346 return (PyObject *)w;
3348 onError:
3349 Py_XDECREF(u);
3350 Py_XDECREF(v);
3351 return NULL;
3354 static char count__doc__[] =
3355 "S.count(sub[, start[, end]]) -> int\n\
3357 Return the number of occurrences of substring sub in Unicode string\n\
3358 S[start:end]. Optional arguments start and end are\n\
3359 interpreted as in slice notation.";
3361 static PyObject *
3362 unicode_count(PyUnicodeObject *self, PyObject *args)
3364 PyUnicodeObject *substring;
3365 int start = 0;
3366 int end = INT_MAX;
3367 PyObject *result;
3369 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3370 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3371 return NULL;
3373 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3374 (PyObject *)substring);
3375 if (substring == NULL)
3376 return NULL;
3378 if (start < 0)
3379 start += self->length;
3380 if (start < 0)
3381 start = 0;
3382 if (end > self->length)
3383 end = self->length;
3384 if (end < 0)
3385 end += self->length;
3386 if (end < 0)
3387 end = 0;
3389 result = PyInt_FromLong((long) count(self, start, end, substring));
3391 Py_DECREF(substring);
3392 return result;
3395 static char encode__doc__[] =
3396 "S.encode([encoding[,errors]]) -> string\n\
3398 Return an encoded string version of S. Default encoding is the current\n\
3399 default string encoding. errors may be given to set a different error\n\
3400 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3401 a ValueError. Other possible values are 'ignore' and 'replace'.";
3403 static PyObject *
3404 unicode_encode(PyUnicodeObject *self, PyObject *args)
3406 char *encoding = NULL;
3407 char *errors = NULL;
3408 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3409 return NULL;
3410 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3413 static char expandtabs__doc__[] =
3414 "S.expandtabs([tabsize]) -> unicode\n\
3416 Return a copy of S where all tab characters are expanded using spaces.\n\
3417 If tabsize is not given, a tab size of 8 characters is assumed.";
3419 static PyObject*
3420 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3422 Py_UNICODE *e;
3423 Py_UNICODE *p;
3424 Py_UNICODE *q;
3425 int i, j;
3426 PyUnicodeObject *u;
3427 int tabsize = 8;
3429 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3430 return NULL;
3432 /* First pass: determine size of output string */
3433 i = j = 0;
3434 e = self->str + self->length;
3435 for (p = self->str; p < e; p++)
3436 if (*p == '\t') {
3437 if (tabsize > 0)
3438 j += tabsize - (j % tabsize);
3440 else {
3441 j++;
3442 if (*p == '\n' || *p == '\r') {
3443 i += j;
3444 j = 0;
3448 /* Second pass: create output string and fill it */
3449 u = _PyUnicode_New(i + j);
3450 if (!u)
3451 return NULL;
3453 j = 0;
3454 q = u->str;
3456 for (p = self->str; p < e; p++)
3457 if (*p == '\t') {
3458 if (tabsize > 0) {
3459 i = tabsize - (j % tabsize);
3460 j += i;
3461 while (i--)
3462 *q++ = ' ';
3465 else {
3466 j++;
3467 *q++ = *p;
3468 if (*p == '\n' || *p == '\r')
3469 j = 0;
3472 return (PyObject*) u;
3475 static char find__doc__[] =
3476 "S.find(sub [,start [,end]]) -> int\n\
3478 Return the lowest index in S where substring sub is found,\n\
3479 such that sub is contained within s[start,end]. Optional\n\
3480 arguments start and end are interpreted as in slice notation.\n\
3482 Return -1 on failure.";
3484 static PyObject *
3485 unicode_find(PyUnicodeObject *self, PyObject *args)
3487 PyUnicodeObject *substring;
3488 int start = 0;
3489 int end = INT_MAX;
3490 PyObject *result;
3492 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3493 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3494 return NULL;
3495 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3496 (PyObject *)substring);
3497 if (substring == NULL)
3498 return NULL;
3500 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3502 Py_DECREF(substring);
3503 return result;
3506 static PyObject *
3507 unicode_getitem(PyUnicodeObject *self, int index)
3509 if (index < 0 || index >= self->length) {
3510 PyErr_SetString(PyExc_IndexError, "string index out of range");
3511 return NULL;
3514 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3517 static long
3518 unicode_hash(PyUnicodeObject *self)
3520 /* Since Unicode objects compare equal to their ASCII string
3521 counterparts, they should use the individual character values
3522 as basis for their hash value. This is needed to assure that
3523 strings and Unicode objects behave in the same way as
3524 dictionary keys. */
3526 register int len;
3527 register Py_UNICODE *p;
3528 register long x;
3530 if (self->hash != -1)
3531 return self->hash;
3532 len = PyUnicode_GET_SIZE(self);
3533 p = PyUnicode_AS_UNICODE(self);
3534 x = *p << 7;
3535 while (--len >= 0)
3536 x = (1000003*x) ^ *p++;
3537 x ^= PyUnicode_GET_SIZE(self);
3538 if (x == -1)
3539 x = -2;
3540 self->hash = x;
3541 return x;
3544 static char index__doc__[] =
3545 "S.index(sub [,start [,end]]) -> int\n\
3547 Like S.find() but raise ValueError when the substring is not found.";
3549 static PyObject *
3550 unicode_index(PyUnicodeObject *self, PyObject *args)
3552 int result;
3553 PyUnicodeObject *substring;
3554 int start = 0;
3555 int end = INT_MAX;
3557 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3558 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3559 return NULL;
3561 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3562 (PyObject *)substring);
3563 if (substring == NULL)
3564 return NULL;
3566 result = findstring(self, substring, start, end, 1);
3568 Py_DECREF(substring);
3569 if (result < 0) {
3570 PyErr_SetString(PyExc_ValueError, "substring not found");
3571 return NULL;
3573 return PyInt_FromLong(result);
3576 static char islower__doc__[] =
3577 "S.islower() -> int\n\
3579 Return 1 if all cased characters in S are lowercase and there is\n\
3580 at least one cased character in S, 0 otherwise.";
3582 static PyObject*
3583 unicode_islower(PyUnicodeObject *self, PyObject *args)
3585 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3586 register const Py_UNICODE *e;
3587 int cased;
3589 if (!PyArg_NoArgs(args))
3590 return NULL;
3592 /* Shortcut for single character strings */
3593 if (PyUnicode_GET_SIZE(self) == 1)
3594 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3596 /* Special case for empty strings */
3597 if (PyString_GET_SIZE(self) == 0)
3598 return PyInt_FromLong(0);
3600 e = p + PyUnicode_GET_SIZE(self);
3601 cased = 0;
3602 for (; p < e; p++) {
3603 register const Py_UNICODE ch = *p;
3605 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3606 return PyInt_FromLong(0);
3607 else if (!cased && Py_UNICODE_ISLOWER(ch))
3608 cased = 1;
3610 return PyInt_FromLong(cased);
3613 static char isupper__doc__[] =
3614 "S.isupper() -> int\n\
3616 Return 1 if all cased characters in S are uppercase and there is\n\
3617 at least one cased character in S, 0 otherwise.";
3619 static PyObject*
3620 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3622 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3623 register const Py_UNICODE *e;
3624 int cased;
3626 if (!PyArg_NoArgs(args))
3627 return NULL;
3629 /* Shortcut for single character strings */
3630 if (PyUnicode_GET_SIZE(self) == 1)
3631 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3633 /* Special case for empty strings */
3634 if (PyString_GET_SIZE(self) == 0)
3635 return PyInt_FromLong(0);
3637 e = p + PyUnicode_GET_SIZE(self);
3638 cased = 0;
3639 for (; p < e; p++) {
3640 register const Py_UNICODE ch = *p;
3642 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3643 return PyInt_FromLong(0);
3644 else if (!cased && Py_UNICODE_ISUPPER(ch))
3645 cased = 1;
3647 return PyInt_FromLong(cased);
3650 static char istitle__doc__[] =
3651 "S.istitle() -> int\n\
3653 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3654 may only follow uncased characters and lowercase characters only cased\n\
3655 ones. Return 0 otherwise.";
3657 static PyObject*
3658 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3660 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3661 register const Py_UNICODE *e;
3662 int cased, previous_is_cased;
3664 if (!PyArg_NoArgs(args))
3665 return NULL;
3667 /* Shortcut for single character strings */
3668 if (PyUnicode_GET_SIZE(self) == 1)
3669 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3670 (Py_UNICODE_ISUPPER(*p) != 0));
3672 /* Special case for empty strings */
3673 if (PyString_GET_SIZE(self) == 0)
3674 return PyInt_FromLong(0);
3676 e = p + PyUnicode_GET_SIZE(self);
3677 cased = 0;
3678 previous_is_cased = 0;
3679 for (; p < e; p++) {
3680 register const Py_UNICODE ch = *p;
3682 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3683 if (previous_is_cased)
3684 return PyInt_FromLong(0);
3685 previous_is_cased = 1;
3686 cased = 1;
3688 else if (Py_UNICODE_ISLOWER(ch)) {
3689 if (!previous_is_cased)
3690 return PyInt_FromLong(0);
3691 previous_is_cased = 1;
3692 cased = 1;
3694 else
3695 previous_is_cased = 0;
3697 return PyInt_FromLong(cased);
3700 static char isspace__doc__[] =
3701 "S.isspace() -> int\n\
3703 Return 1 if there are only whitespace characters in S,\n\
3704 0 otherwise.";
3706 static PyObject*
3707 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3709 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3710 register const Py_UNICODE *e;
3712 if (!PyArg_NoArgs(args))
3713 return NULL;
3715 /* Shortcut for single character strings */
3716 if (PyUnicode_GET_SIZE(self) == 1 &&
3717 Py_UNICODE_ISSPACE(*p))
3718 return PyInt_FromLong(1);
3720 /* Special case for empty strings */
3721 if (PyString_GET_SIZE(self) == 0)
3722 return PyInt_FromLong(0);
3724 e = p + PyUnicode_GET_SIZE(self);
3725 for (; p < e; p++) {
3726 if (!Py_UNICODE_ISSPACE(*p))
3727 return PyInt_FromLong(0);
3729 return PyInt_FromLong(1);
3732 static char isalpha__doc__[] =
3733 "S.isalpha() -> int\n\
3735 Return 1 if all characters in S are alphabetic\n\
3736 and there is at least one character in S, 0 otherwise.";
3738 static PyObject*
3739 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3742 register const Py_UNICODE *e;
3744 if (!PyArg_NoArgs(args))
3745 return NULL;
3747 /* Shortcut for single character strings */
3748 if (PyUnicode_GET_SIZE(self) == 1 &&
3749 Py_UNICODE_ISALPHA(*p))
3750 return PyInt_FromLong(1);
3752 /* Special case for empty strings */
3753 if (PyString_GET_SIZE(self) == 0)
3754 return PyInt_FromLong(0);
3756 e = p + PyUnicode_GET_SIZE(self);
3757 for (; p < e; p++) {
3758 if (!Py_UNICODE_ISALPHA(*p))
3759 return PyInt_FromLong(0);
3761 return PyInt_FromLong(1);
3764 static char isalnum__doc__[] =
3765 "S.isalnum() -> int\n\
3767 Return 1 if all characters in S are alphanumeric\n\
3768 and there is at least one character in S, 0 otherwise.";
3770 static PyObject*
3771 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3774 register const Py_UNICODE *e;
3776 if (!PyArg_NoArgs(args))
3777 return NULL;
3779 /* Shortcut for single character strings */
3780 if (PyUnicode_GET_SIZE(self) == 1 &&
3781 Py_UNICODE_ISALNUM(*p))
3782 return PyInt_FromLong(1);
3784 /* Special case for empty strings */
3785 if (PyString_GET_SIZE(self) == 0)
3786 return PyInt_FromLong(0);
3788 e = p + PyUnicode_GET_SIZE(self);
3789 for (; p < e; p++) {
3790 if (!Py_UNICODE_ISALNUM(*p))
3791 return PyInt_FromLong(0);
3793 return PyInt_FromLong(1);
3796 static char isdecimal__doc__[] =
3797 "S.isdecimal() -> int\n\
3799 Return 1 if there are only decimal characters in S,\n\
3800 0 otherwise.";
3802 static PyObject*
3803 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3806 register const Py_UNICODE *e;
3808 if (!PyArg_NoArgs(args))
3809 return NULL;
3811 /* Shortcut for single character strings */
3812 if (PyUnicode_GET_SIZE(self) == 1 &&
3813 Py_UNICODE_ISDECIMAL(*p))
3814 return PyInt_FromLong(1);
3816 /* Special case for empty strings */
3817 if (PyString_GET_SIZE(self) == 0)
3818 return PyInt_FromLong(0);
3820 e = p + PyUnicode_GET_SIZE(self);
3821 for (; p < e; p++) {
3822 if (!Py_UNICODE_ISDECIMAL(*p))
3823 return PyInt_FromLong(0);
3825 return PyInt_FromLong(1);
3828 static char isdigit__doc__[] =
3829 "S.isdigit() -> int\n\
3831 Return 1 if there are only digit characters in S,\n\
3832 0 otherwise.";
3834 static PyObject*
3835 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3838 register const Py_UNICODE *e;
3840 if (!PyArg_NoArgs(args))
3841 return NULL;
3843 /* Shortcut for single character strings */
3844 if (PyUnicode_GET_SIZE(self) == 1 &&
3845 Py_UNICODE_ISDIGIT(*p))
3846 return PyInt_FromLong(1);
3848 /* Special case for empty strings */
3849 if (PyString_GET_SIZE(self) == 0)
3850 return PyInt_FromLong(0);
3852 e = p + PyUnicode_GET_SIZE(self);
3853 for (; p < e; p++) {
3854 if (!Py_UNICODE_ISDIGIT(*p))
3855 return PyInt_FromLong(0);
3857 return PyInt_FromLong(1);
3860 static char isnumeric__doc__[] =
3861 "S.isnumeric() -> int\n\
3863 Return 1 if there are only numeric characters in S,\n\
3864 0 otherwise.";
3866 static PyObject*
3867 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3869 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3870 register const Py_UNICODE *e;
3872 if (!PyArg_NoArgs(args))
3873 return NULL;
3875 /* Shortcut for single character strings */
3876 if (PyUnicode_GET_SIZE(self) == 1 &&
3877 Py_UNICODE_ISNUMERIC(*p))
3878 return PyInt_FromLong(1);
3880 /* Special case for empty strings */
3881 if (PyString_GET_SIZE(self) == 0)
3882 return PyInt_FromLong(0);
3884 e = p + PyUnicode_GET_SIZE(self);
3885 for (; p < e; p++) {
3886 if (!Py_UNICODE_ISNUMERIC(*p))
3887 return PyInt_FromLong(0);
3889 return PyInt_FromLong(1);
3892 static char join__doc__[] =
3893 "S.join(sequence) -> unicode\n\
3895 Return a string which is the concatenation of the strings in the\n\
3896 sequence. The separator between elements is S.";
3898 static PyObject*
3899 unicode_join(PyUnicodeObject *self, PyObject *args)
3901 PyObject *data;
3902 if (!PyArg_ParseTuple(args, "O:join", &data))
3903 return NULL;
3905 return PyUnicode_Join((PyObject *)self, data);
3908 static int
3909 unicode_length(PyUnicodeObject *self)
3911 return self->length;
3914 static char ljust__doc__[] =
3915 "S.ljust(width) -> unicode\n\
3917 Return S left justified in a Unicode string of length width. Padding is\n\
3918 done using spaces.";
3920 static PyObject *
3921 unicode_ljust(PyUnicodeObject *self, PyObject *args)
3923 int width;
3924 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3925 return NULL;
3927 if (self->length >= width) {
3928 Py_INCREF(self);
3929 return (PyObject*) self;
3932 return (PyObject*) pad(self, 0, width - self->length, ' ');
3935 static char lower__doc__[] =
3936 "S.lower() -> unicode\n\
3938 Return a copy of the string S converted to lowercase.";
3940 static PyObject*
3941 unicode_lower(PyUnicodeObject *self, PyObject *args)
3943 if (!PyArg_NoArgs(args))
3944 return NULL;
3945 return fixup(self, fixlower);
3948 static char lstrip__doc__[] =
3949 "S.lstrip() -> unicode\n\
3951 Return a copy of the string S with leading whitespace removed.";
3953 static PyObject *
3954 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3956 if (!PyArg_NoArgs(args))
3957 return NULL;
3958 return strip(self, 1, 0);
3961 static PyObject*
3962 unicode_repeat(PyUnicodeObject *str, int len)
3964 PyUnicodeObject *u;
3965 Py_UNICODE *p;
3966 int nchars;
3967 size_t nbytes;
3969 if (len < 0)
3970 len = 0;
3972 if (len == 1) {
3973 /* no repeat, return original string */
3974 Py_INCREF(str);
3975 return (PyObject*) str;
3978 /* ensure # of chars needed doesn't overflow int and # of bytes
3979 * needed doesn't overflow size_t
3981 nchars = len * str->length;
3982 if (len && nchars / len != str->length) {
3983 PyErr_SetString(PyExc_OverflowError,
3984 "repeated string is too long");
3985 return NULL;
3987 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
3988 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
3989 PyErr_SetString(PyExc_OverflowError,
3990 "repeated string is too long");
3991 return NULL;
3993 u = _PyUnicode_New(nchars);
3994 if (!u)
3995 return NULL;
3997 p = u->str;
3999 while (len-- > 0) {
4000 Py_UNICODE_COPY(p, str->str, str->length);
4001 p += str->length;
4004 return (PyObject*) u;
4007 PyObject *PyUnicode_Replace(PyObject *obj,
4008 PyObject *subobj,
4009 PyObject *replobj,
4010 int maxcount)
4012 PyObject *self;
4013 PyObject *str1;
4014 PyObject *str2;
4015 PyObject *result;
4017 self = PyUnicode_FromObject(obj);
4018 if (self == NULL)
4019 return NULL;
4020 str1 = PyUnicode_FromObject(subobj);
4021 if (str1 == NULL) {
4022 Py_DECREF(self);
4023 return NULL;
4025 str2 = PyUnicode_FromObject(replobj);
4026 if (str2 == NULL) {
4027 Py_DECREF(self);
4028 Py_DECREF(str1);
4029 return NULL;
4031 result = replace((PyUnicodeObject *)self,
4032 (PyUnicodeObject *)str1,
4033 (PyUnicodeObject *)str2,
4034 maxcount);
4035 Py_DECREF(self);
4036 Py_DECREF(str1);
4037 Py_DECREF(str2);
4038 return result;
4041 static char replace__doc__[] =
4042 "S.replace (old, new[, maxsplit]) -> unicode\n\
4044 Return a copy of S with all occurrences of substring\n\
4045 old replaced by new. If the optional argument maxsplit is\n\
4046 given, only the first maxsplit occurrences are replaced.";
4048 static PyObject*
4049 unicode_replace(PyUnicodeObject *self, PyObject *args)
4051 PyUnicodeObject *str1;
4052 PyUnicodeObject *str2;
4053 int maxcount = -1;
4054 PyObject *result;
4056 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4057 return NULL;
4058 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4059 if (str1 == NULL)
4060 return NULL;
4061 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4062 if (str2 == NULL)
4063 return NULL;
4065 result = replace(self, str1, str2, maxcount);
4067 Py_DECREF(str1);
4068 Py_DECREF(str2);
4069 return result;
/* Hand the object's character buffer and size to unicodeescape_string()
   and return its result.
   NOTE(review): the line carrying the final argument and the closing
   parenthesis is absent from this listing (the embedded numbering
   jumps from 4076 to 4080); restore it from the upstream file. */
4072 static
4073 PyObject *unicode_repr(PyObject *unicode)
4075 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4076 PyUnicode_GET_SIZE(unicode),
4080 static char rfind__doc__[] =
4081 "S.rfind(sub [,start [,end]]) -> int\n\
4083 Return the highest index in S where substring sub is found,\n\
4084 such that sub is contained within s[start,end]. Optional\n\
4085 arguments start and end are interpreted as in slice notation.\n\
4087 Return -1 on failure.";
4089 static PyObject *
4090 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4092 PyUnicodeObject *substring;
4093 int start = 0;
4094 int end = INT_MAX;
4095 PyObject *result;
4097 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4098 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4099 return NULL;
4100 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4101 (PyObject *)substring);
4102 if (substring == NULL)
4103 return NULL;
4105 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4107 Py_DECREF(substring);
4108 return result;
4111 static char rindex__doc__[] =
4112 "S.rindex(sub [,start [,end]]) -> int\n\
4114 Like S.rfind() but raise ValueError when the substring is not found.";
4116 static PyObject *
4117 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4119 int result;
4120 PyUnicodeObject *substring;
4121 int start = 0;
4122 int end = INT_MAX;
4124 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4126 return NULL;
4127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4128 (PyObject *)substring);
4129 if (substring == NULL)
4130 return NULL;
4132 result = findstring(self, substring, start, end, -1);
4134 Py_DECREF(substring);
4135 if (result < 0) {
4136 PyErr_SetString(PyExc_ValueError, "substring not found");
4137 return NULL;
4139 return PyInt_FromLong(result);
4142 static char rjust__doc__[] =
4143 "S.rjust(width) -> unicode\n\
4145 Return S right justified in a Unicode string of length width. Padding is\n\
4146 done using spaces.";
4148 static PyObject *
4149 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4151 int width;
4152 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4153 return NULL;
4155 if (self->length >= width) {
4156 Py_INCREF(self);
4157 return (PyObject*) self;
4160 return (PyObject*) pad(self, width - self->length, 0, ' ');
4163 static char rstrip__doc__[] =
4164 "S.rstrip() -> unicode\n\
4166 Return a copy of the string S with trailing whitespace removed.";
4168 static PyObject *
4169 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4171 if (!PyArg_NoArgs(args))
4172 return NULL;
4173 return strip(self, 0, 1);
4176 static PyObject*
4177 unicode_slice(PyUnicodeObject *self, int start, int end)
4179 /* standard clamping */
4180 if (start < 0)
4181 start = 0;
4182 if (end < 0)
4183 end = 0;
4184 if (end > self->length)
4185 end = self->length;
4186 if (start == 0 && end == self->length) {
4187 /* full slice, return original string */
4188 Py_INCREF(self);
4189 return (PyObject*) self;
4191 if (start > end)
4192 start = end;
4193 /* copy slice */
4194 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4195 end - start);
4198 PyObject *PyUnicode_Split(PyObject *s,
4199 PyObject *sep,
4200 int maxsplit)
4202 PyObject *result;
4204 s = PyUnicode_FromObject(s);
4205 if (s == NULL)
4206 return NULL;
4207 if (sep != NULL) {
4208 sep = PyUnicode_FromObject(sep);
4209 if (sep == NULL) {
4210 Py_DECREF(s);
4211 return NULL;
4215 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4217 Py_DECREF(s);
4218 Py_XDECREF(sep);
4219 return result;
/* Documentation string attached to the "split" entry of unicode_methods. */
4222 static char split__doc__[] =
4223 "S.split([sep [,maxsplit]]) -> list of strings\n\
4225 Return a list of the words in S, using sep as the\n\
4226 delimiter string. If maxsplit is given, at most maxsplit\n\
4227 splits are done. If sep is not specified, any whitespace string\n\
4228 is a separator.";
4230 static PyObject*
4231 unicode_split(PyUnicodeObject *self, PyObject *args)
4233 PyObject *substring = Py_None;
4234 int maxcount = -1;
4236 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4237 return NULL;
4239 if (substring == Py_None)
4240 return split(self, NULL, maxcount);
4241 else if (PyUnicode_Check(substring))
4242 return split(self, (PyUnicodeObject *)substring, maxcount);
4243 else
4244 return PyUnicode_Split((PyObject *)self, substring, maxcount);
/* Documentation string attached to the "splitlines" entry of
   unicode_methods.  The signature line uses balanced brackets:
   keepends is the single optional argument. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
4254 static PyObject*
4255 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4257 int keepends = 0;
4259 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4260 return NULL;
4262 return PyUnicode_Splitlines((PyObject *)self, keepends);
4265 static
4266 PyObject *unicode_str(PyUnicodeObject *self)
4268 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
/* Documentation string attached to the "strip" entry of unicode_methods. */
4271 static char strip__doc__[] =
4272 "S.strip() -> unicode\n\
4274 Return a copy of S with leading and trailing whitespace removed.";
4276 static PyObject *
4277 unicode_strip(PyUnicodeObject *self, PyObject *args)
4279 if (!PyArg_NoArgs(args))
4280 return NULL;
4281 return strip(self, 1, 1);
/* Documentation string attached to the "swapcase" entry of unicode_methods. */
4284 static char swapcase__doc__[] =
4285 "S.swapcase() -> unicode\n\
4287 Return a copy of S with uppercase characters converted to lowercase\n\
4288 and vice versa.";
4290 static PyObject*
4291 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4293 if (!PyArg_NoArgs(args))
4294 return NULL;
4295 return fixup(self, fixswapcase);
/* Documentation string attached to the "translate" entry of unicode_methods. */
4298 static char translate__doc__[] =
4299 "S.translate(table) -> unicode\n\
4301 Return a copy of the string S, where all characters have been mapped\n\
4302 through the given translation table, which must be a mapping of\n\
4303 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4304 are left untouched. Characters mapped to None are deleted.";
4306 static PyObject*
4307 unicode_translate(PyUnicodeObject *self, PyObject *args)
4309 PyObject *table;
4311 if (!PyArg_ParseTuple(args, "O:translate", &table))
4312 return NULL;
4313 return PyUnicode_TranslateCharmap(self->str,
4314 self->length,
4315 table,
4316 "ignore");
/* Documentation string attached to the "upper" entry of unicode_methods. */
4319 static char upper__doc__[] =
4320 "S.upper() -> unicode\n\
4322 Return a copy of S converted to uppercase.";
4324 static PyObject*
4325 unicode_upper(PyUnicodeObject *self, PyObject *args)
4327 if (!PyArg_NoArgs(args))
4328 return NULL;
4329 return fixup(self, fixupper);
#if 0
/* Disabled: zfill is compiled out, together with its unicode_methods
   entry. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.";

/* Method implementation for S.zfill(width).

   Returns S itself (new reference) when it is already at least `width`
   long.  Otherwise asks pad() for `width - length` '0' characters in its
   first count argument, and, if the original string began with a sign,
   moves that sign in front of the zeros.  Returns NULL with an exception
   set if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;
    int width;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; do not touch u->str. */
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
#if 0
/* Disabled debugging aid: report the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
/* Documentation string attached to the "startswith" entry of unicode_methods. */
4378 static char startswith__doc__[] =
4379 "S.startswith(prefix[, start[, end]]) -> int\n\
4381 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4382 optional start, test S beginning at that position. With optional end, stop\n\
4383 comparing S at that position.";
4385 static PyObject *
4386 unicode_startswith(PyUnicodeObject *self,
4387 PyObject *args)
4389 PyUnicodeObject *substring;
4390 int start = 0;
4391 int end = INT_MAX;
4392 PyObject *result;
4394 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4395 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4396 return NULL;
4397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4398 (PyObject *)substring);
4399 if (substring == NULL)
4400 return NULL;
4402 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4404 Py_DECREF(substring);
4405 return result;
/* Documentation string attached to the "endswith" entry of unicode_methods. */
4409 static char endswith__doc__[] =
4410 "S.endswith(suffix[, start[, end]]) -> int\n\
4412 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4413 optional start, test S beginning at that position. With optional end, stop\n\
4414 comparing S at that position.";
4416 static PyObject *
4417 unicode_endswith(PyUnicodeObject *self,
4418 PyObject *args)
4420 PyUnicodeObject *substring;
4421 int start = 0;
4422 int end = INT_MAX;
4423 PyObject *result;
4425 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4426 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4427 return NULL;
4428 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4429 (PyObject *)substring);
4430 if (substring == NULL)
4431 return NULL;
4433 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4435 Py_DECREF(substring);
4436 return result;
/* Method table for unicode objects; unicode_getattr() searches it via
   Py_FindMethod().

   Each row is {name, C handler, flag, docstring}.  Among the handlers
   visible in this file, rows flagged 1 parse their arguments with
   PyArg_ParseTuple() and rows flagged 0 check them with PyArg_NoArgs().
   The table ends with a {NULL, NULL} sentinel row. */
4440 static PyMethodDef unicode_methods[] = {
4442 /* Order is according to common usage: often used methods should
4443 appear first, since lookup is done sequentially. */
4445 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4446 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4447 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4448 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4449 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4450 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4451 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4452 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4453 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4454 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4455 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4456 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4457 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4458 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4459 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4460 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4461 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4462 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4463 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4464 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4465 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4466 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4467 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4468 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4469 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4470 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4471 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4472 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4473 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4474 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4475 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4476 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4477 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4478 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4479 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
/* Compiled out: zfill's implementation above is likewise under "#if 0". */
4480 #if 0
4481 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4482 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4483 #endif
4485 #if 0
4486 /* This one is just used for debugging the implementation. */
4487 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4488 #endif
4490 {NULL, NULL}
4493 static PyObject *
4494 unicode_getattr(PyUnicodeObject *self, char *name)
4496 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
/* Sequence protocol slots for unicode objects.  Item and slice assignment
   slots are left 0, i.e. no handler is installed for them. */
4499 static PySequenceMethods unicode_as_sequence = {
4500 (inquiry) unicode_length, /* sq_length */
4501 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4502 (intargfunc) unicode_repeat, /* sq_repeat */
4503 (intargfunc) unicode_getitem, /* sq_item */
4504 (intintargfunc) unicode_slice, /* sq_slice */
4505 0, /* sq_ass_item */
4506 0, /* sq_ass_slice */
4507 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4510 static int
4511 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4512 int index,
4513 const void **ptr)
4515 if (index != 0) {
4516 PyErr_SetString(PyExc_SystemError,
4517 "accessing non-existent unicode segment");
4518 return -1;
4520 *ptr = (void *) self->str;
4521 return PyUnicode_GET_DATA_SIZE(self);
4524 static int
4525 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4526 const void **ptr)
4528 PyErr_SetString(PyExc_TypeError,
4529 "cannot use unicode as modifyable buffer");
4530 return -1;
4533 static int
4534 unicode_buffer_getsegcount(PyUnicodeObject *self,
4535 int *lenp)
4537 if (lenp)
4538 *lenp = PyUnicode_GET_DATA_SIZE(self);
4539 return 1;
4542 static int
4543 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4544 int index,
4545 const void **ptr)
4547 PyObject *str;
4549 if (index != 0) {
4550 PyErr_SetString(PyExc_SystemError,
4551 "accessing non-existent unicode segment");
4552 return -1;
4554 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4555 if (str == NULL)
4556 return -1;
4557 *ptr = (void *) PyString_AS_STRING(str);
4558 return PyString_GET_SIZE(str);
4561 /* Helpers for PyUnicode_Format() */
4563 static PyObject *
4564 getnextarg(PyObject *args, int arglen, int *p_argidx)
4566 int argidx = *p_argidx;
4567 if (argidx < arglen) {
4568 (*p_argidx)++;
4569 if (arglen < 0)
4570 return args;
4571 else
4572 return PyTuple_GetItem(args, argidx);
4574 PyErr_SetString(PyExc_TypeError,
4575 "not enough arguments for format string");
4576 return NULL;
/* Bit flags accumulated in `flags` while PyUnicode_Format() reads the
   flag characters of a conversion specifier:
   '-' -> F_LJUST, '+' -> F_SIGN, ' ' -> F_BLANK, '#' -> F_ALT,
   '0' -> F_ZERO. */
4579 #define F_LJUST (1<<0)
4580 #define F_SIGN (1<<1)
4581 #define F_BLANK (1<<2)
4582 #define F_ALT (1<<3)
4583 #define F_ZERO (1<<4)
4585 static
4586 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4588 register int i;
4589 int len;
4590 va_list va;
4591 char *charbuffer;
4592 va_start(va, format);
4594 /* First, format the string as char array, then expand to Py_UNICODE
4595 array. */
4596 charbuffer = (char *)buffer;
4597 len = vsprintf(charbuffer, format, va);
4598 for (i = len - 1; i >= 0; i--)
4599 buffer[i] = (Py_UNICODE) charbuffer[i];
4601 va_end(va);
4602 return len;
4605 static int
4606 formatfloat(Py_UNICODE *buf,
4607 size_t buflen,
4608 int flags,
4609 int prec,
4610 int type,
4611 PyObject *v)
4613 /* fmt = '%#.' + `prec` + `type`
4614 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4615 char fmt[20];
4616 double x;
4618 x = PyFloat_AsDouble(v);
4619 if (x == -1.0 && PyErr_Occurred())
4620 return -1;
4621 if (prec < 0)
4622 prec = 6;
4623 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4624 type = 'g';
4625 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4626 /* worst case length calc to ensure no buffer overrun:
4627 fmt = %#.<prec>g
4628 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4629 for any double rep.)
4630 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4631 If prec=0 the effective precision is 1 (the leading digit is
4632 always given), therefore increase by one to 10+prec. */
4633 if (buflen <= (size_t)10 + (size_t)prec) {
4634 PyErr_SetString(PyExc_OverflowError,
4635 "formatted float is too long (precision too long?)");
4636 return -1;
4638 return usprintf(buf, fmt, x);
4641 static PyObject*
4642 formatlong(PyObject *val, int flags, int prec, int type)
4644 char *buf;
4645 int i, len;
4646 PyObject *str; /* temporary string object. */
4647 PyUnicodeObject *result;
4649 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4650 if (!str)
4651 return NULL;
4652 result = _PyUnicode_New(len);
4653 for (i = 0; i < len; i++)
4654 result->str[i] = buf[i];
4655 result->str[len] = 0;
4656 Py_DECREF(str);
4657 return (PyObject*)result;
4660 static int
4661 formatint(Py_UNICODE *buf,
4662 size_t buflen,
4663 int flags,
4664 int prec,
4665 int type,
4666 PyObject *v)
4668 /* fmt = '%#.' + `prec` + 'l' + `type`
4669 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4670 + 1 + 1 = 24*/
4671 char fmt[64]; /* plenty big enough! */
4672 long x;
4674 x = PyInt_AsLong(v);
4675 if (x == -1 && PyErr_Occurred())
4676 return -1;
4677 if (prec < 0)
4678 prec = 1;
4679 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4680 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4681 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4682 PyErr_SetString(PyExc_OverflowError,
4683 "formatted integer is too long (precision too long?)");
4684 return -1;
4686 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4687 return usprintf(buf, fmt, x);
4690 static int
4691 formatchar(Py_UNICODE *buf,
4692 size_t buflen,
4693 PyObject *v)
4695 /* presume that the buffer is at least 2 characters long */
4696 if (PyUnicode_Check(v)) {
4697 if (PyUnicode_GET_SIZE(v) != 1)
4698 goto onError;
4699 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4702 else if (PyString_Check(v)) {
4703 if (PyString_GET_SIZE(v) != 1)
4704 goto onError;
4705 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4708 else {
4709 /* Integer input truncated to a character */
4710 long x;
4711 x = PyInt_AsLong(v);
4712 if (x == -1 && PyErr_Occurred())
4713 goto onError;
4714 buf[0] = (char) x;
4716 buf[1] = '\0';
4717 return 1;
4719 onError:
4720 PyErr_SetString(PyExc_TypeError,
4721 "%c requires int or char");
4722 return -1;
4725 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4727 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4728 chars are formatted. XXX This is a magic number. Each formatting
4729 routine does bounds checking to ensure no overflow, but a better
4730 solution may be to malloc a buffer of appropriate size for each
4731 format. For now, the current solution is sufficient.
4733 #define FORMATBUFLEN (size_t)120
4735 PyObject *PyUnicode_Format(PyObject *format,
4736 PyObject *args)
4738 Py_UNICODE *fmt, *res;
4739 int fmtcnt, rescnt, reslen, arglen, argidx;
4740 int args_owned = 0;
4741 PyUnicodeObject *result = NULL;
4742 PyObject *dict = NULL;
4743 PyObject *uformat;
4745 if (format == NULL || args == NULL) {
4746 PyErr_BadInternalCall();
4747 return NULL;
4749 uformat = PyUnicode_FromObject(format);
4750 if (uformat == NULL)
4751 return NULL;
4752 fmt = PyUnicode_AS_UNICODE(uformat);
4753 fmtcnt = PyUnicode_GET_SIZE(uformat);
4755 reslen = rescnt = fmtcnt + 100;
4756 result = _PyUnicode_New(reslen);
4757 if (result == NULL)
4758 goto onError;
4759 res = PyUnicode_AS_UNICODE(result);
4761 if (PyTuple_Check(args)) {
4762 arglen = PyTuple_Size(args);
4763 argidx = 0;
4765 else {
4766 arglen = -1;
4767 argidx = -2;
4769 if (args->ob_type->tp_as_mapping)
4770 dict = args;
4772 while (--fmtcnt >= 0) {
4773 if (*fmt != '%') {
4774 if (--rescnt < 0) {
4775 rescnt = fmtcnt + 100;
4776 reslen += rescnt;
4777 if (_PyUnicode_Resize(result, reslen) < 0)
4778 return NULL;
4779 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4780 --rescnt;
4782 *res++ = *fmt++;
4784 else {
4785 /* Got a format specifier */
4786 int flags = 0;
4787 int width = -1;
4788 int prec = -1;
4789 int size = 0;
4790 Py_UNICODE c = '\0';
4791 Py_UNICODE fill;
4792 PyObject *v = NULL;
4793 PyObject *temp = NULL;
4794 Py_UNICODE *pbuf;
4795 Py_UNICODE sign;
4796 int len;
4797 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4799 fmt++;
4800 if (*fmt == '(') {
4801 Py_UNICODE *keystart;
4802 int keylen;
4803 PyObject *key;
4804 int pcount = 1;
4806 if (dict == NULL) {
4807 PyErr_SetString(PyExc_TypeError,
4808 "format requires a mapping");
4809 goto onError;
4811 ++fmt;
4812 --fmtcnt;
4813 keystart = fmt;
4814 /* Skip over balanced parentheses */
4815 while (pcount > 0 && --fmtcnt >= 0) {
4816 if (*fmt == ')')
4817 --pcount;
4818 else if (*fmt == '(')
4819 ++pcount;
4820 fmt++;
4822 keylen = fmt - keystart - 1;
4823 if (fmtcnt < 0 || pcount > 0) {
4824 PyErr_SetString(PyExc_ValueError,
4825 "incomplete format key");
4826 goto onError;
4828 /* keys are converted to strings using UTF-8 and
4829 then looked up since Python uses strings to hold
4830 variables names etc. in its namespaces and we
4831 wouldn't want to break common idioms. */
4832 key = PyUnicode_EncodeUTF8(keystart,
4833 keylen,
4834 NULL);
4835 if (key == NULL)
4836 goto onError;
4837 if (args_owned) {
4838 Py_DECREF(args);
4839 args_owned = 0;
4841 args = PyObject_GetItem(dict, key);
4842 Py_DECREF(key);
4843 if (args == NULL) {
4844 goto onError;
4846 args_owned = 1;
4847 arglen = -1;
4848 argidx = -2;
4850 while (--fmtcnt >= 0) {
4851 switch (c = *fmt++) {
4852 case '-': flags |= F_LJUST; continue;
4853 case '+': flags |= F_SIGN; continue;
4854 case ' ': flags |= F_BLANK; continue;
4855 case '#': flags |= F_ALT; continue;
4856 case '0': flags |= F_ZERO; continue;
4858 break;
4860 if (c == '*') {
4861 v = getnextarg(args, arglen, &argidx);
4862 if (v == NULL)
4863 goto onError;
4864 if (!PyInt_Check(v)) {
4865 PyErr_SetString(PyExc_TypeError,
4866 "* wants int");
4867 goto onError;
4869 width = PyInt_AsLong(v);
4870 if (width < 0) {
4871 flags |= F_LJUST;
4872 width = -width;
4874 if (--fmtcnt >= 0)
4875 c = *fmt++;
4877 else if (c >= '0' && c <= '9') {
4878 width = c - '0';
4879 while (--fmtcnt >= 0) {
4880 c = *fmt++;
4881 if (c < '0' || c > '9')
4882 break;
4883 if ((width*10) / 10 != width) {
4884 PyErr_SetString(PyExc_ValueError,
4885 "width too big");
4886 goto onError;
4888 width = width*10 + (c - '0');
4891 if (c == '.') {
4892 prec = 0;
4893 if (--fmtcnt >= 0)
4894 c = *fmt++;
4895 if (c == '*') {
4896 v = getnextarg(args, arglen, &argidx);
4897 if (v == NULL)
4898 goto onError;
4899 if (!PyInt_Check(v)) {
4900 PyErr_SetString(PyExc_TypeError,
4901 "* wants int");
4902 goto onError;
4904 prec = PyInt_AsLong(v);
4905 if (prec < 0)
4906 prec = 0;
4907 if (--fmtcnt >= 0)
4908 c = *fmt++;
4910 else if (c >= '0' && c <= '9') {
4911 prec = c - '0';
4912 while (--fmtcnt >= 0) {
4913 c = Py_CHARMASK(*fmt++);
4914 if (c < '0' || c > '9')
4915 break;
4916 if ((prec*10) / 10 != prec) {
4917 PyErr_SetString(PyExc_ValueError,
4918 "prec too big");
4919 goto onError;
4921 prec = prec*10 + (c - '0');
4924 } /* prec */
4925 if (fmtcnt >= 0) {
4926 if (c == 'h' || c == 'l' || c == 'L') {
4927 size = c;
4928 if (--fmtcnt >= 0)
4929 c = *fmt++;
4932 if (fmtcnt < 0) {
4933 PyErr_SetString(PyExc_ValueError,
4934 "incomplete format");
4935 goto onError;
4937 if (c != '%') {
4938 v = getnextarg(args, arglen, &argidx);
4939 if (v == NULL)
4940 goto onError;
4942 sign = 0;
4943 fill = ' ';
4944 switch (c) {
4946 case '%':
4947 pbuf = formatbuf;
4948 /* presume that buffer length is at least 1 */
4949 pbuf[0] = '%';
4950 len = 1;
4951 break;
4953 case 's':
4954 case 'r':
4955 if (PyUnicode_Check(v) && c == 's') {
4956 temp = v;
4957 Py_INCREF(temp);
4959 else {
4960 PyObject *unicode;
4961 if (c == 's')
4962 temp = PyObject_Str(v);
4963 else
4964 temp = PyObject_Repr(v);
4965 if (temp == NULL)
4966 goto onError;
4967 if (!PyString_Check(temp)) {
4968 /* XXX Note: this should never happen, since
4969 PyObject_Repr() and PyObject_Str() assure
4970 this */
4971 Py_DECREF(temp);
4972 PyErr_SetString(PyExc_TypeError,
4973 "%s argument has non-string str()");
4974 goto onError;
4976 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
4977 PyString_GET_SIZE(temp),
4978 NULL,
4979 "strict");
4980 Py_DECREF(temp);
4981 temp = unicode;
4982 if (temp == NULL)
4983 goto onError;
4985 pbuf = PyUnicode_AS_UNICODE(temp);
4986 len = PyUnicode_GET_SIZE(temp);
4987 if (prec >= 0 && len > prec)
4988 len = prec;
4989 break;
4991 case 'i':
4992 case 'd':
4993 case 'u':
4994 case 'o':
4995 case 'x':
4996 case 'X':
4997 if (c == 'i')
4998 c = 'd';
4999 if (PyLong_Check(v)) {
5000 temp = formatlong(v, flags, prec, c);
5001 if (!temp)
5002 goto onError;
5003 pbuf = PyUnicode_AS_UNICODE(temp);
5004 len = PyUnicode_GET_SIZE(temp);
5005 /* unbounded ints can always produce
5006 a sign character! */
5007 sign = 1;
5009 else {
5010 pbuf = formatbuf;
5011 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5012 flags, prec, c, v);
5013 if (len < 0)
5014 goto onError;
5015 /* only d conversion is signed */
5016 sign = c == 'd';
5018 if (flags & F_ZERO)
5019 fill = '0';
5020 break;
5022 case 'e':
5023 case 'E':
5024 case 'f':
5025 case 'g':
5026 case 'G':
5027 pbuf = formatbuf;
5028 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5029 flags, prec, c, v);
5030 if (len < 0)
5031 goto onError;
5032 sign = 1;
5033 if (flags & F_ZERO)
5034 fill = '0';
5035 break;
5037 case 'c':
5038 pbuf = formatbuf;
5039 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5040 if (len < 0)
5041 goto onError;
5042 break;
5044 default:
5045 PyErr_Format(PyExc_ValueError,
5046 "unsupported format character '%c' (0x%x) "
5047 "at index %i",
5048 (31<=c && c<=126) ? c : '?',
5049 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5050 goto onError;
5052 if (sign) {
5053 if (*pbuf == '-' || *pbuf == '+') {
5054 sign = *pbuf++;
5055 len--;
5057 else if (flags & F_SIGN)
5058 sign = '+';
5059 else if (flags & F_BLANK)
5060 sign = ' ';
5061 else
5062 sign = 0;
5064 if (width < len)
5065 width = len;
5066 if (rescnt < width + (sign != 0)) {
5067 reslen -= rescnt;
5068 rescnt = width + fmtcnt + 100;
5069 reslen += rescnt;
5070 if (_PyUnicode_Resize(result, reslen) < 0)
5071 return NULL;
5072 res = PyUnicode_AS_UNICODE(result)
5073 + reslen - rescnt;
5075 if (sign) {
5076 if (fill != ' ')
5077 *res++ = sign;
5078 rescnt--;
5079 if (width > len)
5080 width--;
5082 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5083 assert(pbuf[0] == '0');
5084 assert(pbuf[1] == c);
5085 if (fill != ' ') {
5086 *res++ = *pbuf++;
5087 *res++ = *pbuf++;
5089 rescnt -= 2;
5090 width -= 2;
5091 if (width < 0)
5092 width = 0;
5093 len -= 2;
5095 if (width > len && !(flags & F_LJUST)) {
5096 do {
5097 --rescnt;
5098 *res++ = fill;
5099 } while (--width > len);
5101 if (fill == ' ') {
5102 if (sign)
5103 *res++ = sign;
5104 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5105 assert(pbuf[0] == '0');
5106 assert(pbuf[1] == c);
5107 *res++ = *pbuf++;
5108 *res++ = *pbuf++;
5111 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5112 res += len;
5113 rescnt -= len;
5114 while (--width >= len) {
5115 --rescnt;
5116 *res++ = ' ';
5118 if (dict && (argidx < arglen) && c != '%') {
5119 PyErr_SetString(PyExc_TypeError,
5120 "not all arguments converted");
5121 goto onError;
5123 Py_XDECREF(temp);
5124 } /* '%' */
5125 } /* until end */
5126 if (argidx < arglen && !dict) {
5127 PyErr_SetString(PyExc_TypeError,
5128 "not all arguments converted");
5129 goto onError;
5132 if (args_owned) {
5133 Py_DECREF(args);
5135 Py_DECREF(uformat);
5136 if (_PyUnicode_Resize(result, reslen - rescnt))
5137 goto onError;
5138 return (PyObject *)result;
5140 onError:
5141 Py_XDECREF(result);
5142 Py_DECREF(uformat);
5143 if (args_owned) {
5144 Py_DECREF(args);
5146 return NULL;
/* Buffer protocol slots for unicode objects, in the order: read buffer,
   write buffer (always fails), segment count, character buffer. */
5149 static PyBufferProcs unicode_as_buffer = {
5150 (getreadbufferproc) unicode_buffer_getreadbuf,
5151 (getwritebufferproc) unicode_buffer_getwritebuf,
5152 (getsegcountproc) unicode_buffer_getsegcount,
5153 (getcharbufferproc) unicode_buffer_getcharbuf,
/* Type object for unicode.  It wires up the handlers defined in this
   file: attribute lookup through unicode_getattr (tp_getattr), the
   sequence and buffer slot tables, comparison, repr, hash and str.
   No number or mapping slot tables are installed. */
5156 PyTypeObject PyUnicode_Type = {
5157 PyObject_HEAD_INIT(&PyType_Type)
5158 0, /* ob_size */
5159 "unicode", /* tp_name */
5160 sizeof(PyUnicodeObject), /* tp_size */
5161 0, /* tp_itemsize */
5162 /* Slots */
5163 (destructor)_PyUnicode_Free, /* tp_dealloc */
5164 0, /* tp_print */
5165 (getattrfunc)unicode_getattr, /* tp_getattr */
5166 0, /* tp_setattr */
5167 (cmpfunc) unicode_compare, /* tp_compare */
5168 (reprfunc) unicode_repr, /* tp_repr */
5169 0, /* tp_as_number */
5170 &unicode_as_sequence, /* tp_as_sequence */
5171 0, /* tp_as_mapping */
5172 (hashfunc) unicode_hash, /* tp_hash*/
5173 0, /* tp_call*/
5174 (reprfunc) unicode_str, /* tp_str */
5175 (getattrofunc) NULL, /* tp_getattro */
5176 (setattrofunc) NULL, /* tp_setattro */
5177 &unicode_as_buffer, /* tp_as_buffer */
5178 Py_TPFLAGS_DEFAULT, /* tp_flags */
5181 /* Initialize the Unicode implementation */
5183 void _PyUnicode_Init(void)
5185 /* Doublecheck the configuration... */
5186 if (sizeof(Py_UNICODE) != 2)
5187 Py_FatalError("Unicode configuration error: "
5188 "sizeof(Py_UNICODE) != 2 bytes");
5190 /* Init the implementation */
5191 unicode_freelist = NULL;
5192 unicode_freelist_size = 0;
5193 unicode_empty = _PyUnicode_New(0);
5194 strcpy(unicode_default_encoding, "ascii");
5197 /* Finalize the Unicode implementation */
5199 void
5200 _PyUnicode_Fini(void)
5202 PyUnicodeObject *u;
5204 Py_XDECREF(unicode_empty);
5205 unicode_empty = NULL;
5207 for (u = unicode_freelist; u != NULL;) {
5208 PyUnicodeObject *v = u;
5209 u = *(PyUnicodeObject **)u;
5210 if (v->str)
5211 PyMem_DEL(v->str);
5212 Py_XDECREF(v->defenc);
5213 PyObject_DEL(v);
5215 unicode_freelist = NULL;
5216 unicode_freelist_size = 0;