Bump version to 0.9.1.
[python/dscho.git] / Objects / unicodeobject.c
blobf4dc9bfe7e60ea822edd5a1f8a2fb7c97f192ab0
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
10 Original header:
11 --------------------------------------------------------------------
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
29 * Written by Fredrik Lundh, January 1999.
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
65 #include "Python.h"
67 #include "unicodeobject.h"
68 #include "ucnhash.h"
70 #if defined(HAVE_LIMITS_H)
71 #include <limits.h>
72 #else
73 #define INT_MAX 2147483647
74 #endif
76 #ifdef MS_WIN32
77 #include <windows.h>
78 #endif
80 /* Limit for the Unicode object free list */
82 #define MAX_UNICODE_FREELIST_SIZE 1024
84 /* Limit for the Unicode object free list stay alive optimization.
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
90 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
91 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
92 malloc()-overhead) bytes of unused garbage.
94 Setting the limit to 0 effectively turns the feature off.
96 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
101 #define KEEPALIVE_SIZE_LIMIT 9
103 /* Endianness switches; defaults to little endian */
105 #ifdef WORDS_BIGENDIAN
106 # define BYTEORDER_IS_BIG_ENDIAN
107 #else
108 # define BYTEORDER_IS_LITTLE_ENDIAN
109 #endif
111 /* --- Globals ------------------------------------------------------------
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
118 /* The empty Unicode object */
119 static PyUnicodeObject *unicode_empty;
121 /* Free list for Unicode objects */
122 static PyUnicodeObject *unicode_freelist;
123 static int unicode_freelist_size;
125 /* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
133 static char unicode_default_encoding[100];
135 /* --- Unicode Object ----------------------------------------------------- */
137 static
138 int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
141 void *oldstr;
143 /* Shortcut if there's nothing much to do. */
144 if (unicode->length == length)
145 goto reset;
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
163 unicode->str[length] = 0;
164 unicode->length = length;
166 reset:
167 /* Reset the object caches */
168 if (unicode->defenc) {
169 Py_DECREF(unicode->defenc);
170 unicode->defenc = NULL;
172 unicode->hash = -1;
174 return 0;
177 int PyUnicode_Resize(PyObject **unicode,
178 int length)
180 PyUnicodeObject *v;
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
191 return _PyUnicode_Resize(v, length);
194 /* We allocate one more byte to make sure the string is
195 Ux0000 terminated -- XXX is this needed ?
197 XXX This allocator could further be enhanced by assuring that the
198 free list never reduces its size below 1.
202 static
203 PyUnicodeObject *_PyUnicode_New(int length)
205 register PyUnicodeObject *unicode;
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
216 unicode_freelist = *(PyUnicodeObject **)unicode;
217 unicode_freelist_size--;
218 if (unicode->str) {
219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
222 _PyUnicode_Resize(unicode, length)) {
223 PyMem_DEL(unicode->str);
224 goto onError;
227 else {
228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
230 PyObject_INIT(unicode, &PyUnicode_Type);
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
239 if (!unicode->str) {
240 PyErr_NoMemory();
241 goto onError;
243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
246 unicode->defenc = NULL;
247 return unicode;
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
251 PyObject_DEL(unicode);
252 return NULL;
255 static
256 void _PyUnicode_Free(register PyUnicodeObject *unicode)
258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
261 PyMem_DEL(unicode->str);
262 unicode->str = NULL;
263 unicode->length = 0;
265 if (unicode->defenc) {
266 Py_DECREF(unicode->defenc);
267 unicode->defenc = NULL;
269 /* Add to free list */
270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
274 else {
275 PyMem_DEL(unicode->str);
276 Py_XDECREF(unicode->defenc);
277 PyObject_DEL(unicode);
281 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
284 PyUnicodeObject *unicode;
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
294 return (PyObject *)unicode;
297 #ifdef HAVE_WCHAR_H
299 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
302 PyUnicodeObject *unicode;
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
313 /* Copy the wchar_t data into the new object */
314 #ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316 #else
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
324 #endif
326 return (PyObject *)unicode;
329 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339 #ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341 #else
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
349 #endif
351 return size;
354 #endif
356 PyObject *PyUnicode_FromObject(register PyObject *obj)
358 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
361 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
362 const char *encoding,
363 const char *errors)
365 const char *s;
366 int len;
367 int owned = 0;
368 PyObject *v;
370 if (obj == NULL) {
371 PyErr_BadInternalCall();
372 return NULL;
375 /* Coerce object */
376 if (PyInstance_Check(obj)) {
377 PyObject *func;
378 func = PyObject_GetAttrString(obj, "__str__");
379 if (func == NULL) {
380 PyErr_SetString(PyExc_TypeError,
381 "coercing to Unicode: instance doesn't define __str__");
382 return NULL;
384 obj = PyEval_CallObject(func, NULL);
385 Py_DECREF(func);
386 if (obj == NULL)
387 return NULL;
388 owned = 1;
390 if (PyUnicode_Check(obj)) {
391 Py_INCREF(obj);
392 v = obj;
393 if (encoding) {
394 PyErr_SetString(PyExc_TypeError,
395 "decoding Unicode is not supported");
396 return NULL;
398 goto done;
400 else if (PyString_Check(obj)) {
401 s = PyString_AS_STRING(obj);
402 len = PyString_GET_SIZE(obj);
404 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError))
408 PyErr_Format(PyExc_TypeError,
409 "coercing to Unicode: need string or buffer, "
410 "%.80s found",
411 obj->ob_type->tp_name);
412 goto onError;
415 /* Convert to Unicode */
416 if (len == 0) {
417 Py_INCREF(unicode_empty);
418 v = (PyObject *)unicode_empty;
420 else
421 v = PyUnicode_Decode(s, len, encoding, errors);
422 done:
423 if (owned) {
424 Py_DECREF(obj);
426 return v;
428 onError:
429 if (owned) {
430 Py_DECREF(obj);
432 return NULL;
435 PyObject *PyUnicode_Decode(const char *s,
436 int size,
437 const char *encoding,
438 const char *errors)
440 PyObject *buffer = NULL, *unicode;
442 if (encoding == NULL)
443 encoding = PyUnicode_GetDefaultEncoding();
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding, "utf-8") == 0)
447 return PyUnicode_DecodeUTF8(s, size, errors);
448 else if (strcmp(encoding, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s, size, errors);
450 else if (strcmp(encoding, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s, size, errors);
453 /* Decode via the codec registry */
454 buffer = PyBuffer_FromMemory((void *)s, size);
455 if (buffer == NULL)
456 goto onError;
457 unicode = PyCodec_Decode(buffer, encoding, errors);
458 if (unicode == NULL)
459 goto onError;
460 if (!PyUnicode_Check(unicode)) {
461 PyErr_Format(PyExc_TypeError,
462 "decoder did not return an unicode object (type=%.400s)",
463 unicode->ob_type->tp_name);
464 Py_DECREF(unicode);
465 goto onError;
467 Py_DECREF(buffer);
468 return unicode;
470 onError:
471 Py_XDECREF(buffer);
472 return NULL;
475 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
476 int size,
477 const char *encoding,
478 const char *errors)
480 PyObject *v, *unicode;
482 unicode = PyUnicode_FromUnicode(s, size);
483 if (unicode == NULL)
484 return NULL;
485 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
486 Py_DECREF(unicode);
487 return v;
490 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
491 const char *encoding,
492 const char *errors)
494 PyObject *v;
496 if (!PyUnicode_Check(unicode)) {
497 PyErr_BadArgument();
498 goto onError;
501 if (encoding == NULL)
502 encoding = PyUnicode_GetDefaultEncoding();
504 /* Shortcuts for common default encodings */
505 if (errors == NULL) {
506 if (strcmp(encoding, "utf-8") == 0)
507 return PyUnicode_AsUTF8String(unicode);
508 else if (strcmp(encoding, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode);
510 else if (strcmp(encoding, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode);
514 /* Encode via the codec registry */
515 v = PyCodec_Encode(unicode, encoding, errors);
516 if (v == NULL)
517 goto onError;
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v)) {
520 PyErr_Format(PyExc_TypeError,
521 "encoder did not return a string object (type=%.400s)",
522 v->ob_type->tp_name);
523 Py_DECREF(v);
524 goto onError;
526 return v;
528 onError:
529 return NULL;
532 /* Return a Python string holding the default encoded value of the
533 Unicode object.
535 The resulting string is cached in the Unicode object for subsequent
536 usage by this function. The cached version is needed to implement
537 the character buffer interface and will live (at least) as long as
538 the Unicode object itself.
540 The refcount of the string is *not* incremented.
542 *** Exported for internal use by the interpreter only !!! ***
546 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
547 const char *errors)
549 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
551 if (v)
552 return v;
553 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
554 if (v && errors == NULL)
555 ((PyUnicodeObject *)unicode)->defenc = v;
556 return v;
559 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
561 if (!PyUnicode_Check(unicode)) {
562 PyErr_BadArgument();
563 goto onError;
565 return PyUnicode_AS_UNICODE(unicode);
567 onError:
568 return NULL;
571 int PyUnicode_GetSize(PyObject *unicode)
573 if (!PyUnicode_Check(unicode)) {
574 PyErr_BadArgument();
575 goto onError;
577 return PyUnicode_GET_SIZE(unicode);
579 onError:
580 return -1;
583 const char *PyUnicode_GetDefaultEncoding(void)
585 return unicode_default_encoding;
588 int PyUnicode_SetDefaultEncoding(const char *encoding)
590 PyObject *v;
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v = _PyCodec_Lookup(encoding);
595 if (v == NULL)
596 goto onError;
597 Py_DECREF(v);
598 strncpy(unicode_default_encoding,
599 encoding,
600 sizeof(unicode_default_encoding));
601 return 0;
603 onError:
604 return -1;
607 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (US-ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; FE/FF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
631 static
632 int utf8_decoding_error(const char **source,
633 Py_UNICODE **dest,
634 const char *errors,
635 const char *details)
637 if ((errors == NULL) ||
638 (strcmp(errors,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError,
640 "UTF-8 decoding error: %.400s",
641 details);
642 return -1;
644 else if (strcmp(errors,"ignore") == 0) {
645 (*source)++;
646 return 0;
648 else if (strcmp(errors,"replace") == 0) {
649 (*source)++;
650 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
651 (*dest)++;
652 return 0;
654 else {
655 PyErr_Format(PyExc_ValueError,
656 "UTF-8 decoding error; unknown error handling code: %.400s",
657 errors);
658 return -1;
662 PyObject *PyUnicode_DecodeUTF8(const char *s,
663 int size,
664 const char *errors)
666 int n;
667 const char *e;
668 PyUnicodeObject *unicode;
669 Py_UNICODE *p;
670 const char *errmsg = "";
672 /* Note: size will always be longer than the resulting Unicode
673 character count */
674 unicode = _PyUnicode_New(size);
675 if (!unicode)
676 return NULL;
677 if (size == 0)
678 return (PyObject *)unicode;
680 /* Unpack UTF-8 encoded data */
681 p = unicode->str;
682 e = s + size;
684 while (s < e) {
685 Py_UCS4 ch = (unsigned char)*s;
687 if (ch < 0x80) {
688 *p++ = (Py_UNICODE)ch;
689 s++;
690 continue;
693 n = utf8_code_length[ch];
695 if (s + n > e) {
696 errmsg = "unexpected end of data";
697 goto utf8Error;
700 switch (n) {
702 case 0:
703 errmsg = "unexpected code byte";
704 goto utf8Error;
705 break;
707 case 1:
708 errmsg = "internal error";
709 goto utf8Error;
710 break;
712 case 2:
713 if ((s[1] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
717 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
718 if (ch < 0x80) {
719 errmsg = "illegal encoding";
720 goto utf8Error;
722 else
723 *p++ = (Py_UNICODE)ch;
724 break;
726 case 3:
727 if ((s[1] & 0xc0) != 0x80 ||
728 (s[2] & 0xc0) != 0x80) {
729 errmsg = "invalid data";
730 goto utf8Error;
732 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
733 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
734 errmsg = "illegal encoding";
735 goto utf8Error;
737 else
738 *p++ = (Py_UNICODE)ch;
739 break;
741 case 4:
742 if ((s[1] & 0xc0) != 0x80 ||
743 (s[2] & 0xc0) != 0x80 ||
744 (s[3] & 0xc0) != 0x80) {
745 errmsg = "invalid data";
746 goto utf8Error;
748 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
749 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
750 /* validate and convert to UTF-16 */
751 if ((ch < 0x10000) || /* minimum value allowed for 4
752 byte encoding */
753 (ch > 0x10ffff)) { /* maximum value allowed for
754 UTF-16 */
755 errmsg = "illegal encoding";
756 goto utf8Error;
758 /* compute and append the two surrogates: */
760 /* translate from 10000..10FFFF to 0..FFFF */
761 ch -= 0x10000;
763 /* high surrogate = top 10 bits added to D800 */
764 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
768 break;
770 default:
771 /* Other sizes are only needed for UCS-4 */
772 errmsg = "unsupported Unicode code range";
773 goto utf8Error;
774 break;
776 s += n;
777 continue;
779 utf8Error:
780 if (utf8_decoding_error(&s, &p, errors, errmsg))
781 goto onError;
784 /* Adjust length */
785 if (_PyUnicode_Resize(unicode, p - unicode->str))
786 goto onError;
788 return (PyObject *)unicode;
790 onError:
791 Py_DECREF(unicode);
792 return NULL;
795 /* Not used anymore, now that the encoder supports UTF-16
796 surrogates. */
797 #if 0
798 static
799 int utf8_encoding_error(const Py_UNICODE **source,
800 char **dest,
801 const char *errors,
802 const char *details)
804 if ((errors == NULL) ||
805 (strcmp(errors,"strict") == 0)) {
806 PyErr_Format(PyExc_UnicodeError,
807 "UTF-8 encoding error: %.400s",
808 details);
809 return -1;
811 else if (strcmp(errors,"ignore") == 0) {
812 return 0;
814 else if (strcmp(errors,"replace") == 0) {
815 **dest = '?';
816 (*dest)++;
817 return 0;
819 else {
820 PyErr_Format(PyExc_ValueError,
821 "UTF-8 encoding error; "
822 "unknown error handling code: %.400s",
823 errors);
824 return -1;
827 #endif
829 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
830 int size,
831 const char *errors)
833 PyObject *v;
834 char *p;
835 char *q;
836 Py_UCS4 ch2;
837 unsigned int cbAllocated = 3 * size;
838 unsigned int cbWritten = 0;
839 int i = 0;
841 v = PyString_FromStringAndSize(NULL, cbAllocated);
842 if (v == NULL)
843 return NULL;
844 if (size == 0)
845 return v;
847 p = q = PyString_AS_STRING(v);
848 while (i < size) {
849 Py_UCS4 ch = s[i++];
850 if (ch < 0x80) {
851 *p++ = (char) ch;
852 cbWritten++;
854 else if (ch < 0x0800) {
855 *p++ = 0xc0 | (ch >> 6);
856 *p++ = 0x80 | (ch & 0x3f);
857 cbWritten += 2;
859 else {
860 /* Check for high surrogate */
861 if (0xD800 <= ch && ch <= 0xDBFF) {
862 if (i != size) {
863 ch2 = s[i];
864 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
866 if (cbWritten >= (cbAllocated - 4)) {
867 /* Provide enough room for some more
868 surrogates */
869 cbAllocated += 4*10;
870 if (_PyString_Resize(&v, cbAllocated))
871 goto onError;
874 /* combine the two values */
875 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
877 *p++ = (char)((ch >> 18) | 0xf0);
878 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
879 i++;
880 cbWritten += 4;
884 else {
885 *p++ = (char)(0xe0 | (ch >> 12));
886 cbWritten += 3;
888 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
889 *p++ = (char)(0x80 | (ch & 0x3f));
892 *p = '\0';
893 if (_PyString_Resize(&v, p - q))
894 goto onError;
895 return v;
897 onError:
898 Py_DECREF(v);
899 return NULL;
902 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
904 PyObject *str;
906 if (!PyUnicode_Check(unicode)) {
907 PyErr_BadArgument();
908 return NULL;
910 str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
911 PyUnicode_GET_SIZE(unicode),
912 NULL);
913 if (str == NULL)
914 return NULL;
915 Py_INCREF(str);
916 return str;
919 /* --- UTF-16 Codec ------------------------------------------------------- */
921 static
922 int utf16_decoding_error(const Py_UNICODE **source,
923 Py_UNICODE **dest,
924 const char *errors,
925 const char *details)
927 if ((errors == NULL) ||
928 (strcmp(errors,"strict") == 0)) {
929 PyErr_Format(PyExc_UnicodeError,
930 "UTF-16 decoding error: %.400s",
931 details);
932 return -1;
934 else if (strcmp(errors,"ignore") == 0) {
935 return 0;
937 else if (strcmp(errors,"replace") == 0) {
938 if (dest) {
939 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
940 (*dest)++;
942 return 0;
944 else {
945 PyErr_Format(PyExc_ValueError,
946 "UTF-16 decoding error; "
947 "unknown error handling code: %.400s",
948 errors);
949 return -1;
953 PyObject *PyUnicode_DecodeUTF16(const char *s,
954 int size,
955 const char *errors,
956 int *byteorder)
958 PyUnicodeObject *unicode;
959 Py_UNICODE *p;
960 const Py_UNICODE *q, *e;
961 int bo = 0;
962 const char *errmsg = "";
964 /* size should be an even number */
965 if (size % sizeof(Py_UNICODE) != 0) {
966 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
967 return NULL;
968 /* The remaining input chars are ignored if we fall through
969 here... */
972 /* Note: size will always be longer than the resulting Unicode
973 character count */
974 unicode = _PyUnicode_New(size);
975 if (!unicode)
976 return NULL;
977 if (size == 0)
978 return (PyObject *)unicode;
980 /* Unpack UTF-16 encoded data */
981 p = unicode->str;
982 q = (Py_UNICODE *)s;
983 e = q + (size / sizeof(Py_UNICODE));
985 if (byteorder)
986 bo = *byteorder;
988 while (q < e) {
989 register Py_UNICODE ch = *q++;
991 /* Check for BOM marks (U+FEFF) in the input and adjust
992 current byte order setting accordingly. Swap input
993 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
994 !) */
995 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
996 if (ch == 0xFEFF) {
997 bo = -1;
998 continue;
999 } else if (ch == 0xFFFE) {
1000 bo = 1;
1001 continue;
1003 if (bo == 1)
1004 ch = (ch >> 8) | (ch << 8);
1005 #else
1006 if (ch == 0xFEFF) {
1007 bo = 1;
1008 continue;
1009 } else if (ch == 0xFFFE) {
1010 bo = -1;
1011 continue;
1013 if (bo == -1)
1014 ch = (ch >> 8) | (ch << 8);
1015 #endif
1016 if (ch < 0xD800 || ch > 0xDFFF) {
1017 *p++ = ch;
1018 continue;
1021 /* UTF-16 code pair: */
1022 if (q >= e) {
1023 errmsg = "unexpected end of data";
1024 goto utf16Error;
1026 if (0xDC00 <= *q && *q <= 0xDFFF) {
1027 q++;
1028 if (0xD800 <= *q && *q <= 0xDBFF) {
1029 /* This is valid data (a UTF-16 surrogate pair), but
1030 we are not able to store this information since our
1031 Py_UNICODE type only has 16 bits... this might
1032 change someday, even though it's unlikely. */
1033 errmsg = "code pairs are not supported";
1034 goto utf16Error;
1036 else
1037 continue;
1039 errmsg = "illegal encoding";
1040 /* Fall through to report the error */
1042 utf16Error:
1043 if (utf16_decoding_error(&q, &p, errors, errmsg))
1044 goto onError;
1047 if (byteorder)
1048 *byteorder = bo;
1050 /* Adjust length */
1051 if (_PyUnicode_Resize(unicode, p - unicode->str))
1052 goto onError;
1054 return (PyObject *)unicode;
1056 onError:
1057 Py_DECREF(unicode);
1058 return NULL;
1061 #undef UTF16_ERROR
1063 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1064 int size,
1065 const char *errors,
1066 int byteorder)
1068 PyObject *v;
1069 Py_UNICODE *p;
1070 char *q;
1072 /* We don't create UTF-16 pairs... */
1073 v = PyString_FromStringAndSize(NULL,
1074 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1075 if (v == NULL)
1076 return NULL;
1078 q = PyString_AS_STRING(v);
1079 p = (Py_UNICODE *)q;
1080 if (byteorder == 0)
1081 *p++ = 0xFEFF;
1082 if (size == 0)
1083 return v;
1084 if (byteorder == 0 ||
1085 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1086 byteorder == -1
1087 #else
1088 byteorder == 1
1089 #endif
1091 memcpy(p, s, size * sizeof(Py_UNICODE));
1092 else
1093 while (size-- > 0) {
1094 Py_UNICODE ch = *s++;
1095 *p++ = (ch >> 8) | (ch << 8);
1097 return v;
1100 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1102 if (!PyUnicode_Check(unicode)) {
1103 PyErr_BadArgument();
1104 return NULL;
1106 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1107 PyUnicode_GET_SIZE(unicode),
1108 NULL,
1112 /* --- Unicode Escape Codec ----------------------------------------------- */
1114 static
1115 int unicodeescape_decoding_error(const char **source,
1116 Py_UNICODE *x,
1117 const char *errors,
1118 const char *details)
1120 if ((errors == NULL) ||
1121 (strcmp(errors,"strict") == 0)) {
1122 PyErr_Format(PyExc_UnicodeError,
1123 "Unicode-Escape decoding error: %.400s",
1124 details);
1125 return -1;
1127 else if (strcmp(errors,"ignore") == 0) {
1128 return 0;
1130 else if (strcmp(errors,"replace") == 0) {
1131 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1132 return 0;
1134 else {
1135 PyErr_Format(PyExc_ValueError,
1136 "Unicode-Escape decoding error; "
1137 "unknown error handling code: %.400s",
1138 errors);
1139 return -1;
1143 static _Py_UCNHashAPI *pucnHash = NULL;
/* Compare at most `count` characters of `s1` and `s2`, ignoring case.

   Returns 0 when `count` is 0 or all compared characters are equal;
   otherwise the difference between the first pair of lower-cased
   characters that differ.  The comparison does not stop at a NUL
   character unless the two strings differ there. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        /* tolower() is only defined for unsigned char values and EOF,
           so plain (possibly negative) chars must be converted first */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
    } while (--count && c1 == c2);

    return c1 - c2;
}
1165 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1166 int size,
1167 const char *errors)
1169 PyUnicodeObject *v;
1170 Py_UNICODE *p = NULL, *buf = NULL;
1171 const char *end;
1173 /* Escaped strings will always be longer than the resulting
1174 Unicode string, so we start with size here and then reduce the
1175 length after conversion to the true value. */
1176 v = _PyUnicode_New(size);
1177 if (v == NULL)
1178 goto onError;
1179 if (size == 0)
1180 return (PyObject *)v;
1181 p = buf = PyUnicode_AS_UNICODE(v);
1182 end = s + size;
1183 while (s < end) {
1184 unsigned char c;
1185 Py_UNICODE x;
1186 int i;
1188 /* Non-escape characters are interpreted as Unicode ordinals */
1189 if (*s != '\\') {
1190 *p++ = (unsigned char)*s++;
1191 continue;
1194 /* \ - Escapes */
1195 s++;
1196 switch (*s++) {
1198 /* \x escapes */
1199 case '\n': break;
1200 case '\\': *p++ = '\\'; break;
1201 case '\'': *p++ = '\''; break;
1202 case '\"': *p++ = '\"'; break;
1203 case 'b': *p++ = '\b'; break;
1204 case 'f': *p++ = '\014'; break; /* FF */
1205 case 't': *p++ = '\t'; break;
1206 case 'n': *p++ = '\n'; break;
1207 case 'r': *p++ = '\r'; break;
1208 case 'v': *p++ = '\013'; break; /* VT */
1209 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1211 /* \OOO (octal) escapes */
1212 case '0': case '1': case '2': case '3':
1213 case '4': case '5': case '6': case '7':
1214 x = s[-1] - '0';
1215 if ('0' <= *s && *s <= '7') {
1216 x = (x<<3) + *s++ - '0';
1217 if ('0' <= *s && *s <= '7')
1218 x = (x<<3) + *s++ - '0';
1220 *p++ = x;
1221 break;
1223 /* \xXXXX escape with 1-n hex digits. for compatibility
1224 with 8-bit strings, this code ignores all but the last
1225 two digits */
1226 case 'x':
1227 x = 0;
1228 c = (unsigned char)*s;
1229 if (isxdigit(c)) {
1230 do {
1231 x = (x<<4) & 0xF0;
1232 if ('0' <= c && c <= '9')
1233 x += c - '0';
1234 else if ('a' <= c && c <= 'f')
1235 x += 10 + c - 'a';
1236 else
1237 x += 10 + c - 'A';
1238 c = (unsigned char)*++s;
1239 } while (isxdigit(c));
1240 *p++ = (unsigned char) x;
1241 } else {
1242 *p++ = '\\';
1243 *p++ = (unsigned char)s[-1];
1245 break;
1247 /* \uXXXX with 4 hex digits */
1248 case 'u':
1249 for (x = 0, i = 0; i < 4; i++) {
1250 c = (unsigned char)s[i];
1251 if (!isxdigit(c)) {
1252 if (unicodeescape_decoding_error(&s, &x, errors,
1253 "truncated \\uXXXX"))
1254 goto onError;
1255 i++;
1256 break;
1258 x = (x<<4) & ~0xF;
1259 if (c >= '0' && c <= '9')
1260 x += c - '0';
1261 else if (c >= 'a' && c <= 'f')
1262 x += 10 + c - 'a';
1263 else
1264 x += 10 + c - 'A';
1266 s += i;
1267 *p++ = x;
1268 break;
1270 case 'N':
1271 /* Ok, we need to deal with Unicode Character Names now,
1272 * make sure we've imported the hash table data...
1274 if (pucnHash == NULL)
1276 PyObject *mod = 0, *v = 0;
1278 mod = PyImport_ImportModule("ucnhash");
1279 if (mod == NULL)
1280 goto onError;
1281 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1282 Py_DECREF(mod);
1283 if (v == NULL)
1285 goto onError;
1287 pucnHash = PyCObject_AsVoidPtr(v);
1288 Py_DECREF(v);
1289 if (pucnHash == NULL)
1291 goto onError;
1295 if (*s == '{')
1297 const char *start = s + 1;
1298 const char *endBrace = start;
1299 Py_UCS4 value;
1300 unsigned long j;
1302 /* look for either the closing brace, or we
1303 * exceed the maximum length of the unicode character names
1305 while (*endBrace != '}' &&
1306 (unsigned int)(endBrace - start) <=
1307 pucnHash->cchMax &&
1308 endBrace < end)
1310 endBrace++;
1312 if (endBrace != end && *endBrace == '}')
1314 j = pucnHash->hash(start, endBrace - start);
1315 if (j > pucnHash->cKeys ||
1316 mystrnicmp(
1317 start,
1318 ((_Py_UnicodeCharacterName *)
1319 (pucnHash->getValue(j)))->pszUCN,
1320 (int)(endBrace - start)) != 0)
1322 if (unicodeescape_decoding_error(
1323 &s, &x, errors,
1324 "Invalid Unicode Character Name"))
1326 goto onError;
1328 goto ucnFallthrough;
1330 value = ((_Py_UnicodeCharacterName *)
1331 (pucnHash->getValue(j)))->value;
1332 if (value < 1<<16)
1334 /* In UCS-2 range, easy solution.. */
1335 *p++ = value;
1337 else
1339 /* Oops, its in UCS-4 space, */
1340 /* compute and append the two surrogates: */
1341 /* translate from 10000..10FFFF to 0..FFFFF */
1342 value -= 0x10000;
1344 /* high surrogate = top 10 bits added to D800 */
1345 *p++ = 0xD800 + (value >> 10);
1347 /* low surrogate = bottom 10 bits added to DC00 */
1348 *p++ = 0xDC00 + (value & ~0xFC00);
1350 s = endBrace + 1;
1352 else
1354 if (unicodeescape_decoding_error(
1355 &s, &x, errors,
1356 "Unicode name missing closing brace"))
1357 goto onError;
1358 goto ucnFallthrough;
1360 break;
1362 if (unicodeescape_decoding_error(
1363 &s, &x, errors,
1364 "Missing opening brace for Unicode Character Name escape"))
1365 goto onError;
1366 ucnFallthrough:
1367 /* fall through on purpose */
1368 default:
1369 *p++ = '\\';
1370 *p++ = (unsigned char)s[-1];
1371 break;
1374 if (_PyUnicode_Resize(v, (int)(p - buf)))
1375 goto onError;
1376 return (PyObject *)v;
1378 onError:
1379 Py_XDECREF(v);
1380 return NULL;
1383 /* Return a Unicode-Escape string version of the Unicode object.
1385 If quotes is true, the string is enclosed in u"" or u'' quotes as
1386 appropriate.
1390 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1391 int size,
1392 Py_UNICODE ch);
1394 static
1395 PyObject *unicodeescape_string(const Py_UNICODE *s,
1396 int size,
1397 int quotes)
1399 PyObject *repr;
1400 char *p;
1401 char *q;
1403 static const char *hexdigit = "0123456789ABCDEF";
1405 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1406 if (repr == NULL)
1407 return NULL;
1409 p = q = PyString_AS_STRING(repr);
1411 if (quotes) {
1412 *p++ = 'u';
1413 *p++ = (findchar(s, size, '\'') &&
1414 !findchar(s, size, '"')) ? '"' : '\'';
1416 while (size-- > 0) {
1417 Py_UNICODE ch = *s++;
1418 /* Escape quotes */
1419 if (quotes && (ch == q[1] || ch == '\\')) {
1420 *p++ = '\\';
1421 *p++ = (char) ch;
1423 /* Map 16-bit characters to '\uxxxx' */
1424 else if (ch >= 256) {
1425 *p++ = '\\';
1426 *p++ = 'u';
1427 *p++ = hexdigit[(ch >> 12) & 0xf];
1428 *p++ = hexdigit[(ch >> 8) & 0xf];
1429 *p++ = hexdigit[(ch >> 4) & 0xf];
1430 *p++ = hexdigit[ch & 15];
1432 /* Map non-printable US ASCII to '\ooo' */
1433 else if (ch < ' ' || ch >= 128) {
1434 *p++ = '\\';
1435 *p++ = hexdigit[(ch >> 6) & 7];
1436 *p++ = hexdigit[(ch >> 3) & 7];
1437 *p++ = hexdigit[ch & 7];
1439 /* Copy everything else as-is */
1440 else
1441 *p++ = (char) ch;
1443 if (quotes)
1444 *p++ = q[1];
1446 *p = '\0';
1447 if (_PyString_Resize(&repr, p - q))
1448 goto onError;
1450 return repr;
1452 onError:
1453 Py_DECREF(repr);
1454 return NULL;
1457 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1458 int size)
1460 return unicodeescape_string(s, size, 0);
1463 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1465 if (!PyUnicode_Check(unicode)) {
1466 PyErr_BadArgument();
1467 return NULL;
1469 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1470 PyUnicode_GET_SIZE(unicode));
1473 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1475 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1476 int size,
1477 const char *errors)
1479 PyUnicodeObject *v;
1480 Py_UNICODE *p, *buf;
1481 const char *end;
1482 const char *bs;
1484 /* Escaped strings will always be longer than the resulting
1485 Unicode string, so we start with size here and then reduce the
1486 length after conversion to the true value. */
1487 v = _PyUnicode_New(size);
1488 if (v == NULL)
1489 goto onError;
1490 if (size == 0)
1491 return (PyObject *)v;
1492 p = buf = PyUnicode_AS_UNICODE(v);
1493 end = s + size;
1494 while (s < end) {
1495 unsigned char c;
1496 Py_UNICODE x;
1497 int i;
1499 /* Non-escape characters are interpreted as Unicode ordinals */
1500 if (*s != '\\') {
1501 *p++ = (unsigned char)*s++;
1502 continue;
1505 /* \u-escapes are only interpreted iff the number of leading
1506 backslashes if odd */
1507 bs = s;
1508 for (;s < end;) {
1509 if (*s != '\\')
1510 break;
1511 *p++ = (unsigned char)*s++;
1513 if (((s - bs) & 1) == 0 ||
1514 s >= end ||
1515 *s != 'u') {
1516 continue;
1518 p--;
1519 s++;
1521 /* \uXXXX with 4 hex digits */
1522 for (x = 0, i = 0; i < 4; i++) {
1523 c = (unsigned char)s[i];
1524 if (!isxdigit(c)) {
1525 if (unicodeescape_decoding_error(&s, &x, errors,
1526 "truncated \\uXXXX"))
1527 goto onError;
1528 i++;
1529 break;
1531 x = (x<<4) & ~0xF;
1532 if (c >= '0' && c <= '9')
1533 x += c - '0';
1534 else if (c >= 'a' && c <= 'f')
1535 x += 10 + c - 'a';
1536 else
1537 x += 10 + c - 'A';
1539 s += i;
1540 *p++ = x;
1542 if (_PyUnicode_Resize(v, (int)(p - buf)))
1543 goto onError;
1544 return (PyObject *)v;
1546 onError:
1547 Py_XDECREF(v);
1548 return NULL;
1551 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1552 int size)
1554 PyObject *repr;
1555 char *p;
1556 char *q;
1558 static const char *hexdigit = "0123456789ABCDEF";
1560 repr = PyString_FromStringAndSize(NULL, 6 * size);
1561 if (repr == NULL)
1562 return NULL;
1563 if (size == 0)
1564 return repr;
1566 p = q = PyString_AS_STRING(repr);
1567 while (size-- > 0) {
1568 Py_UNICODE ch = *s++;
1569 /* Map 16-bit characters to '\uxxxx' */
1570 if (ch >= 256) {
1571 *p++ = '\\';
1572 *p++ = 'u';
1573 *p++ = hexdigit[(ch >> 12) & 0xf];
1574 *p++ = hexdigit[(ch >> 8) & 0xf];
1575 *p++ = hexdigit[(ch >> 4) & 0xf];
1576 *p++ = hexdigit[ch & 15];
1578 /* Copy everything else as-is */
1579 else
1580 *p++ = (char) ch;
1582 *p = '\0';
1583 if (_PyString_Resize(&repr, p - q))
1584 goto onError;
1586 return repr;
1588 onError:
1589 Py_DECREF(repr);
1590 return NULL;
1593 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1595 if (!PyUnicode_Check(unicode)) {
1596 PyErr_BadArgument();
1597 return NULL;
1599 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1600 PyUnicode_GET_SIZE(unicode));
1603 /* --- Latin-1 Codec ------------------------------------------------------ */
1605 PyObject *PyUnicode_DecodeLatin1(const char *s,
1606 int size,
1607 const char *errors)
1609 PyUnicodeObject *v;
1610 Py_UNICODE *p;
1612 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1613 v = _PyUnicode_New(size);
1614 if (v == NULL)
1615 goto onError;
1616 if (size == 0)
1617 return (PyObject *)v;
1618 p = PyUnicode_AS_UNICODE(v);
1619 while (size-- > 0)
1620 *p++ = (unsigned char)*s++;
1621 return (PyObject *)v;
1623 onError:
1624 Py_XDECREF(v);
1625 return NULL;
1628 static
1629 int latin1_encoding_error(const Py_UNICODE **source,
1630 char **dest,
1631 const char *errors,
1632 const char *details)
1634 if ((errors == NULL) ||
1635 (strcmp(errors,"strict") == 0)) {
1636 PyErr_Format(PyExc_UnicodeError,
1637 "Latin-1 encoding error: %.400s",
1638 details);
1639 return -1;
1641 else if (strcmp(errors,"ignore") == 0) {
1642 return 0;
1644 else if (strcmp(errors,"replace") == 0) {
1645 **dest = '?';
1646 (*dest)++;
1647 return 0;
1649 else {
1650 PyErr_Format(PyExc_ValueError,
1651 "Latin-1 encoding error; "
1652 "unknown error handling code: %.400s",
1653 errors);
1654 return -1;
1658 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1659 int size,
1660 const char *errors)
1662 PyObject *repr;
1663 char *s, *start;
1665 repr = PyString_FromStringAndSize(NULL, size);
1666 if (repr == NULL)
1667 return NULL;
1668 if (size == 0)
1669 return repr;
1671 s = PyString_AS_STRING(repr);
1672 start = s;
1673 while (size-- > 0) {
1674 Py_UNICODE ch = *p++;
1675 if (ch >= 256) {
1676 if (latin1_encoding_error(&p, &s, errors,
1677 "ordinal not in range(256)"))
1678 goto onError;
1680 else
1681 *s++ = (char)ch;
1683 /* Resize if error handling skipped some characters */
1684 if (s - start < PyString_GET_SIZE(repr))
1685 if (_PyString_Resize(&repr, s - start))
1686 goto onError;
1687 return repr;
1689 onError:
1690 Py_DECREF(repr);
1691 return NULL;
1694 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1696 if (!PyUnicode_Check(unicode)) {
1697 PyErr_BadArgument();
1698 return NULL;
1700 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 NULL);
1705 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1707 static
1708 int ascii_decoding_error(const char **source,
1709 Py_UNICODE **dest,
1710 const char *errors,
1711 const char *details)
1713 if ((errors == NULL) ||
1714 (strcmp(errors,"strict") == 0)) {
1715 PyErr_Format(PyExc_UnicodeError,
1716 "ASCII decoding error: %.400s",
1717 details);
1718 return -1;
1720 else if (strcmp(errors,"ignore") == 0) {
1721 return 0;
1723 else if (strcmp(errors,"replace") == 0) {
1724 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1725 (*dest)++;
1726 return 0;
1728 else {
1729 PyErr_Format(PyExc_ValueError,
1730 "ASCII decoding error; "
1731 "unknown error handling code: %.400s",
1732 errors);
1733 return -1;
1737 PyObject *PyUnicode_DecodeASCII(const char *s,
1738 int size,
1739 const char *errors)
1741 PyUnicodeObject *v;
1742 Py_UNICODE *p;
1744 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1745 v = _PyUnicode_New(size);
1746 if (v == NULL)
1747 goto onError;
1748 if (size == 0)
1749 return (PyObject *)v;
1750 p = PyUnicode_AS_UNICODE(v);
1751 while (size-- > 0) {
1752 register unsigned char c;
1754 c = (unsigned char)*s++;
1755 if (c < 128)
1756 *p++ = c;
1757 else if (ascii_decoding_error(&s, &p, errors,
1758 "ordinal not in range(128)"))
1759 goto onError;
1761 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1762 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1763 goto onError;
1764 return (PyObject *)v;
1766 onError:
1767 Py_XDECREF(v);
1768 return NULL;
1771 static
1772 int ascii_encoding_error(const Py_UNICODE **source,
1773 char **dest,
1774 const char *errors,
1775 const char *details)
1777 if ((errors == NULL) ||
1778 (strcmp(errors,"strict") == 0)) {
1779 PyErr_Format(PyExc_UnicodeError,
1780 "ASCII encoding error: %.400s",
1781 details);
1782 return -1;
1784 else if (strcmp(errors,"ignore") == 0) {
1785 return 0;
1787 else if (strcmp(errors,"replace") == 0) {
1788 **dest = '?';
1789 (*dest)++;
1790 return 0;
1792 else {
1793 PyErr_Format(PyExc_ValueError,
1794 "ASCII encoding error; "
1795 "unknown error handling code: %.400s",
1796 errors);
1797 return -1;
1801 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1802 int size,
1803 const char *errors)
1805 PyObject *repr;
1806 char *s, *start;
1808 repr = PyString_FromStringAndSize(NULL, size);
1809 if (repr == NULL)
1810 return NULL;
1811 if (size == 0)
1812 return repr;
1814 s = PyString_AS_STRING(repr);
1815 start = s;
1816 while (size-- > 0) {
1817 Py_UNICODE ch = *p++;
1818 if (ch >= 128) {
1819 if (ascii_encoding_error(&p, &s, errors,
1820 "ordinal not in range(128)"))
1821 goto onError;
1823 else
1824 *s++ = (char)ch;
1826 /* Resize if error handling skipped some characters */
1827 if (s - start < PyString_GET_SIZE(repr))
1828 if (_PyString_Resize(&repr, s - start))
1829 goto onError;
1830 return repr;
1832 onError:
1833 Py_DECREF(repr);
1834 return NULL;
1837 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1839 if (!PyUnicode_Check(unicode)) {
1840 PyErr_BadArgument();
1841 return NULL;
1843 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1844 PyUnicode_GET_SIZE(unicode),
1845 NULL);
1848 #ifdef MS_WIN32
1850 /* --- MBCS codecs for Windows -------------------------------------------- */
1852 PyObject *PyUnicode_DecodeMBCS(const char *s,
1853 int size,
1854 const char *errors)
1856 PyUnicodeObject *v;
1857 Py_UNICODE *p;
1859 /* First get the size of the result */
1860 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1861 if (size > 0 && usize==0)
1862 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1864 v = _PyUnicode_New(usize);
1865 if (v == NULL)
1866 return NULL;
1867 if (usize == 0)
1868 return (PyObject *)v;
1869 p = PyUnicode_AS_UNICODE(v);
1870 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1871 Py_DECREF(v);
1872 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1875 return (PyObject *)v;
1878 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1879 int size,
1880 const char *errors)
1882 PyObject *repr;
1883 char *s;
1884 DWORD mbcssize;
1886 /* If there are no characters, bail now! */
1887 if (size==0)
1888 return PyString_FromString("");
1890 /* First get the size of the result */
1891 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1892 if (mbcssize==0)
1893 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1895 repr = PyString_FromStringAndSize(NULL, mbcssize);
1896 if (repr == NULL)
1897 return NULL;
1898 if (mbcssize == 0)
1899 return repr;
1901 /* Do the conversion */
1902 s = PyString_AS_STRING(repr);
1903 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1904 Py_DECREF(repr);
1905 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1907 return repr;
1910 #endif /* MS_WIN32 */
1912 /* --- Character Mapping Codec -------------------------------------------- */
1914 static
1915 int charmap_decoding_error(const char **source,
1916 Py_UNICODE **dest,
1917 const char *errors,
1918 const char *details)
1920 if ((errors == NULL) ||
1921 (strcmp(errors,"strict") == 0)) {
1922 PyErr_Format(PyExc_UnicodeError,
1923 "charmap decoding error: %.400s",
1924 details);
1925 return -1;
1927 else if (strcmp(errors,"ignore") == 0) {
1928 return 0;
1930 else if (strcmp(errors,"replace") == 0) {
1931 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1932 (*dest)++;
1933 return 0;
1935 else {
1936 PyErr_Format(PyExc_ValueError,
1937 "charmap decoding error; "
1938 "unknown error handling code: %.400s",
1939 errors);
1940 return -1;
1944 PyObject *PyUnicode_DecodeCharmap(const char *s,
1945 int size,
1946 PyObject *mapping,
1947 const char *errors)
1949 PyUnicodeObject *v;
1950 Py_UNICODE *p;
1952 /* Default to Latin-1 */
1953 if (mapping == NULL)
1954 return PyUnicode_DecodeLatin1(s, size, errors);
1956 v = _PyUnicode_New(size);
1957 if (v == NULL)
1958 goto onError;
1959 if (size == 0)
1960 return (PyObject *)v;
1961 p = PyUnicode_AS_UNICODE(v);
1962 while (size-- > 0) {
1963 unsigned char ch = *s++;
1964 PyObject *w, *x;
1966 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1967 w = PyInt_FromLong((long)ch);
1968 if (w == NULL)
1969 goto onError;
1970 x = PyObject_GetItem(mapping, w);
1971 Py_DECREF(w);
1972 if (x == NULL) {
1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1974 /* No mapping found: default to Latin-1 mapping */
1975 PyErr_Clear();
1976 *p++ = (Py_UNICODE)ch;
1977 continue;
1979 goto onError;
1982 /* Apply mapping */
1983 if (PyInt_Check(x)) {
1984 long value = PyInt_AS_LONG(x);
1985 if (value < 0 || value > 65535) {
1986 PyErr_SetString(PyExc_TypeError,
1987 "character mapping must be in range(65536)");
1988 Py_DECREF(x);
1989 goto onError;
1991 *p++ = (Py_UNICODE)value;
1993 else if (x == Py_None) {
1994 /* undefined mapping */
1995 if (charmap_decoding_error(&s, &p, errors,
1996 "character maps to <undefined>")) {
1997 Py_DECREF(x);
1998 goto onError;
2001 else if (PyUnicode_Check(x)) {
2002 if (PyUnicode_GET_SIZE(x) != 1) {
2003 /* 1-n mapping */
2004 PyErr_SetString(PyExc_NotImplementedError,
2005 "1-n mappings are currently not implemented");
2006 Py_DECREF(x);
2007 goto onError;
2009 *p++ = *PyUnicode_AS_UNICODE(x);
2011 else {
2012 /* wrong return value */
2013 PyErr_SetString(PyExc_TypeError,
2014 "character mapping must return integer, None or unicode");
2015 Py_DECREF(x);
2016 goto onError;
2018 Py_DECREF(x);
2020 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2021 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2022 goto onError;
2023 return (PyObject *)v;
2025 onError:
2026 Py_XDECREF(v);
2027 return NULL;
2030 static
2031 int charmap_encoding_error(const Py_UNICODE **source,
2032 char **dest,
2033 const char *errors,
2034 const char *details)
2036 if ((errors == NULL) ||
2037 (strcmp(errors,"strict") == 0)) {
2038 PyErr_Format(PyExc_UnicodeError,
2039 "charmap encoding error: %.400s",
2040 details);
2041 return -1;
2043 else if (strcmp(errors,"ignore") == 0) {
2044 return 0;
2046 else if (strcmp(errors,"replace") == 0) {
2047 **dest = '?';
2048 (*dest)++;
2049 return 0;
2051 else {
2052 PyErr_Format(PyExc_ValueError,
2053 "charmap encoding error; "
2054 "unknown error handling code: %.400s",
2055 errors);
2056 return -1;
2060 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2061 int size,
2062 PyObject *mapping,
2063 const char *errors)
2065 PyObject *v;
2066 char *s;
2068 /* Default to Latin-1 */
2069 if (mapping == NULL)
2070 return PyUnicode_EncodeLatin1(p, size, errors);
2072 v = PyString_FromStringAndSize(NULL, size);
2073 if (v == NULL)
2074 return NULL;
2075 if (size == 0)
2076 return v;
2077 s = PyString_AS_STRING(v);
2078 while (size-- > 0) {
2079 Py_UNICODE ch = *p++;
2080 PyObject *w, *x;
2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083 w = PyInt_FromLong((long)ch);
2084 if (w == NULL)
2085 goto onError;
2086 x = PyObject_GetItem(mapping, w);
2087 Py_DECREF(w);
2088 if (x == NULL) {
2089 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2090 /* No mapping found: default to Latin-1 mapping if possible */
2091 PyErr_Clear();
2092 if (ch < 256) {
2093 *s++ = (char)ch;
2094 continue;
2096 else if (!charmap_encoding_error(&p, &s, errors,
2097 "missing character mapping"))
2098 continue;
2100 goto onError;
2103 /* Apply mapping */
2104 if (PyInt_Check(x)) {
2105 long value = PyInt_AS_LONG(x);
2106 if (value < 0 || value > 255) {
2107 PyErr_SetString(PyExc_TypeError,
2108 "character mapping must be in range(256)");
2109 Py_DECREF(x);
2110 goto onError;
2112 *s++ = (char)value;
2114 else if (x == Py_None) {
2115 /* undefined mapping */
2116 if (charmap_encoding_error(&p, &s, errors,
2117 "character maps to <undefined>")) {
2118 Py_DECREF(x);
2119 goto onError;
2122 else if (PyString_Check(x)) {
2123 if (PyString_GET_SIZE(x) != 1) {
2124 /* 1-n mapping */
2125 PyErr_SetString(PyExc_NotImplementedError,
2126 "1-n mappings are currently not implemented");
2127 Py_DECREF(x);
2128 goto onError;
2130 *s++ = *PyString_AS_STRING(x);
2132 else {
2133 /* wrong return value */
2134 PyErr_SetString(PyExc_TypeError,
2135 "character mapping must return integer, None or unicode");
2136 Py_DECREF(x);
2137 goto onError;
2139 Py_DECREF(x);
2141 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2142 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2143 goto onError;
2144 return v;
2146 onError:
2147 Py_DECREF(v);
2148 return NULL;
2151 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2152 PyObject *mapping)
2154 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2155 PyErr_BadArgument();
2156 return NULL;
2158 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2159 PyUnicode_GET_SIZE(unicode),
2160 mapping,
2161 NULL);
2164 static
2165 int translate_error(const Py_UNICODE **source,
2166 Py_UNICODE **dest,
2167 const char *errors,
2168 const char *details)
2170 if ((errors == NULL) ||
2171 (strcmp(errors,"strict") == 0)) {
2172 PyErr_Format(PyExc_UnicodeError,
2173 "translate error: %.400s",
2174 details);
2175 return -1;
2177 else if (strcmp(errors,"ignore") == 0) {
2178 return 0;
2180 else if (strcmp(errors,"replace") == 0) {
2181 **dest = '?';
2182 (*dest)++;
2183 return 0;
2185 else {
2186 PyErr_Format(PyExc_ValueError,
2187 "translate error; "
2188 "unknown error handling code: %.400s",
2189 errors);
2190 return -1;
2194 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2195 int size,
2196 PyObject *mapping,
2197 const char *errors)
2199 PyUnicodeObject *v;
2200 Py_UNICODE *p;
2202 if (mapping == NULL) {
2203 PyErr_BadArgument();
2204 return NULL;
2207 /* Output will never be longer than input */
2208 v = _PyUnicode_New(size);
2209 if (v == NULL)
2210 goto onError;
2211 if (size == 0)
2212 goto done;
2213 p = PyUnicode_AS_UNICODE(v);
2214 while (size-- > 0) {
2215 Py_UNICODE ch = *s++;
2216 PyObject *w, *x;
2218 /* Get mapping */
2219 w = PyInt_FromLong(ch);
2220 if (w == NULL)
2221 goto onError;
2222 x = PyObject_GetItem(mapping, w);
2223 Py_DECREF(w);
2224 if (x == NULL) {
2225 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2226 /* No mapping found: default to 1-1 mapping */
2227 PyErr_Clear();
2228 *p++ = ch;
2229 continue;
2231 goto onError;
2234 /* Apply mapping */
2235 if (PyInt_Check(x))
2236 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2237 else if (x == Py_None) {
2238 /* undefined mapping */
2239 if (translate_error(&s, &p, errors,
2240 "character maps to <undefined>")) {
2241 Py_DECREF(x);
2242 goto onError;
2245 else if (PyUnicode_Check(x)) {
2246 if (PyUnicode_GET_SIZE(x) != 1) {
2247 /* 1-n mapping */
2248 PyErr_SetString(PyExc_NotImplementedError,
2249 "1-n mappings are currently not implemented");
2250 Py_DECREF(x);
2251 goto onError;
2253 *p++ = *PyUnicode_AS_UNICODE(x);
2255 else {
2256 /* wrong return value */
2257 PyErr_SetString(PyExc_TypeError,
2258 "translate mapping must return integer, None or unicode");
2259 Py_DECREF(x);
2260 goto onError;
2262 Py_DECREF(x);
2264 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2265 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2266 goto onError;
2268 done:
2269 return (PyObject *)v;
2271 onError:
2272 Py_XDECREF(v);
2273 return NULL;
2276 PyObject *PyUnicode_Translate(PyObject *str,
2277 PyObject *mapping,
2278 const char *errors)
2280 PyObject *result;
2282 str = PyUnicode_FromObject(str);
2283 if (str == NULL)
2284 goto onError;
2285 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2286 PyUnicode_GET_SIZE(str),
2287 mapping,
2288 errors);
2289 Py_DECREF(str);
2290 return result;
2292 onError:
2293 Py_XDECREF(str);
2294 return NULL;
2297 /* --- Decimal Encoder ---------------------------------------------------- */
2299 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2300 int length,
2301 char *output,
2302 const char *errors)
2304 Py_UNICODE *p, *end;
2306 if (output == NULL) {
2307 PyErr_BadArgument();
2308 return -1;
2311 p = s;
2312 end = s + length;
2313 while (p < end) {
2314 register Py_UNICODE ch = *p++;
2315 int decimal;
2317 if (Py_UNICODE_ISSPACE(ch)) {
2318 *output++ = ' ';
2319 continue;
2321 decimal = Py_UNICODE_TODECIMAL(ch);
2322 if (decimal >= 0) {
2323 *output++ = '0' + decimal;
2324 continue;
2326 if (0 < ch && ch < 256) {
2327 *output++ = (char)ch;
2328 continue;
2330 /* All other characters are considered invalid */
2331 if (errors == NULL || strcmp(errors, "strict") == 0) {
2332 PyErr_SetString(PyExc_ValueError,
2333 "invalid decimal Unicode string");
2334 goto onError;
2336 else if (strcmp(errors, "ignore") == 0)
2337 continue;
2338 else if (strcmp(errors, "replace") == 0) {
2339 *output++ = '?';
2340 continue;
2343 /* 0-terminate the output string */
2344 *output++ = '\0';
2345 return 0;
2347 onError:
2348 return -1;
2351 /* --- Helpers ------------------------------------------------------------ */
2353 static
2354 int count(PyUnicodeObject *self,
2355 int start,
2356 int end,
2357 PyUnicodeObject *substring)
2359 int count = 0;
2361 if (substring->length == 0)
2362 return (end - start + 1);
2364 end -= substring->length;
2366 while (start <= end)
2367 if (Py_UNICODE_MATCH(self, start, substring)) {
2368 count++;
2369 start += substring->length;
2370 } else
2371 start++;
2373 return count;
2376 int PyUnicode_Count(PyObject *str,
2377 PyObject *substr,
2378 int start,
2379 int end)
2381 int result;
2383 str = PyUnicode_FromObject(str);
2384 if (str == NULL)
2385 return -1;
2386 substr = PyUnicode_FromObject(substr);
2387 if (substr == NULL) {
2388 Py_DECREF(str);
2389 return -1;
2392 result = count((PyUnicodeObject *)str,
2393 start, end,
2394 (PyUnicodeObject *)substr);
2396 Py_DECREF(str);
2397 Py_DECREF(substr);
2398 return result;
2401 static
2402 int findstring(PyUnicodeObject *self,
2403 PyUnicodeObject *substring,
2404 int start,
2405 int end,
2406 int direction)
2408 if (start < 0)
2409 start += self->length;
2410 if (start < 0)
2411 start = 0;
2413 if (substring->length == 0)
2414 return start;
2416 if (end > self->length)
2417 end = self->length;
2418 if (end < 0)
2419 end += self->length;
2420 if (end < 0)
2421 end = 0;
2423 end -= substring->length;
2425 if (direction < 0) {
2426 for (; end >= start; end--)
2427 if (Py_UNICODE_MATCH(self, end, substring))
2428 return end;
2429 } else {
2430 for (; start <= end; start++)
2431 if (Py_UNICODE_MATCH(self, start, substring))
2432 return start;
2435 return -1;
2438 int PyUnicode_Find(PyObject *str,
2439 PyObject *substr,
2440 int start,
2441 int end,
2442 int direction)
2444 int result;
2446 str = PyUnicode_FromObject(str);
2447 if (str == NULL)
2448 return -1;
2449 substr = PyUnicode_FromObject(substr);
2450 if (substr == NULL) {
2451 Py_DECREF(substr);
2452 return -1;
2455 result = findstring((PyUnicodeObject *)str,
2456 (PyUnicodeObject *)substr,
2457 start, end, direction);
2458 Py_DECREF(str);
2459 Py_DECREF(substr);
2460 return result;
2463 static
2464 int tailmatch(PyUnicodeObject *self,
2465 PyUnicodeObject *substring,
2466 int start,
2467 int end,
2468 int direction)
2470 if (start < 0)
2471 start += self->length;
2472 if (start < 0)
2473 start = 0;
2475 if (substring->length == 0)
2476 return 1;
2478 if (end > self->length)
2479 end = self->length;
2480 if (end < 0)
2481 end += self->length;
2482 if (end < 0)
2483 end = 0;
2485 end -= substring->length;
2486 if (end < start)
2487 return 0;
2489 if (direction > 0) {
2490 if (Py_UNICODE_MATCH(self, end, substring))
2491 return 1;
2492 } else {
2493 if (Py_UNICODE_MATCH(self, start, substring))
2494 return 1;
2497 return 0;
2500 int PyUnicode_Tailmatch(PyObject *str,
2501 PyObject *substr,
2502 int start,
2503 int end,
2504 int direction)
2506 int result;
2508 str = PyUnicode_FromObject(str);
2509 if (str == NULL)
2510 return -1;
2511 substr = PyUnicode_FromObject(substr);
2512 if (substr == NULL) {
2513 Py_DECREF(substr);
2514 return -1;
2517 result = tailmatch((PyUnicodeObject *)str,
2518 (PyUnicodeObject *)substr,
2519 start, end, direction);
2520 Py_DECREF(str);
2521 Py_DECREF(substr);
2522 return result;
2525 static
2526 const Py_UNICODE *findchar(const Py_UNICODE *s,
2527 int size,
2528 Py_UNICODE ch)
2530 /* like wcschr, but doesn't stop at NULL characters */
2532 while (size-- > 0) {
2533 if (*s == ch)
2534 return s;
2535 s++;
2538 return NULL;
2541 /* Apply fixfct filter to the Unicode object self and return a
2542 reference to the modified object */
2544 static
2545 PyObject *fixup(PyUnicodeObject *self,
2546 int (*fixfct)(PyUnicodeObject *s))
2549 PyUnicodeObject *u;
2551 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2552 self->length);
2553 if (u == NULL)
2554 return NULL;
2555 if (!fixfct(u)) {
2556 /* fixfct should return TRUE if it modified the buffer. If
2557 FALSE, return a reference to the original buffer instead
2558 (to save space, not time) */
2559 Py_INCREF(self);
2560 Py_DECREF(u);
2561 return (PyObject*) self;
2563 return (PyObject*) u;
2566 static
2567 int fixupper(PyUnicodeObject *self)
2569 int len = self->length;
2570 Py_UNICODE *s = self->str;
2571 int status = 0;
2573 while (len-- > 0) {
2574 register Py_UNICODE ch;
2576 ch = Py_UNICODE_TOUPPER(*s);
2577 if (ch != *s) {
2578 status = 1;
2579 *s = ch;
2581 s++;
2584 return status;
2587 static
2588 int fixlower(PyUnicodeObject *self)
2590 int len = self->length;
2591 Py_UNICODE *s = self->str;
2592 int status = 0;
2594 while (len-- > 0) {
2595 register Py_UNICODE ch;
2597 ch = Py_UNICODE_TOLOWER(*s);
2598 if (ch != *s) {
2599 status = 1;
2600 *s = ch;
2602 s++;
2605 return status;
2608 static
2609 int fixswapcase(PyUnicodeObject *self)
2611 int len = self->length;
2612 Py_UNICODE *s = self->str;
2613 int status = 0;
2615 while (len-- > 0) {
2616 if (Py_UNICODE_ISUPPER(*s)) {
2617 *s = Py_UNICODE_TOLOWER(*s);
2618 status = 1;
2619 } else if (Py_UNICODE_ISLOWER(*s)) {
2620 *s = Py_UNICODE_TOUPPER(*s);
2621 status = 1;
2623 s++;
2626 return status;
2629 static
2630 int fixcapitalize(PyUnicodeObject *self)
2632 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2633 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2634 return 1;
2636 return 0;
2639 static
2640 int fixtitle(PyUnicodeObject *self)
2642 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2643 register Py_UNICODE *e;
2644 int previous_is_cased;
2646 /* Shortcut for single character strings */
2647 if (PyUnicode_GET_SIZE(self) == 1) {
2648 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2649 if (*p != ch) {
2650 *p = ch;
2651 return 1;
2653 else
2654 return 0;
2657 e = p + PyUnicode_GET_SIZE(self);
2658 previous_is_cased = 0;
2659 for (; p < e; p++) {
2660 register const Py_UNICODE ch = *p;
2662 if (previous_is_cased)
2663 *p = Py_UNICODE_TOLOWER(ch);
2664 else
2665 *p = Py_UNICODE_TOTITLE(ch);
2667 if (Py_UNICODE_ISLOWER(ch) ||
2668 Py_UNICODE_ISUPPER(ch) ||
2669 Py_UNICODE_ISTITLE(ch))
2670 previous_is_cased = 1;
2671 else
2672 previous_is_cased = 0;
2674 return 1;
2677 PyObject *PyUnicode_Join(PyObject *separator,
2678 PyObject *seq)
2680 Py_UNICODE *sep;
2681 int seplen;
2682 PyUnicodeObject *res = NULL;
2683 int reslen = 0;
2684 Py_UNICODE *p;
2685 int seqlen = 0;
2686 int sz = 100;
2687 int i;
2689 seqlen = PySequence_Size(seq);
2690 if (seqlen < 0 && PyErr_Occurred())
2691 return NULL;
2693 if (separator == NULL) {
2694 Py_UNICODE blank = ' ';
2695 sep = &blank;
2696 seplen = 1;
2698 else {
2699 separator = PyUnicode_FromObject(separator);
2700 if (separator == NULL)
2701 return NULL;
2702 sep = PyUnicode_AS_UNICODE(separator);
2703 seplen = PyUnicode_GET_SIZE(separator);
2706 res = _PyUnicode_New(sz);
2707 if (res == NULL)
2708 goto onError;
2709 p = PyUnicode_AS_UNICODE(res);
2710 reslen = 0;
2712 for (i = 0; i < seqlen; i++) {
2713 int itemlen;
2714 PyObject *item;
2716 item = PySequence_GetItem(seq, i);
2717 if (item == NULL)
2718 goto onError;
2719 if (!PyUnicode_Check(item)) {
2720 PyObject *v;
2721 v = PyUnicode_FromObject(item);
2722 Py_DECREF(item);
2723 item = v;
2724 if (item == NULL)
2725 goto onError;
2727 itemlen = PyUnicode_GET_SIZE(item);
2728 while (reslen + itemlen + seplen >= sz) {
2729 if (_PyUnicode_Resize(res, sz*2))
2730 goto onError;
2731 sz *= 2;
2732 p = PyUnicode_AS_UNICODE(res) + reslen;
2734 if (i > 0) {
2735 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2736 p += seplen;
2737 reslen += seplen;
2739 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2740 p += itemlen;
2741 reslen += itemlen;
2742 Py_DECREF(item);
2744 if (_PyUnicode_Resize(res, reslen))
2745 goto onError;
2747 Py_XDECREF(separator);
2748 return (PyObject *)res;
2750 onError:
2751 Py_XDECREF(separator);
2752 Py_DECREF(res);
2753 return NULL;
2756 static
2757 PyUnicodeObject *pad(PyUnicodeObject *self,
2758 int left,
2759 int right,
2760 Py_UNICODE fill)
2762 PyUnicodeObject *u;
2764 if (left < 0)
2765 left = 0;
2766 if (right < 0)
2767 right = 0;
2769 if (left == 0 && right == 0) {
2770 Py_INCREF(self);
2771 return self;
2774 u = _PyUnicode_New(left + self->length + right);
2775 if (u) {
2776 if (left)
2777 Py_UNICODE_FILL(u->str, fill, left);
2778 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2779 if (right)
2780 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2783 return u;
/* Append the slice data[left:right] to list as a new Unicode object.

   Relies on names from the expanding function: the local variable
   str, the list object list and the label onError, which is jumped to
   when creating or appending the slice fails.  Arguments are
   parenthesized so that expressions can be passed safely; each
   argument may be evaluated more than once.  Use it as a statement
   followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);
2797 static
2798 PyObject *split_whitespace(PyUnicodeObject *self,
2799 PyObject *list,
2800 int maxcount)
2802 register int i;
2803 register int j;
2804 int len = self->length;
2805 PyObject *str;
2807 for (i = j = 0; i < len; ) {
2808 /* find a token */
2809 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2810 i++;
2811 j = i;
2812 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2813 i++;
2814 if (j < i) {
2815 if (maxcount-- <= 0)
2816 break;
2817 SPLIT_APPEND(self->str, j, i);
2818 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2819 i++;
2820 j = i;
2823 if (j < len) {
2824 SPLIT_APPEND(self->str, j, len);
2826 return list;
2828 onError:
2829 Py_DECREF(list);
2830 return NULL;
2833 PyObject *PyUnicode_Splitlines(PyObject *string,
2834 int keepends)
2836 register int i;
2837 register int j;
2838 int len;
2839 PyObject *list;
2840 PyObject *str;
2841 Py_UNICODE *data;
2843 string = PyUnicode_FromObject(string);
2844 if (string == NULL)
2845 return NULL;
2846 data = PyUnicode_AS_UNICODE(string);
2847 len = PyUnicode_GET_SIZE(string);
2849 list = PyList_New(0);
2850 if (!list)
2851 goto onError;
2853 for (i = j = 0; i < len; ) {
2854 int eol;
2856 /* Find a line and append it */
2857 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2858 i++;
2860 /* Skip the line break reading CRLF as one line break */
2861 eol = i;
2862 if (i < len) {
2863 if (data[i] == '\r' && i + 1 < len &&
2864 data[i+1] == '\n')
2865 i += 2;
2866 else
2867 i++;
2868 if (keepends)
2869 eol = i;
2871 SPLIT_APPEND(data, j, eol);
2872 j = i;
2874 if (j < len) {
2875 SPLIT_APPEND(data, j, len);
2878 Py_DECREF(string);
2879 return list;
2881 onError:
2882 Py_DECREF(list);
2883 Py_DECREF(string);
2884 return NULL;
2887 static
2888 PyObject *split_char(PyUnicodeObject *self,
2889 PyObject *list,
2890 Py_UNICODE ch,
2891 int maxcount)
2893 register int i;
2894 register int j;
2895 int len = self->length;
2896 PyObject *str;
2898 for (i = j = 0; i < len; ) {
2899 if (self->str[i] == ch) {
2900 if (maxcount-- <= 0)
2901 break;
2902 SPLIT_APPEND(self->str, j, i);
2903 i = j = i + 1;
2904 } else
2905 i++;
2907 if (j <= len) {
2908 SPLIT_APPEND(self->str, j, len);
2910 return list;
2912 onError:
2913 Py_DECREF(list);
2914 return NULL;
2917 static
2918 PyObject *split_substring(PyUnicodeObject *self,
2919 PyObject *list,
2920 PyUnicodeObject *substring,
2921 int maxcount)
2923 register int i;
2924 register int j;
2925 int len = self->length;
2926 int sublen = substring->length;
2927 PyObject *str;
2929 for (i = j = 0; i < len - sublen; ) {
2930 if (Py_UNICODE_MATCH(self, i, substring)) {
2931 if (maxcount-- <= 0)
2932 break;
2933 SPLIT_APPEND(self->str, j, i);
2934 i = j = i + sublen;
2935 } else
2936 i++;
2938 if (j <= len) {
2939 SPLIT_APPEND(self->str, j, len);
2941 return list;
2943 onError:
2944 Py_DECREF(list);
2945 return NULL;
2948 #undef SPLIT_APPEND
2950 static
2951 PyObject *split(PyUnicodeObject *self,
2952 PyUnicodeObject *substring,
2953 int maxcount)
2955 PyObject *list;
2957 if (maxcount < 0)
2958 maxcount = INT_MAX;
2960 list = PyList_New(0);
2961 if (!list)
2962 return NULL;
2964 if (substring == NULL)
2965 return split_whitespace(self,list,maxcount);
2967 else if (substring->length == 1)
2968 return split_char(self,list,substring->str[0],maxcount);
2970 else if (substring->length == 0) {
2971 Py_DECREF(list);
2972 PyErr_SetString(PyExc_ValueError, "empty separator");
2973 return NULL;
2975 else
2976 return split_substring(self,list,substring,maxcount);
2979 static
2980 PyObject *strip(PyUnicodeObject *self,
2981 int left,
2982 int right)
2984 Py_UNICODE *p = self->str;
2985 int start = 0;
2986 int end = self->length;
2988 if (left)
2989 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2990 start++;
2992 if (right)
2993 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2994 end--;
2996 if (start == 0 && end == self->length) {
2997 /* couldn't strip anything off, return original string */
2998 Py_INCREF(self);
2999 return (PyObject*) self;
3002 return (PyObject*) PyUnicode_FromUnicode(
3003 self->str + start,
3004 end - start
3008 static
3009 PyObject *replace(PyUnicodeObject *self,
3010 PyUnicodeObject *str1,
3011 PyUnicodeObject *str2,
3012 int maxcount)
3014 PyUnicodeObject *u;
3016 if (maxcount < 0)
3017 maxcount = INT_MAX;
3019 if (str1->length == 1 && str2->length == 1) {
3020 int i;
3022 /* replace characters */
3023 if (!findchar(self->str, self->length, str1->str[0])) {
3024 /* nothing to replace, return original string */
3025 Py_INCREF(self);
3026 u = self;
3027 } else {
3028 Py_UNICODE u1 = str1->str[0];
3029 Py_UNICODE u2 = str2->str[0];
3031 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3032 self->str,
3033 self->length
3035 if (u)
3036 for (i = 0; i < u->length; i++)
3037 if (u->str[i] == u1) {
3038 if (--maxcount < 0)
3039 break;
3040 u->str[i] = u2;
3044 } else {
3045 int n, i;
3046 Py_UNICODE *p;
3048 /* replace strings */
3049 n = count(self, 0, self->length, str1);
3050 if (n > maxcount)
3051 n = maxcount;
3052 if (n == 0) {
3053 /* nothing to replace, return original string */
3054 Py_INCREF(self);
3055 u = self;
3056 } else {
3057 u = _PyUnicode_New(
3058 self->length + n * (str2->length - str1->length));
3059 if (u) {
3060 i = 0;
3061 p = u->str;
3062 while (i <= self->length - str1->length)
3063 if (Py_UNICODE_MATCH(self, i, str1)) {
3064 /* replace string segment */
3065 Py_UNICODE_COPY(p, str2->str, str2->length);
3066 p += str2->length;
3067 i += str1->length;
3068 if (--n <= 0) {
3069 /* copy remaining part */
3070 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3071 break;
3073 } else
3074 *p++ = self->str[i++];
3079 return (PyObject *) u;
3082 /* --- Unicode Object Methods --------------------------------------------- */
3084 static char title__doc__[] =
3085 "S.title() -> unicode\n\
3087 Return a titlecased version of S, i.e. words start with title case\n\
3088 characters, all remaining cased characters have lower case.";
3090 static PyObject*
3091 unicode_title(PyUnicodeObject *self, PyObject *args)
3093 if (!PyArg_NoArgs(args))
3094 return NULL;
3095 return fixup(self, fixtitle);
3098 static char capitalize__doc__[] =
3099 "S.capitalize() -> unicode\n\
3101 Return a capitalized version of S, i.e. make the first character\n\
3102 have upper case.";
3104 static PyObject*
3105 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3107 if (!PyArg_NoArgs(args))
3108 return NULL;
3109 return fixup(self, fixcapitalize);
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (compiled out): splits self on whitespace, replaces
   every word in the list by its fixup(..., fixcapitalize) result and
   joins the list again with PyUnicode_Join(NULL, list).  Returns NULL
   on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

 done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3153 static char center__doc__[] =
3154 "S.center(width) -> unicode\n\
3156 Return S centered in a Unicode string of length width. Padding is done\n\
3157 using spaces.";
3159 static PyObject *
3160 unicode_center(PyUnicodeObject *self, PyObject *args)
3162 int marg, left;
3163 int width;
3165 if (!PyArg_ParseTuple(args, "i:center", &width))
3166 return NULL;
3168 if (self->length >= width) {
3169 Py_INCREF(self);
3170 return (PyObject*) self;
3173 marg = width - self->length;
3174 left = marg / 2 + (marg & width & 1);
3176 return (PyObject*) pad(self, left, marg - left, ' ');
3179 #if 0
3181 /* This code should go into some future Unicode collation support
3182 module. The basic comparison should compare ordinals on a naive
3183 basis (this is what Java does and thus JPython too). */
3185 /* speedy UTF-16 code point order comparison */
3186 /* gleaned from: */
3187 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3189 static short utf16Fixup[32] =
3191 0, 0, 0, 0, 0, 0, 0, 0,
3192 0, 0, 0, 0, 0, 0, 0, 0,
3193 0, 0, 0, 0, 0, 0, 0, 0,
3194 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3197 static int
3198 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3200 int len1, len2;
3202 Py_UNICODE *s1 = str1->str;
3203 Py_UNICODE *s2 = str2->str;
3205 len1 = str1->length;
3206 len2 = str2->length;
3208 while (len1 > 0 && len2 > 0) {
3209 Py_UNICODE c1, c2;
3210 long diff;
3212 c1 = *s1++;
3213 c2 = *s2++;
3214 if (c1 > (1<<11) * 26)
3215 c1 += utf16Fixup[c1>>11];
3216 if (c2 > (1<<11) * 26)
3217 c2 += utf16Fixup[c2>>11];
3219 /* now c1 and c2 are in UTF-32-compatible order */
3220 diff = (long)c1 - (long)c2;
3221 if (diff)
3222 return (diff < 0) ? -1 : (diff != 0);
3223 len1--; len2--;
3226 return (len1 < len2) ? -1 : (len1 != len2);
3229 #else
3231 static int
3232 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3234 register int len1, len2;
3236 Py_UNICODE *s1 = str1->str;
3237 Py_UNICODE *s2 = str2->str;
3239 len1 = str1->length;
3240 len2 = str2->length;
3242 while (len1 > 0 && len2 > 0) {
3243 register long diff;
3245 diff = (long)*s1++ - (long)*s2++;
3246 if (diff)
3247 return (diff < 0) ? -1 : (diff != 0);
3248 len1--; len2--;
3251 return (len1 < len2) ? -1 : (len1 != len2);
3254 #endif
3256 int PyUnicode_Compare(PyObject *left,
3257 PyObject *right)
3259 PyUnicodeObject *u = NULL, *v = NULL;
3260 int result;
3262 /* Coerce the two arguments */
3263 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3264 if (u == NULL)
3265 goto onError;
3266 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3267 if (v == NULL)
3268 goto onError;
3270 /* Shortcut for empty or interned objects */
3271 if (v == u) {
3272 Py_DECREF(u);
3273 Py_DECREF(v);
3274 return 0;
3277 result = unicode_compare(u, v);
3279 Py_DECREF(u);
3280 Py_DECREF(v);
3281 return result;
3283 onError:
3284 Py_XDECREF(u);
3285 Py_XDECREF(v);
3286 return -1;
3289 int PyUnicode_Contains(PyObject *container,
3290 PyObject *element)
3292 PyUnicodeObject *u = NULL, *v = NULL;
3293 int result;
3294 register const Py_UNICODE *p, *e;
3295 register Py_UNICODE ch;
3297 /* Coerce the two arguments */
3298 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3299 if (v == NULL) {
3300 PyErr_SetString(PyExc_TypeError,
3301 "'in <string>' requires character as left operand");
3302 goto onError;
3304 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3305 if (u == NULL) {
3306 Py_DECREF(v);
3307 goto onError;
3310 /* Check v in u */
3311 if (PyUnicode_GET_SIZE(v) != 1) {
3312 PyErr_SetString(PyExc_TypeError,
3313 "'in <string>' requires character as left operand");
3314 goto onError;
3316 ch = *PyUnicode_AS_UNICODE(v);
3317 p = PyUnicode_AS_UNICODE(u);
3318 e = p + PyUnicode_GET_SIZE(u);
3319 result = 0;
3320 while (p < e) {
3321 if (*p++ == ch) {
3322 result = 1;
3323 break;
3327 Py_DECREF(u);
3328 Py_DECREF(v);
3329 return result;
3331 onError:
3332 Py_XDECREF(u);
3333 Py_XDECREF(v);
3334 return -1;
3337 /* Concat to string or Unicode object giving a new Unicode object. */
3339 PyObject *PyUnicode_Concat(PyObject *left,
3340 PyObject *right)
3342 PyUnicodeObject *u = NULL, *v = NULL, *w;
3344 /* Coerce the two arguments */
3345 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3346 if (u == NULL)
3347 goto onError;
3348 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3349 if (v == NULL)
3350 goto onError;
3352 /* Shortcuts */
3353 if (v == unicode_empty) {
3354 Py_DECREF(v);
3355 return (PyObject *)u;
3357 if (u == unicode_empty) {
3358 Py_DECREF(u);
3359 return (PyObject *)v;
3362 /* Concat the two Unicode strings */
3363 w = _PyUnicode_New(u->length + v->length);
3364 if (w == NULL)
3365 goto onError;
3366 Py_UNICODE_COPY(w->str, u->str, u->length);
3367 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3369 Py_DECREF(u);
3370 Py_DECREF(v);
3371 return (PyObject *)w;
3373 onError:
3374 Py_XDECREF(u);
3375 Py_XDECREF(v);
3376 return NULL;
3379 static char count__doc__[] =
3380 "S.count(sub[, start[, end]]) -> int\n\
3382 Return the number of occurrences of substring sub in Unicode string\n\
3383 S[start:end]. Optional arguments start and end are\n\
3384 interpreted as in slice notation.";
3386 static PyObject *
3387 unicode_count(PyUnicodeObject *self, PyObject *args)
3389 PyUnicodeObject *substring;
3390 int start = 0;
3391 int end = INT_MAX;
3392 PyObject *result;
3394 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3395 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3396 return NULL;
3398 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3399 (PyObject *)substring);
3400 if (substring == NULL)
3401 return NULL;
3403 if (start < 0)
3404 start += self->length;
3405 if (start < 0)
3406 start = 0;
3407 if (end > self->length)
3408 end = self->length;
3409 if (end < 0)
3410 end += self->length;
3411 if (end < 0)
3412 end = 0;
3414 result = PyInt_FromLong((long) count(self, start, end, substring));
3416 Py_DECREF(substring);
3417 return result;
3420 static char encode__doc__[] =
3421 "S.encode([encoding[,errors]]) -> string\n\
3423 Return an encoded string version of S. Default encoding is the current\n\
3424 default string encoding. errors may be given to set a different error\n\
3425 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3426 a ValueError. Other possible values are 'ignore' and 'replace'.";
3428 static PyObject *
3429 unicode_encode(PyUnicodeObject *self, PyObject *args)
3431 char *encoding = NULL;
3432 char *errors = NULL;
3433 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3434 return NULL;
3435 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3438 static char expandtabs__doc__[] =
3439 "S.expandtabs([tabsize]) -> unicode\n\
3441 Return a copy of S where all tab characters are expanded using spaces.\n\
3442 If tabsize is not given, a tab size of 8 characters is assumed.";
3444 static PyObject*
3445 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3447 Py_UNICODE *e;
3448 Py_UNICODE *p;
3449 Py_UNICODE *q;
3450 int i, j;
3451 PyUnicodeObject *u;
3452 int tabsize = 8;
3454 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3455 return NULL;
3457 /* First pass: determine size of output string */
3458 i = j = 0;
3459 e = self->str + self->length;
3460 for (p = self->str; p < e; p++)
3461 if (*p == '\t') {
3462 if (tabsize > 0)
3463 j += tabsize - (j % tabsize);
3465 else {
3466 j++;
3467 if (*p == '\n' || *p == '\r') {
3468 i += j;
3469 j = 0;
3473 /* Second pass: create output string and fill it */
3474 u = _PyUnicode_New(i + j);
3475 if (!u)
3476 return NULL;
3478 j = 0;
3479 q = u->str;
3481 for (p = self->str; p < e; p++)
3482 if (*p == '\t') {
3483 if (tabsize > 0) {
3484 i = tabsize - (j % tabsize);
3485 j += i;
3486 while (i--)
3487 *q++ = ' ';
3490 else {
3491 j++;
3492 *q++ = *p;
3493 if (*p == '\n' || *p == '\r')
3494 j = 0;
3497 return (PyObject*) u;
3500 static char find__doc__[] =
3501 "S.find(sub [,start [,end]]) -> int\n\
3503 Return the lowest index in S where substring sub is found,\n\
3504 such that sub is contained within s[start,end]. Optional\n\
3505 arguments start and end are interpreted as in slice notation.\n\
3507 Return -1 on failure.";
3509 static PyObject *
3510 unicode_find(PyUnicodeObject *self, PyObject *args)
3512 PyUnicodeObject *substring;
3513 int start = 0;
3514 int end = INT_MAX;
3515 PyObject *result;
3517 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3518 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3519 return NULL;
3520 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3521 (PyObject *)substring);
3522 if (substring == NULL)
3523 return NULL;
3525 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3527 Py_DECREF(substring);
3528 return result;
3531 static PyObject *
3532 unicode_getitem(PyUnicodeObject *self, int index)
3534 if (index < 0 || index >= self->length) {
3535 PyErr_SetString(PyExc_IndexError, "string index out of range");
3536 return NULL;
3539 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3542 static long
3543 unicode_hash(PyUnicodeObject *self)
3545 /* Since Unicode objects compare equal to their ASCII string
3546 counterparts, they should use the individual character values
3547 as basis for their hash value. This is needed to assure that
3548 strings and Unicode objects behave in the same way as
3549 dictionary keys. */
3551 register int len;
3552 register Py_UNICODE *p;
3553 register long x;
3555 if (self->hash != -1)
3556 return self->hash;
3557 len = PyUnicode_GET_SIZE(self);
3558 p = PyUnicode_AS_UNICODE(self);
3559 x = *p << 7;
3560 while (--len >= 0)
3561 x = (1000003*x) ^ *p++;
3562 x ^= PyUnicode_GET_SIZE(self);
3563 if (x == -1)
3564 x = -2;
3565 self->hash = x;
3566 return x;
3569 static char index__doc__[] =
3570 "S.index(sub [,start [,end]]) -> int\n\
3572 Like S.find() but raise ValueError when the substring is not found.";
3574 static PyObject *
3575 unicode_index(PyUnicodeObject *self, PyObject *args)
3577 int result;
3578 PyUnicodeObject *substring;
3579 int start = 0;
3580 int end = INT_MAX;
3582 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3584 return NULL;
3586 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3587 (PyObject *)substring);
3588 if (substring == NULL)
3589 return NULL;
3591 result = findstring(self, substring, start, end, 1);
3593 Py_DECREF(substring);
3594 if (result < 0) {
3595 PyErr_SetString(PyExc_ValueError, "substring not found");
3596 return NULL;
3598 return PyInt_FromLong(result);
3601 static char islower__doc__[] =
3602 "S.islower() -> int\n\
3604 Return 1 if all cased characters in S are lowercase and there is\n\
3605 at least one cased character in S, 0 otherwise.";
3607 static PyObject*
3608 unicode_islower(PyUnicodeObject *self, PyObject *args)
3610 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3611 register const Py_UNICODE *e;
3612 int cased;
3614 if (!PyArg_NoArgs(args))
3615 return NULL;
3617 /* Shortcut for single character strings */
3618 if (PyUnicode_GET_SIZE(self) == 1)
3619 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3621 /* Special case for empty strings */
3622 if (PyString_GET_SIZE(self) == 0)
3623 return PyInt_FromLong(0);
3625 e = p + PyUnicode_GET_SIZE(self);
3626 cased = 0;
3627 for (; p < e; p++) {
3628 register const Py_UNICODE ch = *p;
3630 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3631 return PyInt_FromLong(0);
3632 else if (!cased && Py_UNICODE_ISLOWER(ch))
3633 cased = 1;
3635 return PyInt_FromLong(cased);
3638 static char isupper__doc__[] =
3639 "S.isupper() -> int\n\
3641 Return 1 if all cased characters in S are uppercase and there is\n\
3642 at least one cased character in S, 0 otherwise.";
3644 static PyObject*
3645 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3647 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3648 register const Py_UNICODE *e;
3649 int cased;
3651 if (!PyArg_NoArgs(args))
3652 return NULL;
3654 /* Shortcut for single character strings */
3655 if (PyUnicode_GET_SIZE(self) == 1)
3656 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3658 /* Special case for empty strings */
3659 if (PyString_GET_SIZE(self) == 0)
3660 return PyInt_FromLong(0);
3662 e = p + PyUnicode_GET_SIZE(self);
3663 cased = 0;
3664 for (; p < e; p++) {
3665 register const Py_UNICODE ch = *p;
3667 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3668 return PyInt_FromLong(0);
3669 else if (!cased && Py_UNICODE_ISUPPER(ch))
3670 cased = 1;
3672 return PyInt_FromLong(cased);
3675 static char istitle__doc__[] =
3676 "S.istitle() -> int\n\
3678 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3679 may only follow uncased characters and lowercase characters only cased\n\
3680 ones. Return 0 otherwise.";
3682 static PyObject*
3683 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3685 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3686 register const Py_UNICODE *e;
3687 int cased, previous_is_cased;
3689 if (!PyArg_NoArgs(args))
3690 return NULL;
3692 /* Shortcut for single character strings */
3693 if (PyUnicode_GET_SIZE(self) == 1)
3694 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3695 (Py_UNICODE_ISUPPER(*p) != 0));
3697 /* Special case for empty strings */
3698 if (PyString_GET_SIZE(self) == 0)
3699 return PyInt_FromLong(0);
3701 e = p + PyUnicode_GET_SIZE(self);
3702 cased = 0;
3703 previous_is_cased = 0;
3704 for (; p < e; p++) {
3705 register const Py_UNICODE ch = *p;
3707 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3708 if (previous_is_cased)
3709 return PyInt_FromLong(0);
3710 previous_is_cased = 1;
3711 cased = 1;
3713 else if (Py_UNICODE_ISLOWER(ch)) {
3714 if (!previous_is_cased)
3715 return PyInt_FromLong(0);
3716 previous_is_cased = 1;
3717 cased = 1;
3719 else
3720 previous_is_cased = 0;
3722 return PyInt_FromLong(cased);
3725 static char isspace__doc__[] =
3726 "S.isspace() -> int\n\
3728 Return 1 if there are only whitespace characters in S,\n\
3729 0 otherwise.";
3731 static PyObject*
3732 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3734 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3735 register const Py_UNICODE *e;
3737 if (!PyArg_NoArgs(args))
3738 return NULL;
3740 /* Shortcut for single character strings */
3741 if (PyUnicode_GET_SIZE(self) == 1 &&
3742 Py_UNICODE_ISSPACE(*p))
3743 return PyInt_FromLong(1);
3745 /* Special case for empty strings */
3746 if (PyString_GET_SIZE(self) == 0)
3747 return PyInt_FromLong(0);
3749 e = p + PyUnicode_GET_SIZE(self);
3750 for (; p < e; p++) {
3751 if (!Py_UNICODE_ISSPACE(*p))
3752 return PyInt_FromLong(0);
3754 return PyInt_FromLong(1);
3757 static char isalpha__doc__[] =
3758 "S.isalpha() -> int\n\
3760 Return 1 if all characters in S are alphabetic\n\
3761 and there is at least one character in S, 0 otherwise.";
3763 static PyObject*
3764 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3766 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3767 register const Py_UNICODE *e;
3769 if (!PyArg_NoArgs(args))
3770 return NULL;
3772 /* Shortcut for single character strings */
3773 if (PyUnicode_GET_SIZE(self) == 1 &&
3774 Py_UNICODE_ISALPHA(*p))
3775 return PyInt_FromLong(1);
3777 /* Special case for empty strings */
3778 if (PyString_GET_SIZE(self) == 0)
3779 return PyInt_FromLong(0);
3781 e = p + PyUnicode_GET_SIZE(self);
3782 for (; p < e; p++) {
3783 if (!Py_UNICODE_ISALPHA(*p))
3784 return PyInt_FromLong(0);
3786 return PyInt_FromLong(1);
3789 static char isalnum__doc__[] =
3790 "S.isalnum() -> int\n\
3792 Return 1 if all characters in S are alphanumeric\n\
3793 and there is at least one character in S, 0 otherwise.";
3795 static PyObject*
3796 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3798 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3799 register const Py_UNICODE *e;
3801 if (!PyArg_NoArgs(args))
3802 return NULL;
3804 /* Shortcut for single character strings */
3805 if (PyUnicode_GET_SIZE(self) == 1 &&
3806 Py_UNICODE_ISALNUM(*p))
3807 return PyInt_FromLong(1);
3809 /* Special case for empty strings */
3810 if (PyString_GET_SIZE(self) == 0)
3811 return PyInt_FromLong(0);
3813 e = p + PyUnicode_GET_SIZE(self);
3814 for (; p < e; p++) {
3815 if (!Py_UNICODE_ISALNUM(*p))
3816 return PyInt_FromLong(0);
3818 return PyInt_FromLong(1);
3821 static char isdecimal__doc__[] =
3822 "S.isdecimal() -> int\n\
3824 Return 1 if there are only decimal characters in S,\n\
3825 0 otherwise.";
3827 static PyObject*
3828 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3830 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3831 register const Py_UNICODE *e;
3833 if (!PyArg_NoArgs(args))
3834 return NULL;
3836 /* Shortcut for single character strings */
3837 if (PyUnicode_GET_SIZE(self) == 1 &&
3838 Py_UNICODE_ISDECIMAL(*p))
3839 return PyInt_FromLong(1);
3841 /* Special case for empty strings */
3842 if (PyString_GET_SIZE(self) == 0)
3843 return PyInt_FromLong(0);
3845 e = p + PyUnicode_GET_SIZE(self);
3846 for (; p < e; p++) {
3847 if (!Py_UNICODE_ISDECIMAL(*p))
3848 return PyInt_FromLong(0);
3850 return PyInt_FromLong(1);
3853 static char isdigit__doc__[] =
3854 "S.isdigit() -> int\n\
3856 Return 1 if there are only digit characters in S,\n\
3857 0 otherwise.";
3859 static PyObject*
3860 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3862 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3863 register const Py_UNICODE *e;
3865 if (!PyArg_NoArgs(args))
3866 return NULL;
3868 /* Shortcut for single character strings */
3869 if (PyUnicode_GET_SIZE(self) == 1 &&
3870 Py_UNICODE_ISDIGIT(*p))
3871 return PyInt_FromLong(1);
3873 /* Special case for empty strings */
3874 if (PyString_GET_SIZE(self) == 0)
3875 return PyInt_FromLong(0);
3877 e = p + PyUnicode_GET_SIZE(self);
3878 for (; p < e; p++) {
3879 if (!Py_UNICODE_ISDIGIT(*p))
3880 return PyInt_FromLong(0);
3882 return PyInt_FromLong(1);
3885 static char isnumeric__doc__[] =
3886 "S.isnumeric() -> int\n\
3888 Return 1 if there are only numeric characters in S,\n\
3889 0 otherwise.";
3891 static PyObject*
3892 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3894 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3895 register const Py_UNICODE *e;
3897 if (!PyArg_NoArgs(args))
3898 return NULL;
3900 /* Shortcut for single character strings */
3901 if (PyUnicode_GET_SIZE(self) == 1 &&
3902 Py_UNICODE_ISNUMERIC(*p))
3903 return PyInt_FromLong(1);
3905 /* Special case for empty strings */
3906 if (PyString_GET_SIZE(self) == 0)
3907 return PyInt_FromLong(0);
3909 e = p + PyUnicode_GET_SIZE(self);
3910 for (; p < e; p++) {
3911 if (!Py_UNICODE_ISNUMERIC(*p))
3912 return PyInt_FromLong(0);
3914 return PyInt_FromLong(1);
3917 static char join__doc__[] =
3918 "S.join(sequence) -> unicode\n\
3920 Return a string which is the concatenation of the strings in the\n\
3921 sequence. The separator between elements is S.";
3923 static PyObject*
3924 unicode_join(PyUnicodeObject *self, PyObject *args)
3926 PyObject *data;
3927 if (!PyArg_ParseTuple(args, "O:join", &data))
3928 return NULL;
3930 return PyUnicode_Join((PyObject *)self, data);
3933 static int
3934 unicode_length(PyUnicodeObject *self)
3936 return self->length;
3939 static char ljust__doc__[] =
3940 "S.ljust(width) -> unicode\n\
3942 Return S left justified in a Unicode string of length width. Padding is\n\
3943 done using spaces.";
3945 static PyObject *
3946 unicode_ljust(PyUnicodeObject *self, PyObject *args)
3948 int width;
3949 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3950 return NULL;
3952 if (self->length >= width) {
3953 Py_INCREF(self);
3954 return (PyObject*) self;
3957 return (PyObject*) pad(self, 0, width - self->length, ' ');
3960 static char lower__doc__[] =
3961 "S.lower() -> unicode\n\
3963 Return a copy of the string S converted to lowercase.";
3965 static PyObject*
3966 unicode_lower(PyUnicodeObject *self, PyObject *args)
3968 if (!PyArg_NoArgs(args))
3969 return NULL;
3970 return fixup(self, fixlower);
3973 static char lstrip__doc__[] =
3974 "S.lstrip() -> unicode\n\
3976 Return a copy of the string S with leading whitespace removed.";
3978 static PyObject *
3979 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3981 if (!PyArg_NoArgs(args))
3982 return NULL;
3983 return strip(self, 1, 0);
3986 static PyObject*
3987 unicode_repeat(PyUnicodeObject *str, int len)
3989 PyUnicodeObject *u;
3990 Py_UNICODE *p;
3992 if (len < 0)
3993 len = 0;
3995 if (len == 1) {
3996 /* no repeat, return original string */
3997 Py_INCREF(str);
3998 return (PyObject*) str;
4001 u = _PyUnicode_New(len * str->length);
4002 if (!u)
4003 return NULL;
4005 p = u->str;
4007 while (len-- > 0) {
4008 Py_UNICODE_COPY(p, str->str, str->length);
4009 p += str->length;
4012 return (PyObject*) u;
4015 PyObject *PyUnicode_Replace(PyObject *obj,
4016 PyObject *subobj,
4017 PyObject *replobj,
4018 int maxcount)
4020 PyObject *self;
4021 PyObject *str1;
4022 PyObject *str2;
4023 PyObject *result;
4025 self = PyUnicode_FromObject(obj);
4026 if (self == NULL)
4027 return NULL;
4028 str1 = PyUnicode_FromObject(subobj);
4029 if (str1 == NULL) {
4030 Py_DECREF(self);
4031 return NULL;
4033 str2 = PyUnicode_FromObject(replobj);
4034 if (str2 == NULL) {
4035 Py_DECREF(self);
4036 Py_DECREF(str1);
4037 return NULL;
4039 result = replace((PyUnicodeObject *)self,
4040 (PyUnicodeObject *)str1,
4041 (PyUnicodeObject *)str2,
4042 maxcount);
4043 Py_DECREF(self);
4044 Py_DECREF(str1);
4045 Py_DECREF(str2);
4046 return result;
4049 static char replace__doc__[] =
4050 "S.replace (old, new[, maxsplit]) -> unicode\n\
4052 Return a copy of S with all occurrences of substring\n\
4053 old replaced by new. If the optional argument maxsplit is\n\
4054 given, only the first maxsplit occurrences are replaced.";
4056 static PyObject*
4057 unicode_replace(PyUnicodeObject *self, PyObject *args)
4059 PyUnicodeObject *str1;
4060 PyUnicodeObject *str2;
4061 int maxcount = -1;
4062 PyObject *result;
4064 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4065 return NULL;
4066 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4067 if (str1 == NULL)
4068 return NULL;
4069 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4070 if (str2 == NULL)
4071 return NULL;
4073 result = replace(self, str1, str2, maxcount);
4075 Py_DECREF(str1);
4076 Py_DECREF(str2);
4077 return result;
/* Returns the result of unicodeescape_string() applied to the
   character buffer and size of `unicode`.
   NOTE(review): the remaining argument(s) and the closing of this call
   are not visible in this listing -- confirm against the full file
   before editing. */
4080 static
4081 PyObject *unicode_repr(PyObject *unicode)
4083 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4084 PyUnicode_GET_SIZE(unicode),
4088 static char rfind__doc__[] =
4089 "S.rfind(sub [,start [,end]]) -> int\n\
4091 Return the highest index in S where substring sub is found,\n\
4092 such that sub is contained within s[start,end]. Optional\n\
4093 arguments start and end are interpreted as in slice notation.\n\
4095 Return -1 on failure.";
4097 static PyObject *
4098 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4100 PyUnicodeObject *substring;
4101 int start = 0;
4102 int end = INT_MAX;
4103 PyObject *result;
4105 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4106 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4107 return NULL;
4108 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4109 (PyObject *)substring);
4110 if (substring == NULL)
4111 return NULL;
4113 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4115 Py_DECREF(substring);
4116 return result;
4119 static char rindex__doc__[] =
4120 "S.rindex(sub [,start [,end]]) -> int\n\
4122 Like S.rfind() but raise ValueError when the substring is not found.";
4124 static PyObject *
4125 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4127 int result;
4128 PyUnicodeObject *substring;
4129 int start = 0;
4130 int end = INT_MAX;
4132 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4133 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4134 return NULL;
4135 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4136 (PyObject *)substring);
4137 if (substring == NULL)
4138 return NULL;
4140 result = findstring(self, substring, start, end, -1);
4142 Py_DECREF(substring);
4143 if (result < 0) {
4144 PyErr_SetString(PyExc_ValueError, "substring not found");
4145 return NULL;
4147 return PyInt_FromLong(result);
4150 static char rjust__doc__[] =
4151 "S.rjust(width) -> unicode\n\
4153 Return S right justified in a Unicode string of length width. Padding is\n\
4154 done using spaces.";
4156 static PyObject *
4157 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4159 int width;
4160 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4161 return NULL;
4163 if (self->length >= width) {
4164 Py_INCREF(self);
4165 return (PyObject*) self;
4168 return (PyObject*) pad(self, width - self->length, 0, ' ');
4171 static char rstrip__doc__[] =
4172 "S.rstrip() -> unicode\n\
4174 Return a copy of the string S with trailing whitespace removed.";
4176 static PyObject *
4177 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4179 if (!PyArg_NoArgs(args))
4180 return NULL;
4181 return strip(self, 0, 1);
4184 static PyObject*
4185 unicode_slice(PyUnicodeObject *self, int start, int end)
4187 /* standard clamping */
4188 if (start < 0)
4189 start = 0;
4190 if (end < 0)
4191 end = 0;
4192 if (end > self->length)
4193 end = self->length;
4194 if (start == 0 && end == self->length) {
4195 /* full slice, return original string */
4196 Py_INCREF(self);
4197 return (PyObject*) self;
4199 if (start > end)
4200 start = end;
4201 /* copy slice */
4202 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4203 end - start);
4206 PyObject *PyUnicode_Split(PyObject *s,
4207 PyObject *sep,
4208 int maxsplit)
4210 PyObject *result;
4212 s = PyUnicode_FromObject(s);
4213 if (s == NULL)
4214 return NULL;
4215 if (sep != NULL) {
4216 sep = PyUnicode_FromObject(sep);
4217 if (sep == NULL) {
4218 Py_DECREF(s);
4219 return NULL;
4223 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4225 Py_DECREF(s);
4226 Py_XDECREF(sep);
4227 return result;
4230 static char split__doc__[] =
4231 "S.split([sep [,maxsplit]]) -> list of strings\n\
4233 Return a list of the words in S, using sep as the\n\
4234 delimiter string. If maxsplit is given, at most maxsplit\n\
4235 splits are done. If sep is not specified, any whitespace string\n\
4236 is a separator.";
4238 static PyObject*
4239 unicode_split(PyUnicodeObject *self, PyObject *args)
4241 PyObject *substring = Py_None;
4242 int maxcount = -1;
4244 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4245 return NULL;
4247 if (substring == Py_None)
4248 return split(self, NULL, maxcount);
4249 else if (PyUnicode_Check(substring))
4250 return split(self, (PyUnicodeObject *)substring, maxcount);
4251 else
4252 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4255 static char splitlines__doc__[] =
4256 "S.splitlines([keepends]]) -> list of strings\n\
4258 Return a list of the lines in S, breaking at line boundaries.\n\
4259 Line breaks are not included in the resulting list unless keepends\n\
4260 is given and true.";
4262 static PyObject*
4263 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4265 int keepends = 0;
4267 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4268 return NULL;
4270 return PyUnicode_Splitlines((PyObject *)self, keepends);
4273 static
4274 PyObject *unicode_str(PyUnicodeObject *self)
4276 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4279 static char strip__doc__[] =
4280 "S.strip() -> unicode\n\
4282 Return a copy of S with leading and trailing whitespace removed.";
4284 static PyObject *
4285 unicode_strip(PyUnicodeObject *self, PyObject *args)
4287 if (!PyArg_NoArgs(args))
4288 return NULL;
4289 return strip(self, 1, 1);
4292 static char swapcase__doc__[] =
4293 "S.swapcase() -> unicode\n\
4295 Return a copy of S with uppercase characters converted to lowercase\n\
4296 and vice versa.";
4298 static PyObject*
4299 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4301 if (!PyArg_NoArgs(args))
4302 return NULL;
4303 return fixup(self, fixswapcase);
4306 static char translate__doc__[] =
4307 "S.translate(table) -> unicode\n\
4309 Return a copy of the string S, where all characters have been mapped\n\
4310 through the given translation table, which must be a mapping of\n\
4311 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4312 are left untouched. Characters mapped to None are deleted.";
4314 static PyObject*
4315 unicode_translate(PyUnicodeObject *self, PyObject *args)
4317 PyObject *table;
4319 if (!PyArg_ParseTuple(args, "O:translate", &table))
4320 return NULL;
4321 return PyUnicode_TranslateCharmap(self->str,
4322 self->length,
4323 table,
4324 "ignore");
4327 static char upper__doc__[] =
4328 "S.upper() -> unicode\n\
4330 Return a copy of S converted to uppercase.";
4332 static PyObject*
4333 unicode_upper(PyUnicodeObject *self, PyObject *args)
4335 if (!PyArg_NoArgs(args))
4336 return NULL;
4337 return fixup(self, fixupper);
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of S.zfill(width) (currently compiled out).

   Returns self unchanged when it is already at least `width` long.
   Otherwise the string is left-padded with '0' via pad(); if the original
   text started with a sign, the sign is moved in front of the padding. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() presumably returns NULL when it cannot build the result;
       do not touch u->str in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
#if 0
/* Debugging aid (currently compiled out): report the value of
   unicode_freelist_size as a Python integer.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args)
        ? PyInt_FromLong(unicode_freelist_size)
        : NULL;
}
#endif
4386 static char startswith__doc__[] =
4387 "S.startswith(prefix[, start[, end]]) -> int\n\
4389 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4390 optional start, test S beginning at that position. With optional end, stop\n\
4391 comparing S at that position.";
4393 static PyObject *
4394 unicode_startswith(PyUnicodeObject *self,
4395 PyObject *args)
4397 PyUnicodeObject *substring;
4398 int start = 0;
4399 int end = INT_MAX;
4400 PyObject *result;
4402 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4403 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4404 return NULL;
4405 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4406 (PyObject *)substring);
4407 if (substring == NULL)
4408 return NULL;
4410 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4412 Py_DECREF(substring);
4413 return result;
4417 static char endswith__doc__[] =
4418 "S.endswith(suffix[, start[, end]]) -> int\n\
4420 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4421 optional start, test S beginning at that position. With optional end, stop\n\
4422 comparing S at that position.";
4424 static PyObject *
4425 unicode_endswith(PyUnicodeObject *self,
4426 PyObject *args)
4428 PyUnicodeObject *substring;
4429 int start = 0;
4430 int end = INT_MAX;
4431 PyObject *result;
4433 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4434 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4435 return NULL;
4436 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4437 (PyObject *)substring);
4438 if (substring == NULL)
4439 return NULL;
4441 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4443 Py_DECREF(substring);
4444 return result;
4448 static PyMethodDef unicode_methods[] = {
4450 /* Order is according to common usage: often used methods should
4451 appear first, since lookup is done sequentially. */
4453 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4454 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4455 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4456 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4457 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4458 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4459 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4460 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4461 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4462 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4463 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4464 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4465 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4466 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4467 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4468 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4469 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4470 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4471 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4472 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4473 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4474 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4475 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4476 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4477 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4478 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4479 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4480 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4481 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4482 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4483 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4484 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4485 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4486 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4487 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4488 #if 0
4489 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4490 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4491 #endif
4493 #if 0
4494 /* This one is just used for debugging the implementation. */
4495 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4496 #endif
4498 {NULL, NULL}
4501 static PyObject *
4502 unicode_getattr(PyUnicodeObject *self, char *name)
4504 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4507 static PySequenceMethods unicode_as_sequence = {
4508 (inquiry) unicode_length, /* sq_length */
4509 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4510 (intargfunc) unicode_repeat, /* sq_repeat */
4511 (intargfunc) unicode_getitem, /* sq_item */
4512 (intintargfunc) unicode_slice, /* sq_slice */
4513 0, /* sq_ass_item */
4514 0, /* sq_ass_slice */
4515 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4518 static int
4519 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4520 int index,
4521 const void **ptr)
4523 if (index != 0) {
4524 PyErr_SetString(PyExc_SystemError,
4525 "accessing non-existent unicode segment");
4526 return -1;
4528 *ptr = (void *) self->str;
4529 return PyUnicode_GET_DATA_SIZE(self);
4532 static int
4533 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4534 const void **ptr)
4536 PyErr_SetString(PyExc_TypeError,
4537 "cannot use unicode as modifyable buffer");
4538 return -1;
4541 static int
4542 unicode_buffer_getsegcount(PyUnicodeObject *self,
4543 int *lenp)
4545 if (lenp)
4546 *lenp = PyUnicode_GET_DATA_SIZE(self);
4547 return 1;
4550 static int
4551 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4552 int index,
4553 const void **ptr)
4555 PyObject *str;
4557 if (index != 0) {
4558 PyErr_SetString(PyExc_SystemError,
4559 "accessing non-existent unicode segment");
4560 return -1;
4562 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4563 if (str == NULL)
4564 return -1;
4565 *ptr = (void *) PyString_AS_STRING(str);
4566 return PyString_GET_SIZE(str);
4569 /* Helpers for PyUnicode_Format() */
4571 static PyObject *
4572 getnextarg(PyObject *args, int arglen, int *p_argidx)
4574 int argidx = *p_argidx;
4575 if (argidx < arglen) {
4576 (*p_argidx)++;
4577 if (arglen < 0)
4578 return args;
4579 else
4580 return PyTuple_GetItem(args, argidx);
4582 PyErr_SetString(PyExc_TypeError,
4583 "not enough arguments for format string");
4584 return NULL;
/* Bit flags collected from the flag characters of a '%' conversion. */
#define F_LJUST (1 << 0)    /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4593 static
4594 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4596 register int i;
4597 int len;
4598 va_list va;
4599 char *charbuffer;
4600 va_start(va, format);
4602 /* First, format the string as char array, then expand to Py_UNICODE
4603 array. */
4604 charbuffer = (char *)buffer;
4605 len = vsprintf(charbuffer, format, va);
4606 for (i = len - 1; i >= 0; i--)
4607 buffer[i] = (Py_UNICODE) charbuffer[i];
4609 va_end(va);
4610 return len;
4613 static int
4614 formatfloat(Py_UNICODE *buf,
4615 size_t buflen,
4616 int flags,
4617 int prec,
4618 int type,
4619 PyObject *v)
4621 /* fmt = '%#.' + `prec` + `type`
4622 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4623 char fmt[20];
4624 double x;
4626 x = PyFloat_AsDouble(v);
4627 if (x == -1.0 && PyErr_Occurred())
4628 return -1;
4629 if (prec < 0)
4630 prec = 6;
4631 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4632 type = 'g';
4633 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4634 /* worst case length calc to ensure no buffer overrun:
4635 fmt = %#.<prec>g
4636 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4637 for any double rep.)
4638 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4639 If prec=0 the effective precision is 1 (the leading digit is
4640 always given), therefore increase by one to 10+prec. */
4641 if (buflen <= (size_t)10 + (size_t)prec) {
4642 PyErr_SetString(PyExc_OverflowError,
4643 "formatted float is too long (precision too long?)");
4644 return -1;
4646 return usprintf(buf, fmt, x);
4649 static int
4650 formatint(Py_UNICODE *buf,
4651 size_t buflen,
4652 int flags,
4653 int prec,
4654 int type,
4655 PyObject *v)
4657 /* fmt = '%#.' + `prec` + 'l' + `type`
4658 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
4659 char fmt[20];
4660 long x;
4662 x = PyInt_AsLong(v);
4663 if (x == -1 && PyErr_Occurred())
4664 return -1;
4665 if (prec < 0)
4666 prec = 1;
4667 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4668 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4669 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4670 PyErr_SetString(PyExc_OverflowError,
4671 "formatted integer is too long (precision too long?)");
4672 return -1;
4674 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4675 return usprintf(buf, fmt, x);
4678 static int
4679 formatchar(Py_UNICODE *buf,
4680 size_t buflen,
4681 PyObject *v)
4683 /* presume that the buffer is at least 2 characters long */
4684 if (PyUnicode_Check(v)) {
4685 if (PyUnicode_GET_SIZE(v) != 1)
4686 goto onError;
4687 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4690 else if (PyString_Check(v)) {
4691 if (PyString_GET_SIZE(v) != 1)
4692 goto onError;
4693 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4696 else {
4697 /* Integer input truncated to a character */
4698 long x;
4699 x = PyInt_AsLong(v);
4700 if (x == -1 && PyErr_Occurred())
4701 goto onError;
4702 buf[0] = (char) x;
4704 buf[1] = '\0';
4705 return 1;
4707 onError:
4708 PyErr_SetString(PyExc_TypeError,
4709 "%c requires int or char");
4710 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
4723 PyObject *PyUnicode_Format(PyObject *format,
4724 PyObject *args)
4726 Py_UNICODE *fmt, *res;
4727 int fmtcnt, rescnt, reslen, arglen, argidx;
4728 int args_owned = 0;
4729 PyUnicodeObject *result = NULL;
4730 PyObject *dict = NULL;
4731 PyObject *uformat;
4733 if (format == NULL || args == NULL) {
4734 PyErr_BadInternalCall();
4735 return NULL;
4737 uformat = PyUnicode_FromObject(format);
4738 if (uformat == NULL)
4739 return NULL;
4740 fmt = PyUnicode_AS_UNICODE(uformat);
4741 fmtcnt = PyUnicode_GET_SIZE(uformat);
4743 reslen = rescnt = fmtcnt + 100;
4744 result = _PyUnicode_New(reslen);
4745 if (result == NULL)
4746 goto onError;
4747 res = PyUnicode_AS_UNICODE(result);
4749 if (PyTuple_Check(args)) {
4750 arglen = PyTuple_Size(args);
4751 argidx = 0;
4753 else {
4754 arglen = -1;
4755 argidx = -2;
4757 if (args->ob_type->tp_as_mapping)
4758 dict = args;
4760 while (--fmtcnt >= 0) {
4761 if (*fmt != '%') {
4762 if (--rescnt < 0) {
4763 rescnt = fmtcnt + 100;
4764 reslen += rescnt;
4765 if (_PyUnicode_Resize(result, reslen) < 0)
4766 return NULL;
4767 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4768 --rescnt;
4770 *res++ = *fmt++;
4772 else {
4773 /* Got a format specifier */
4774 int flags = 0;
4775 int width = -1;
4776 int prec = -1;
4777 int size = 0;
4778 Py_UNICODE c = '\0';
4779 Py_UNICODE fill;
4780 PyObject *v = NULL;
4781 PyObject *temp = NULL;
4782 Py_UNICODE *pbuf;
4783 Py_UNICODE sign;
4784 int len;
4785 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4787 fmt++;
4788 if (*fmt == '(') {
4789 Py_UNICODE *keystart;
4790 int keylen;
4791 PyObject *key;
4792 int pcount = 1;
4794 if (dict == NULL) {
4795 PyErr_SetString(PyExc_TypeError,
4796 "format requires a mapping");
4797 goto onError;
4799 ++fmt;
4800 --fmtcnt;
4801 keystart = fmt;
4802 /* Skip over balanced parentheses */
4803 while (pcount > 0 && --fmtcnt >= 0) {
4804 if (*fmt == ')')
4805 --pcount;
4806 else if (*fmt == '(')
4807 ++pcount;
4808 fmt++;
4810 keylen = fmt - keystart - 1;
4811 if (fmtcnt < 0 || pcount > 0) {
4812 PyErr_SetString(PyExc_ValueError,
4813 "incomplete format key");
4814 goto onError;
4816 /* keys are converted to strings using UTF-8 and
4817 then looked up since Python uses strings to hold
4818 variables names etc. in its namespaces and we
4819 wouldn't want to break common idioms. */
4820 key = PyUnicode_EncodeUTF8(keystart,
4821 keylen,
4822 NULL);
4823 if (key == NULL)
4824 goto onError;
4825 if (args_owned) {
4826 Py_DECREF(args);
4827 args_owned = 0;
4829 args = PyObject_GetItem(dict, key);
4830 Py_DECREF(key);
4831 if (args == NULL) {
4832 goto onError;
4834 args_owned = 1;
4835 arglen = -1;
4836 argidx = -2;
4838 while (--fmtcnt >= 0) {
4839 switch (c = *fmt++) {
4840 case '-': flags |= F_LJUST; continue;
4841 case '+': flags |= F_SIGN; continue;
4842 case ' ': flags |= F_BLANK; continue;
4843 case '#': flags |= F_ALT; continue;
4844 case '0': flags |= F_ZERO; continue;
4846 break;
4848 if (c == '*') {
4849 v = getnextarg(args, arglen, &argidx);
4850 if (v == NULL)
4851 goto onError;
4852 if (!PyInt_Check(v)) {
4853 PyErr_SetString(PyExc_TypeError,
4854 "* wants int");
4855 goto onError;
4857 width = PyInt_AsLong(v);
4858 if (width < 0) {
4859 flags |= F_LJUST;
4860 width = -width;
4862 if (--fmtcnt >= 0)
4863 c = *fmt++;
4865 else if (c >= '0' && c <= '9') {
4866 width = c - '0';
4867 while (--fmtcnt >= 0) {
4868 c = *fmt++;
4869 if (c < '0' || c > '9')
4870 break;
4871 if ((width*10) / 10 != width) {
4872 PyErr_SetString(PyExc_ValueError,
4873 "width too big");
4874 goto onError;
4876 width = width*10 + (c - '0');
4879 if (c == '.') {
4880 prec = 0;
4881 if (--fmtcnt >= 0)
4882 c = *fmt++;
4883 if (c == '*') {
4884 v = getnextarg(args, arglen, &argidx);
4885 if (v == NULL)
4886 goto onError;
4887 if (!PyInt_Check(v)) {
4888 PyErr_SetString(PyExc_TypeError,
4889 "* wants int");
4890 goto onError;
4892 prec = PyInt_AsLong(v);
4893 if (prec < 0)
4894 prec = 0;
4895 if (--fmtcnt >= 0)
4896 c = *fmt++;
4898 else if (c >= '0' && c <= '9') {
4899 prec = c - '0';
4900 while (--fmtcnt >= 0) {
4901 c = Py_CHARMASK(*fmt++);
4902 if (c < '0' || c > '9')
4903 break;
4904 if ((prec*10) / 10 != prec) {
4905 PyErr_SetString(PyExc_ValueError,
4906 "prec too big");
4907 goto onError;
4909 prec = prec*10 + (c - '0');
4912 } /* prec */
4913 if (fmtcnt >= 0) {
4914 if (c == 'h' || c == 'l' || c == 'L') {
4915 size = c;
4916 if (--fmtcnt >= 0)
4917 c = *fmt++;
4920 if (fmtcnt < 0) {
4921 PyErr_SetString(PyExc_ValueError,
4922 "incomplete format");
4923 goto onError;
4925 if (c != '%') {
4926 v = getnextarg(args, arglen, &argidx);
4927 if (v == NULL)
4928 goto onError;
4930 sign = 0;
4931 fill = ' ';
4932 switch (c) {
4934 case '%':
4935 pbuf = formatbuf;
4936 /* presume that buffer length is at least 1 */
4937 pbuf[0] = '%';
4938 len = 1;
4939 break;
4941 case 's':
4942 case 'r':
4943 if (PyUnicode_Check(v) && c == 's') {
4944 temp = v;
4945 Py_INCREF(temp);
4947 else {
4948 PyObject *unicode;
4949 if (c == 's')
4950 temp = PyObject_Str(v);
4951 else
4952 temp = PyObject_Repr(v);
4953 if (temp == NULL)
4954 goto onError;
4955 if (!PyString_Check(temp)) {
4956 /* XXX Note: this should never happen, since
4957 PyObject_Repr() and PyObject_Str() assure
4958 this */
4959 Py_DECREF(temp);
4960 PyErr_SetString(PyExc_TypeError,
4961 "%s argument has non-string str()");
4962 goto onError;
4964 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
4965 PyString_GET_SIZE(temp),
4966 NULL,
4967 "strict");
4968 Py_DECREF(temp);
4969 temp = unicode;
4970 if (temp == NULL)
4971 goto onError;
4973 pbuf = PyUnicode_AS_UNICODE(temp);
4974 len = PyUnicode_GET_SIZE(temp);
4975 if (prec >= 0 && len > prec)
4976 len = prec;
4977 break;
4979 case 'i':
4980 case 'd':
4981 case 'u':
4982 case 'o':
4983 case 'x':
4984 case 'X':
4985 if (c == 'i')
4986 c = 'd';
4987 pbuf = formatbuf;
4988 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4989 flags, prec, c, v);
4990 if (len < 0)
4991 goto onError;
4992 sign = (c == 'd');
4993 if (flags & F_ZERO) {
4994 fill = '0';
4995 if ((flags&F_ALT) &&
4996 (c == 'x' || c == 'X') &&
4997 pbuf[0] == '0' && pbuf[1] == c) {
4998 *res++ = *pbuf++;
4999 *res++ = *pbuf++;
5000 rescnt -= 2;
5001 len -= 2;
5002 width -= 2;
5003 if (width < 0)
5004 width = 0;
5007 break;
5009 case 'e':
5010 case 'E':
5011 case 'f':
5012 case 'g':
5013 case 'G':
5014 pbuf = formatbuf;
5015 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5016 flags, prec, c, v);
5017 if (len < 0)
5018 goto onError;
5019 sign = 1;
5020 if (flags&F_ZERO)
5021 fill = '0';
5022 break;
5024 case 'c':
5025 pbuf = formatbuf;
5026 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5027 if (len < 0)
5028 goto onError;
5029 break;
5031 default:
5032 PyErr_Format(PyExc_ValueError,
5033 "unsupported format character '%c' (0x%x)",
5034 c, c);
5035 goto onError;
5037 if (sign) {
5038 if (*pbuf == '-' || *pbuf == '+') {
5039 sign = *pbuf++;
5040 len--;
5042 else if (flags & F_SIGN)
5043 sign = '+';
5044 else if (flags & F_BLANK)
5045 sign = ' ';
5046 else
5047 sign = 0;
5049 if (width < len)
5050 width = len;
5051 if (rescnt < width + (sign != 0)) {
5052 reslen -= rescnt;
5053 rescnt = width + fmtcnt + 100;
5054 reslen += rescnt;
5055 if (_PyUnicode_Resize(result, reslen) < 0)
5056 return NULL;
5057 res = PyUnicode_AS_UNICODE(result)
5058 + reslen - rescnt;
5060 if (sign) {
5061 if (fill != ' ')
5062 *res++ = sign;
5063 rescnt--;
5064 if (width > len)
5065 width--;
5067 if (width > len && !(flags & F_LJUST)) {
5068 do {
5069 --rescnt;
5070 *res++ = fill;
5071 } while (--width > len);
5073 if (sign && fill == ' ')
5074 *res++ = sign;
5075 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5076 res += len;
5077 rescnt -= len;
5078 while (--width >= len) {
5079 --rescnt;
5080 *res++ = ' ';
5082 if (dict && (argidx < arglen) && c != '%') {
5083 PyErr_SetString(PyExc_TypeError,
5084 "not all arguments converted");
5085 goto onError;
5087 Py_XDECREF(temp);
5088 } /* '%' */
5089 } /* until end */
5090 if (argidx < arglen && !dict) {
5091 PyErr_SetString(PyExc_TypeError,
5092 "not all arguments converted");
5093 goto onError;
5096 if (args_owned) {
5097 Py_DECREF(args);
5099 Py_DECREF(uformat);
5100 if (_PyUnicode_Resize(result, reslen - rescnt))
5101 goto onError;
5102 return (PyObject *)result;
5104 onError:
5105 Py_XDECREF(result);
5106 Py_DECREF(uformat);
5107 if (args_owned) {
5108 Py_DECREF(args);
5110 return NULL;
5113 static PyBufferProcs unicode_as_buffer = {
5114 (getreadbufferproc) unicode_buffer_getreadbuf,
5115 (getwritebufferproc) unicode_buffer_getwritebuf,
5116 (getsegcountproc) unicode_buffer_getsegcount,
5117 (getcharbufferproc) unicode_buffer_getcharbuf,
5120 PyTypeObject PyUnicode_Type = {
5121 PyObject_HEAD_INIT(&PyType_Type)
5122 0, /* ob_size */
5123 "unicode", /* tp_name */
5124 sizeof(PyUnicodeObject), /* tp_size */
5125 0, /* tp_itemsize */
5126 /* Slots */
5127 (destructor)_PyUnicode_Free, /* tp_dealloc */
5128 0, /* tp_print */
5129 (getattrfunc)unicode_getattr, /* tp_getattr */
5130 0, /* tp_setattr */
5131 (cmpfunc) unicode_compare, /* tp_compare */
5132 (reprfunc) unicode_repr, /* tp_repr */
5133 0, /* tp_as_number */
5134 &unicode_as_sequence, /* tp_as_sequence */
5135 0, /* tp_as_mapping */
5136 (hashfunc) unicode_hash, /* tp_hash*/
5137 0, /* tp_call*/
5138 (reprfunc) unicode_str, /* tp_str */
5139 (getattrofunc) NULL, /* tp_getattro */
5140 (setattrofunc) NULL, /* tp_setattro */
5141 &unicode_as_buffer, /* tp_as_buffer */
5142 Py_TPFLAGS_DEFAULT, /* tp_flags */
5145 /* Initialize the Unicode implementation */
5147 void _PyUnicode_Init(void)
5149 /* Doublecheck the configuration... */
5150 if (sizeof(Py_UNICODE) != 2)
5151 Py_FatalError("Unicode configuration error: "
5152 "sizeof(Py_UNICODE) != 2 bytes");
5154 /* Init the implementation */
5155 unicode_freelist = NULL;
5156 unicode_freelist_size = 0;
5157 unicode_empty = _PyUnicode_New(0);
5158 strcpy(unicode_default_encoding, "ascii");
5161 /* Finalize the Unicode implementation */
5163 void
5164 _PyUnicode_Fini(void)
5166 PyUnicodeObject *u = unicode_freelist;
5168 while (u != NULL) {
5169 PyUnicodeObject *v = u;
5170 u = *(PyUnicodeObject **)u;
5171 if (v->str)
5172 PyMem_DEL(v->str);
5173 Py_XDECREF(v->defenc);
5174 PyObject_DEL(v);
5176 unicode_freelist = NULL;
5177 unicode_freelist_size = 0;
5178 Py_XDECREF(unicode_empty);
5179 unicode_empty = NULL;