Bump version to 1.0.
[python/dscho.git] / Objects / unicodeobject.c
blob b096faa3f29151b5d0e65f9f79d370937ac5fd71
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
10 Original header:
11 --------------------------------------------------------------------
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
29 * Written by Fredrik Lundh, January 1999.
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
65 #include "Python.h"
67 #include "unicodeobject.h"
68 #include "ucnhash.h"
70 #ifdef MS_WIN32
71 #include <windows.h>
72 #endif
/* Limit for the Unicode object free list: at most this many released
   objects are kept around for reuse. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list keep-alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* The empty Unicode object; shared by every zero-length result. */
static PyUnicodeObject *unicode_empty;

/* Free list for Unicode objects: a singly linked list threaded through
   the first pointer-sized word of each released object, plus its
   current length. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/

static char unicode_default_encoding[100];
129 /* --- Unicode Object ----------------------------------------------------- */
131 static
132 int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133 int length)
135 void *oldstr;
137 /* Shortcut if there's nothing much to do. */
138 if (unicode->length == length)
139 goto reset;
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode == unicode_empty) {
143 PyErr_SetString(PyExc_SystemError,
144 "can't resize empty unicode object");
145 return -1;
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
153 unicode->str = oldstr;
154 PyErr_NoMemory();
155 return -1;
157 unicode->str[length] = 0;
158 unicode->length = length;
160 reset:
161 /* Reset the object caches */
162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
166 unicode->hash = -1;
168 return 0;
171 int PyUnicode_Resize(PyObject **unicode,
172 int length)
174 PyUnicodeObject *v;
176 if (unicode == NULL) {
177 PyErr_BadInternalCall();
178 return -1;
180 v = (PyUnicodeObject *)*unicode;
181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182 PyErr_BadInternalCall();
183 return -1;
185 return _PyUnicode_Resize(v, length);
188 /* We allocate one more byte to make sure the string is
189 Ux0000 terminated -- XXX is this needed ?
191 XXX This allocator could further be enhanced by assuring that the
192 free list never reduces its size below 1.
196 static
197 PyUnicodeObject *_PyUnicode_New(int length)
199 register PyUnicodeObject *unicode;
201 /* Optimization for empty strings */
202 if (length == 0 && unicode_empty != NULL) {
203 Py_INCREF(unicode_empty);
204 return unicode_empty;
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist) {
209 unicode = unicode_freelist;
210 unicode_freelist = *(PyUnicodeObject **)unicode;
211 unicode_freelist_size--;
212 if (unicode->str) {
213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode->length < length) &&
216 _PyUnicode_Resize(unicode, length)) {
217 PyMem_DEL(unicode->str);
218 goto onError;
221 else {
222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
224 PyObject_INIT(unicode, &PyUnicode_Type);
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
233 if (!unicode->str) {
234 PyErr_NoMemory();
235 goto onError;
237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
240 unicode->defenc = NULL;
241 return unicode;
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
245 PyObject_DEL(unicode);
246 return NULL;
249 static
250 void _PyUnicode_Free(register PyUnicodeObject *unicode)
252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
255 PyMem_DEL(unicode->str);
256 unicode->str = NULL;
257 unicode->length = 0;
259 if (unicode->defenc) {
260 Py_DECREF(unicode->defenc);
261 unicode->defenc = NULL;
263 /* Add to free list */
264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
268 else {
269 PyMem_DEL(unicode->str);
270 Py_XDECREF(unicode->defenc);
271 PyObject_DEL(unicode);
275 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
278 PyUnicodeObject *unicode;
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
288 return (PyObject *)unicode;
291 #ifdef HAVE_WCHAR_H
293 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
296 PyUnicodeObject *unicode;
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
307 /* Copy the wchar_t data into the new object */
308 #ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310 #else
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
318 #endif
320 return (PyObject *)unicode;
323 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333 #ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335 #else
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
343 #endif
345 return size;
348 #endif
350 PyObject *PyUnicode_FromObject(register PyObject *obj)
352 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
355 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356 const char *encoding,
357 const char *errors)
359 const char *s;
360 int len;
361 int owned = 0;
362 PyObject *v;
364 if (obj == NULL) {
365 PyErr_BadInternalCall();
366 return NULL;
369 /* Coerce object */
370 if (PyInstance_Check(obj)) {
371 PyObject *func;
372 func = PyObject_GetAttrString(obj, "__str__");
373 if (func == NULL) {
374 PyErr_SetString(PyExc_TypeError,
375 "coercing to Unicode: instance doesn't define __str__");
376 return NULL;
378 obj = PyEval_CallObject(func, NULL);
379 Py_DECREF(func);
380 if (obj == NULL)
381 return NULL;
382 owned = 1;
384 if (PyUnicode_Check(obj)) {
385 Py_INCREF(obj);
386 v = obj;
387 if (encoding) {
388 PyErr_SetString(PyExc_TypeError,
389 "decoding Unicode is not supported");
390 return NULL;
392 goto done;
394 else if (PyString_Check(obj)) {
395 s = PyString_AS_STRING(obj);
396 len = PyString_GET_SIZE(obj);
398 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError))
402 PyErr_Format(PyExc_TypeError,
403 "coercing to Unicode: need string or buffer, "
404 "%.80s found",
405 obj->ob_type->tp_name);
406 goto onError;
409 /* Convert to Unicode */
410 if (len == 0) {
411 Py_INCREF(unicode_empty);
412 v = (PyObject *)unicode_empty;
414 else
415 v = PyUnicode_Decode(s, len, encoding, errors);
416 done:
417 if (owned) {
418 Py_DECREF(obj);
420 return v;
422 onError:
423 if (owned) {
424 Py_DECREF(obj);
426 return NULL;
429 PyObject *PyUnicode_Decode(const char *s,
430 int size,
431 const char *encoding,
432 const char *errors)
434 PyObject *buffer = NULL, *unicode;
436 if (encoding == NULL)
437 encoding = PyUnicode_GetDefaultEncoding();
439 /* Shortcuts for common default encodings */
440 if (strcmp(encoding, "utf-8") == 0)
441 return PyUnicode_DecodeUTF8(s, size, errors);
442 else if (strcmp(encoding, "latin-1") == 0)
443 return PyUnicode_DecodeLatin1(s, size, errors);
444 else if (strcmp(encoding, "ascii") == 0)
445 return PyUnicode_DecodeASCII(s, size, errors);
447 /* Decode via the codec registry */
448 buffer = PyBuffer_FromMemory((void *)s, size);
449 if (buffer == NULL)
450 goto onError;
451 unicode = PyCodec_Decode(buffer, encoding, errors);
452 if (unicode == NULL)
453 goto onError;
454 if (!PyUnicode_Check(unicode)) {
455 PyErr_Format(PyExc_TypeError,
456 "decoder did not return an unicode object (type=%.400s)",
457 unicode->ob_type->tp_name);
458 Py_DECREF(unicode);
459 goto onError;
461 Py_DECREF(buffer);
462 return unicode;
464 onError:
465 Py_XDECREF(buffer);
466 return NULL;
469 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
470 int size,
471 const char *encoding,
472 const char *errors)
474 PyObject *v, *unicode;
476 unicode = PyUnicode_FromUnicode(s, size);
477 if (unicode == NULL)
478 return NULL;
479 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
480 Py_DECREF(unicode);
481 return v;
484 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
485 const char *encoding,
486 const char *errors)
488 PyObject *v;
490 if (!PyUnicode_Check(unicode)) {
491 PyErr_BadArgument();
492 goto onError;
495 if (encoding == NULL)
496 encoding = PyUnicode_GetDefaultEncoding();
498 /* Shortcuts for common default encodings */
499 if (errors == NULL) {
500 if (strcmp(encoding, "utf-8") == 0)
501 return PyUnicode_AsUTF8String(unicode);
502 else if (strcmp(encoding, "latin-1") == 0)
503 return PyUnicode_AsLatin1String(unicode);
504 else if (strcmp(encoding, "ascii") == 0)
505 return PyUnicode_AsASCIIString(unicode);
508 /* Encode via the codec registry */
509 v = PyCodec_Encode(unicode, encoding, errors);
510 if (v == NULL)
511 goto onError;
512 /* XXX Should we really enforce this ? */
513 if (!PyString_Check(v)) {
514 PyErr_Format(PyExc_TypeError,
515 "encoder did not return a string object (type=%.400s)",
516 v->ob_type->tp_name);
517 Py_DECREF(v);
518 goto onError;
520 return v;
522 onError:
523 return NULL;
526 /* Return a Python string holding the default encoded value of the
527 Unicode object.
529 The resulting string is cached in the Unicode object for subsequent
530 usage by this function. The cached version is needed to implement
531 the character buffer interface and will live (at least) as long as
532 the Unicode object itself.
534 The refcount of the string is *not* incremented.
536 *** Exported for internal use by the interpreter only !!! ***
540 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
541 const char *errors)
543 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
545 if (v)
546 return v;
547 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
548 if (v && errors == NULL)
549 ((PyUnicodeObject *)unicode)->defenc = v;
550 return v;
553 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
555 if (!PyUnicode_Check(unicode)) {
556 PyErr_BadArgument();
557 goto onError;
559 return PyUnicode_AS_UNICODE(unicode);
561 onError:
562 return NULL;
565 int PyUnicode_GetSize(PyObject *unicode)
567 if (!PyUnicode_Check(unicode)) {
568 PyErr_BadArgument();
569 goto onError;
571 return PyUnicode_GET_SIZE(unicode);
573 onError:
574 return -1;
577 const char *PyUnicode_GetDefaultEncoding(void)
579 return unicode_default_encoding;
582 int PyUnicode_SetDefaultEncoding(const char *encoding)
584 PyObject *v;
586 /* Make sure the encoding is valid. As side effect, this also
587 loads the encoding into the codec registry cache. */
588 v = _PyCodec_Lookup(encoding);
589 if (v == NULL)
590 goto onError;
591 Py_DECREF(v);
592 strncpy(unicode_default_encoding,
593 encoding,
594 sizeof(unicode_default_encoding));
595 return 0;
597 onError:
598 return -1;
601 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 lead byte to the total length of the sequence it
   starts.  A value of zero marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences; 0xFE, 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
625 static
626 int utf8_decoding_error(const char **source,
627 Py_UNICODE **dest,
628 const char *errors,
629 const char *details)
631 if ((errors == NULL) ||
632 (strcmp(errors,"strict") == 0)) {
633 PyErr_Format(PyExc_UnicodeError,
634 "UTF-8 decoding error: %.400s",
635 details);
636 return -1;
638 else if (strcmp(errors,"ignore") == 0) {
639 (*source)++;
640 return 0;
642 else if (strcmp(errors,"replace") == 0) {
643 (*source)++;
644 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
645 (*dest)++;
646 return 0;
648 else {
649 PyErr_Format(PyExc_ValueError,
650 "UTF-8 decoding error; unknown error handling code: %.400s",
651 errors);
652 return -1;
656 PyObject *PyUnicode_DecodeUTF8(const char *s,
657 int size,
658 const char *errors)
660 int n;
661 const char *e;
662 PyUnicodeObject *unicode;
663 Py_UNICODE *p;
664 const char *errmsg = "";
666 /* Note: size will always be longer than the resulting Unicode
667 character count */
668 unicode = _PyUnicode_New(size);
669 if (!unicode)
670 return NULL;
671 if (size == 0)
672 return (PyObject *)unicode;
674 /* Unpack UTF-8 encoded data */
675 p = unicode->str;
676 e = s + size;
678 while (s < e) {
679 Py_UCS4 ch = (unsigned char)*s;
681 if (ch < 0x80) {
682 *p++ = (Py_UNICODE)ch;
683 s++;
684 continue;
687 n = utf8_code_length[ch];
689 if (s + n > e) {
690 errmsg = "unexpected end of data";
691 goto utf8Error;
694 switch (n) {
696 case 0:
697 errmsg = "unexpected code byte";
698 goto utf8Error;
699 break;
701 case 1:
702 errmsg = "internal error";
703 goto utf8Error;
704 break;
706 case 2:
707 if ((s[1] & 0xc0) != 0x80) {
708 errmsg = "invalid data";
709 goto utf8Error;
711 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
712 if (ch < 0x80) {
713 errmsg = "illegal encoding";
714 goto utf8Error;
716 else
717 *p++ = (Py_UNICODE)ch;
718 break;
720 case 3:
721 if ((s[1] & 0xc0) != 0x80 ||
722 (s[2] & 0xc0) != 0x80) {
723 errmsg = "invalid data";
724 goto utf8Error;
726 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
727 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
728 errmsg = "illegal encoding";
729 goto utf8Error;
731 else
732 *p++ = (Py_UNICODE)ch;
733 break;
735 case 4:
736 if ((s[1] & 0xc0) != 0x80 ||
737 (s[2] & 0xc0) != 0x80 ||
738 (s[3] & 0xc0) != 0x80) {
739 errmsg = "invalid data";
740 goto utf8Error;
742 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
743 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
744 /* validate and convert to UTF-16 */
745 if ((ch < 0x10000) || /* minimum value allowed for 4
746 byte encoding */
747 (ch > 0x10ffff)) { /* maximum value allowed for
748 UTF-16 */
749 errmsg = "illegal encoding";
750 goto utf8Error;
752 /* compute and append the two surrogates: */
754 /* translate from 10000..10FFFF to 0..FFFF */
755 ch -= 0x10000;
757 /* high surrogate = top 10 bits added to D800 */
758 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
760 /* low surrogate = bottom 10 bits added to DC00 */
761 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
762 break;
764 default:
765 /* Other sizes are only needed for UCS-4 */
766 errmsg = "unsupported Unicode code range";
767 goto utf8Error;
768 break;
770 s += n;
771 continue;
773 utf8Error:
774 if (utf8_decoding_error(&s, &p, errors, errmsg))
775 goto onError;
778 /* Adjust length */
779 if (_PyUnicode_Resize(unicode, p - unicode->str))
780 goto onError;
782 return (PyObject *)unicode;
784 onError:
785 Py_DECREF(unicode);
786 return NULL;
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept compiled out for reference: it applies the
   `errors` policy to an encoding problem ("replace" stores '?'). */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
823 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
824 int size,
825 const char *errors)
827 PyObject *v;
828 char *p;
829 char *q;
830 Py_UCS4 ch2;
831 unsigned int cbAllocated = 3 * size;
832 unsigned int cbWritten = 0;
833 int i = 0;
835 v = PyString_FromStringAndSize(NULL, cbAllocated);
836 if (v == NULL)
837 return NULL;
838 if (size == 0)
839 return v;
841 p = q = PyString_AS_STRING(v);
842 while (i < size) {
843 Py_UCS4 ch = s[i++];
844 if (ch < 0x80) {
845 *p++ = (char) ch;
846 cbWritten++;
848 else if (ch < 0x0800) {
849 *p++ = 0xc0 | (ch >> 6);
850 *p++ = 0x80 | (ch & 0x3f);
851 cbWritten += 2;
853 else {
854 /* Check for high surrogate */
855 if (0xD800 <= ch && ch <= 0xDBFF) {
856 if (i != size) {
857 ch2 = s[i];
858 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
860 if (cbWritten >= (cbAllocated - 4)) {
861 /* Provide enough room for some more
862 surrogates */
863 cbAllocated += 4*10;
864 if (_PyString_Resize(&v, cbAllocated))
865 goto onError;
868 /* combine the two values */
869 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
871 *p++ = (char)((ch >> 18) | 0xf0);
872 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
873 i++;
874 cbWritten += 4;
878 else {
879 *p++ = (char)(0xe0 | (ch >> 12));
880 cbWritten += 3;
882 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
883 *p++ = (char)(0x80 | (ch & 0x3f));
886 *p = '\0';
887 if (_PyString_Resize(&v, p - q))
888 goto onError;
889 return v;
891 onError:
892 Py_DECREF(v);
893 return NULL;
896 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
898 if (!PyUnicode_Check(unicode)) {
899 PyErr_BadArgument();
900 return NULL;
902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
903 PyUnicode_GET_SIZE(unicode),
904 NULL);
907 /* --- UTF-16 Codec ------------------------------------------------------- */
909 static
910 int utf16_decoding_error(const Py_UNICODE **source,
911 Py_UNICODE **dest,
912 const char *errors,
913 const char *details)
915 if ((errors == NULL) ||
916 (strcmp(errors,"strict") == 0)) {
917 PyErr_Format(PyExc_UnicodeError,
918 "UTF-16 decoding error: %.400s",
919 details);
920 return -1;
922 else if (strcmp(errors,"ignore") == 0) {
923 return 0;
925 else if (strcmp(errors,"replace") == 0) {
926 if (dest) {
927 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
928 (*dest)++;
930 return 0;
932 else {
933 PyErr_Format(PyExc_ValueError,
934 "UTF-16 decoding error; "
935 "unknown error handling code: %.400s",
936 errors);
937 return -1;
941 PyObject *PyUnicode_DecodeUTF16(const char *s,
942 int size,
943 const char *errors,
944 int *byteorder)
946 PyUnicodeObject *unicode;
947 Py_UNICODE *p;
948 const Py_UNICODE *q, *e;
949 int bo = 0;
950 const char *errmsg = "";
952 /* size should be an even number */
953 if (size % sizeof(Py_UNICODE) != 0) {
954 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
955 return NULL;
956 /* The remaining input chars are ignored if we fall through
957 here... */
960 /* Note: size will always be longer than the resulting Unicode
961 character count */
962 unicode = _PyUnicode_New(size);
963 if (!unicode)
964 return NULL;
965 if (size == 0)
966 return (PyObject *)unicode;
968 /* Unpack UTF-16 encoded data */
969 p = unicode->str;
970 q = (Py_UNICODE *)s;
971 e = q + (size / sizeof(Py_UNICODE));
973 if (byteorder)
974 bo = *byteorder;
976 while (q < e) {
977 register Py_UNICODE ch = *q++;
979 /* Check for BOM marks (U+FEFF) in the input and adjust
980 current byte order setting accordingly. Swap input
981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
982 !) */
983 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
984 if (ch == 0xFEFF) {
985 bo = -1;
986 continue;
987 } else if (ch == 0xFFFE) {
988 bo = 1;
989 continue;
991 if (bo == 1)
992 ch = (ch >> 8) | (ch << 8);
993 #else
994 if (ch == 0xFEFF) {
995 bo = 1;
996 continue;
997 } else if (ch == 0xFFFE) {
998 bo = -1;
999 continue;
1001 if (bo == -1)
1002 ch = (ch >> 8) | (ch << 8);
1003 #endif
1004 if (ch < 0xD800 || ch > 0xDFFF) {
1005 *p++ = ch;
1006 continue;
1009 /* UTF-16 code pair: */
1010 if (q >= e) {
1011 errmsg = "unexpected end of data";
1012 goto utf16Error;
1014 if (0xDC00 <= *q && *q <= 0xDFFF) {
1015 q++;
1016 if (0xD800 <= *q && *q <= 0xDBFF) {
1017 /* This is valid data (a UTF-16 surrogate pair), but
1018 we are not able to store this information since our
1019 Py_UNICODE type only has 16 bits... this might
1020 change someday, even though it's unlikely. */
1021 errmsg = "code pairs are not supported";
1022 goto utf16Error;
1024 else
1025 continue;
1027 errmsg = "illegal encoding";
1028 /* Fall through to report the error */
1030 utf16Error:
1031 if (utf16_decoding_error(&q, &p, errors, errmsg))
1032 goto onError;
1035 if (byteorder)
1036 *byteorder = bo;
1038 /* Adjust length */
1039 if (_PyUnicode_Resize(unicode, p - unicode->str))
1040 goto onError;
1042 return (PyObject *)unicode;
1044 onError:
1045 Py_DECREF(unicode);
1046 return NULL;
1049 #undef UTF16_ERROR
1051 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052 int size,
1053 const char *errors,
1054 int byteorder)
1056 PyObject *v;
1057 Py_UNICODE *p;
1058 char *q;
1060 /* We don't create UTF-16 pairs... */
1061 v = PyString_FromStringAndSize(NULL,
1062 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063 if (v == NULL)
1064 return NULL;
1066 q = PyString_AS_STRING(v);
1067 p = (Py_UNICODE *)q;
1068 if (byteorder == 0)
1069 *p++ = 0xFEFF;
1070 if (size == 0)
1071 return v;
1072 if (byteorder == 0 ||
1073 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074 byteorder == -1
1075 #else
1076 byteorder == 1
1077 #endif
1079 memcpy(p, s, size * sizeof(Py_UNICODE));
1080 else
1081 while (size-- > 0) {
1082 Py_UNICODE ch = *s++;
1083 *p++ = (ch >> 8) | (ch << 8);
1085 return v;
1088 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1090 if (!PyUnicode_Check(unicode)) {
1091 PyErr_BadArgument();
1092 return NULL;
1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095 PyUnicode_GET_SIZE(unicode),
1096 NULL,
1100 /* --- Unicode Escape Codec ----------------------------------------------- */
1102 static
1103 int unicodeescape_decoding_error(const char **source,
1104 Py_UNICODE *x,
1105 const char *errors,
1106 const char *details)
1108 if ((errors == NULL) ||
1109 (strcmp(errors,"strict") == 0)) {
1110 PyErr_Format(PyExc_UnicodeError,
1111 "Unicode-Escape decoding error: %.400s",
1112 details);
1113 return -1;
1115 else if (strcmp(errors,"ignore") == 0) {
1116 return 0;
1118 else if (strcmp(errors,"replace") == 0) {
1119 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1120 return 0;
1122 else {
1123 PyErr_Format(PyExc_ValueError,
1124 "Unicode-Escape decoding error; "
1125 "unknown error handling code: %.400s",
1126 errors);
1127 return -1;
1131 static _Py_UCNHashAPI *pucnHash = NULL;
/* Case-insensitive comparison of at most `count` characters of s1 and
   s2.

   Returns 0 when the compared prefixes are equal (or count is 0),
   otherwise the difference between the first pair of lower-cased
   characters that differ.  Comparison stops at the first NUL that
   both strings share, so the function never reads past the end of two
   equal NUL-terminated strings. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    char c1, c2;

    if (count == 0)
        return 0;

    do {
        /* tolower() is only defined for unsigned char values (and
           EOF), so go through unsigned char for plain char input. */
        c1 = (char)tolower((unsigned char)*s1++);
        c2 = (char)tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1153 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154 int size,
1155 const char *errors)
1157 PyUnicodeObject *v;
1158 Py_UNICODE *p = NULL, *buf = NULL;
1159 const char *end;
1160 Py_UCS4 chr;
1162 /* Escaped strings will always be longer than the resulting
1163 Unicode string, so we start with size here and then reduce the
1164 length after conversion to the true value. */
1165 v = _PyUnicode_New(size);
1166 if (v == NULL)
1167 goto onError;
1168 if (size == 0)
1169 return (PyObject *)v;
1170 p = buf = PyUnicode_AS_UNICODE(v);
1171 end = s + size;
1172 while (s < end) {
1173 unsigned char c;
1174 Py_UNICODE x;
1175 int i;
1177 /* Non-escape characters are interpreted as Unicode ordinals */
1178 if (*s != '\\') {
1179 *p++ = (unsigned char)*s++;
1180 continue;
1183 /* \ - Escapes */
1184 s++;
1185 switch (*s++) {
1187 /* \x escapes */
1188 case '\n': break;
1189 case '\\': *p++ = '\\'; break;
1190 case '\'': *p++ = '\''; break;
1191 case '\"': *p++ = '\"'; break;
1192 case 'b': *p++ = '\b'; break;
1193 case 'f': *p++ = '\014'; break; /* FF */
1194 case 't': *p++ = '\t'; break;
1195 case 'n': *p++ = '\n'; break;
1196 case 'r': *p++ = '\r'; break;
1197 case 'v': *p++ = '\013'; break; /* VT */
1198 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1200 /* \OOO (octal) escapes */
1201 case '0': case '1': case '2': case '3':
1202 case '4': case '5': case '6': case '7':
1203 x = s[-1] - '0';
1204 if ('0' <= *s && *s <= '7') {
1205 x = (x<<3) + *s++ - '0';
1206 if ('0' <= *s && *s <= '7')
1207 x = (x<<3) + *s++ - '0';
1209 *p++ = x;
1210 break;
1212 /* \xXX with two hex digits */
1213 case 'x':
1214 for (x = 0, i = 0; i < 2; i++) {
1215 c = (unsigned char)s[i];
1216 if (!isxdigit(c)) {
1217 if (unicodeescape_decoding_error(&s, &x, errors,
1218 "truncated \\xXX"))
1219 goto onError;
1220 i++;
1221 break;
1223 x = (x<<4) & ~0xF;
1224 if (c >= '0' && c <= '9')
1225 x += c - '0';
1226 else if (c >= 'a' && c <= 'f')
1227 x += 10 + c - 'a';
1228 else
1229 x += 10 + c - 'A';
1231 s += i;
1232 *p++ = x;
1233 break;
1235 /* \uXXXX with 4 hex digits */
1236 case 'u':
1237 for (x = 0, i = 0; i < 4; i++) {
1238 c = (unsigned char)s[i];
1239 if (!isxdigit(c)) {
1240 if (unicodeescape_decoding_error(&s, &x, errors,
1241 "truncated \\uXXXX"))
1242 goto onError;
1243 i++;
1244 break;
1246 x = (x<<4) & ~0xF;
1247 if (c >= '0' && c <= '9')
1248 x += c - '0';
1249 else if (c >= 'a' && c <= 'f')
1250 x += 10 + c - 'a';
1251 else
1252 x += 10 + c - 'A';
1254 s += i;
1255 *p++ = x;
1256 break;
1258 /* \UXXXXXXXX with 8 hex digits */
1259 case 'U':
1260 for (chr = 0, i = 0; i < 8; i++) {
1261 c = (unsigned char)s[i];
1262 if (!isxdigit(c)) {
1263 if (unicodeescape_decoding_error(&s, &x, errors,
1264 "truncated \\uXXXX"))
1265 goto onError;
1266 i++;
1267 break;
1269 chr = (chr<<4) & ~0xF;
1270 if (c >= '0' && c <= '9')
1271 chr += c - '0';
1272 else if (c >= 'a' && c <= 'f')
1273 chr += 10 + c - 'a';
1274 else
1275 chr += 10 + c - 'A';
1277 s += i;
1278 goto store;
1280 case 'N':
1281 /* Ok, we need to deal with Unicode Character Names now,
1282 * make sure we've imported the hash table data...
1284 if (pucnHash == NULL) {
1285 PyObject *mod = 0, *v = 0;
1286 mod = PyImport_ImportModule("ucnhash");
1287 if (mod == NULL)
1288 goto onError;
1289 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290 Py_DECREF(mod);
1291 if (v == NULL)
1292 goto onError;
1293 pucnHash = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (pucnHash == NULL)
1296 goto onError;
1299 if (*s == '{') {
1300 const char *start = s + 1;
1301 const char *endBrace = start;
1302 unsigned long j;
1304 /* look for either the closing brace, or we
1305 * exceed the maximum length of the unicode character names
1307 while (*endBrace != '}' &&
1308 (unsigned int)(endBrace - start) <=
1309 pucnHash->cchMax &&
1310 endBrace < end)
1312 endBrace++;
1314 if (endBrace != end && *endBrace == '}') {
1315 j = pucnHash->hash(start, endBrace - start);
1316 if (j > pucnHash->cKeys ||
1317 mystrnicmp(
1318 start,
1319 ((_Py_UnicodeCharacterName *)
1320 (pucnHash->getValue(j)))->pszUCN,
1321 (int)(endBrace - start)) != 0)
1323 if (unicodeescape_decoding_error(
1324 &s, &x, errors,
1325 "Invalid Unicode Character Name"))
1327 goto onError;
1329 goto ucnFallthrough;
1331 chr = ((_Py_UnicodeCharacterName *)
1332 (pucnHash->getValue(j)))->value;
1333 s = endBrace + 1;
1334 goto store;
1335 } else {
1336 if (unicodeescape_decoding_error(
1337 &s, &x, errors,
1338 "Unicode name missing closing brace"))
1339 goto onError;
1340 goto ucnFallthrough;
1342 break;
1344 if (unicodeescape_decoding_error(
1345 &s, &x, errors,
1346 "Missing opening brace for Unicode Character Name escape"))
1347 goto onError;
1348 ucnFallthrough:
1349 /* fall through on purpose */
1350 default:
1351 *p++ = '\\';
1352 *p++ = (unsigned char)s[-1];
1353 break;
1354 store:
1355 /* when we get here, chr is a 32-bit unicode character */
1356 if (chr <= 0xffff)
1357 /* UCS-2 character */
1358 *p++ = (Py_UNICODE) chr;
1359 else if (chr <= 0x10ffff) {
1360 /* UCS-4 character. store as two surrogate characters */
1361 chr -= 0x10000L;
1362 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364 } else {
1365 if (unicodeescape_decoding_error(
1366 &s, &x, errors,
1367 "Illegal Unicode character")
1369 goto onError;
1373 if (_PyUnicode_Resize(v, (int)(p - buf)))
1374 goto onError;
1375 return (PyObject *)v;
1377 onError:
1378 Py_XDECREF(v);
1379 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
1389 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1390 int size,
1391 Py_UNICODE ch);
1393 static
1394 PyObject *unicodeescape_string(const Py_UNICODE *s,
1395 int size,
1396 int quotes)
1398 PyObject *repr;
1399 char *p;
1400 char *q;
1402 static const char *hexdigit = "0123456789ABCDEF";
1404 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405 if (repr == NULL)
1406 return NULL;
1408 p = q = PyString_AS_STRING(repr);
1410 if (quotes) {
1411 *p++ = 'u';
1412 *p++ = (findchar(s, size, '\'') &&
1413 !findchar(s, size, '"')) ? '"' : '\'';
1415 while (size-- > 0) {
1416 Py_UNICODE ch = *s++;
1417 /* Escape quotes */
1418 if (quotes && (ch == q[1] || ch == '\\')) {
1419 *p++ = '\\';
1420 *p++ = (char) ch;
1422 /* Map 16-bit characters to '\uxxxx' */
1423 else if (ch >= 256) {
1424 *p++ = '\\';
1425 *p++ = 'u';
1426 *p++ = hexdigit[(ch >> 12) & 0xf];
1427 *p++ = hexdigit[(ch >> 8) & 0xf];
1428 *p++ = hexdigit[(ch >> 4) & 0xf];
1429 *p++ = hexdigit[ch & 15];
1431 /* Map non-printable US ASCII to '\ooo' */
1432 else if (ch < ' ' || ch >= 128) {
1433 *p++ = '\\';
1434 *p++ = hexdigit[(ch >> 6) & 7];
1435 *p++ = hexdigit[(ch >> 3) & 7];
1436 *p++ = hexdigit[ch & 7];
1438 /* Copy everything else as-is */
1439 else
1440 *p++ = (char) ch;
1442 if (quotes)
1443 *p++ = q[1];
1445 *p = '\0';
1446 if (_PyString_Resize(&repr, p - q))
1447 goto onError;
1449 return repr;
1451 onError:
1452 Py_DECREF(repr);
1453 return NULL;
1456 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457 int size)
1459 return unicodeescape_string(s, size, 0);
1462 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1464 if (!PyUnicode_Check(unicode)) {
1465 PyErr_BadArgument();
1466 return NULL;
1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469 PyUnicode_GET_SIZE(unicode));
1472 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1474 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475 int size,
1476 const char *errors)
1478 PyUnicodeObject *v;
1479 Py_UNICODE *p, *buf;
1480 const char *end;
1481 const char *bs;
1483 /* Escaped strings will always be longer than the resulting
1484 Unicode string, so we start with size here and then reduce the
1485 length after conversion to the true value. */
1486 v = _PyUnicode_New(size);
1487 if (v == NULL)
1488 goto onError;
1489 if (size == 0)
1490 return (PyObject *)v;
1491 p = buf = PyUnicode_AS_UNICODE(v);
1492 end = s + size;
1493 while (s < end) {
1494 unsigned char c;
1495 Py_UNICODE x;
1496 int i;
1498 /* Non-escape characters are interpreted as Unicode ordinals */
1499 if (*s != '\\') {
1500 *p++ = (unsigned char)*s++;
1501 continue;
1504 /* \u-escapes are only interpreted iff the number of leading
1505 backslashes if odd */
1506 bs = s;
1507 for (;s < end;) {
1508 if (*s != '\\')
1509 break;
1510 *p++ = (unsigned char)*s++;
1512 if (((s - bs) & 1) == 0 ||
1513 s >= end ||
1514 *s != 'u') {
1515 continue;
1517 p--;
1518 s++;
1520 /* \uXXXX with 4 hex digits */
1521 for (x = 0, i = 0; i < 4; i++) {
1522 c = (unsigned char)s[i];
1523 if (!isxdigit(c)) {
1524 if (unicodeescape_decoding_error(&s, &x, errors,
1525 "truncated \\uXXXX"))
1526 goto onError;
1527 i++;
1528 break;
1530 x = (x<<4) & ~0xF;
1531 if (c >= '0' && c <= '9')
1532 x += c - '0';
1533 else if (c >= 'a' && c <= 'f')
1534 x += 10 + c - 'a';
1535 else
1536 x += 10 + c - 'A';
1538 s += i;
1539 *p++ = x;
1541 if (_PyUnicode_Resize(v, (int)(p - buf)))
1542 goto onError;
1543 return (PyObject *)v;
1545 onError:
1546 Py_XDECREF(v);
1547 return NULL;
1550 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551 int size)
1553 PyObject *repr;
1554 char *p;
1555 char *q;
1557 static const char *hexdigit = "0123456789ABCDEF";
1559 repr = PyString_FromStringAndSize(NULL, 6 * size);
1560 if (repr == NULL)
1561 return NULL;
1562 if (size == 0)
1563 return repr;
1565 p = q = PyString_AS_STRING(repr);
1566 while (size-- > 0) {
1567 Py_UNICODE ch = *s++;
1568 /* Map 16-bit characters to '\uxxxx' */
1569 if (ch >= 256) {
1570 *p++ = '\\';
1571 *p++ = 'u';
1572 *p++ = hexdigit[(ch >> 12) & 0xf];
1573 *p++ = hexdigit[(ch >> 8) & 0xf];
1574 *p++ = hexdigit[(ch >> 4) & 0xf];
1575 *p++ = hexdigit[ch & 15];
1577 /* Copy everything else as-is */
1578 else
1579 *p++ = (char) ch;
1581 *p = '\0';
1582 if (_PyString_Resize(&repr, p - q))
1583 goto onError;
1585 return repr;
1587 onError:
1588 Py_DECREF(repr);
1589 return NULL;
1592 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1594 if (!PyUnicode_Check(unicode)) {
1595 PyErr_BadArgument();
1596 return NULL;
1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599 PyUnicode_GET_SIZE(unicode));
1602 /* --- Latin-1 Codec ------------------------------------------------------ */
1604 PyObject *PyUnicode_DecodeLatin1(const char *s,
1605 int size,
1606 const char *errors)
1608 PyUnicodeObject *v;
1609 Py_UNICODE *p;
1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612 v = _PyUnicode_New(size);
1613 if (v == NULL)
1614 goto onError;
1615 if (size == 0)
1616 return (PyObject *)v;
1617 p = PyUnicode_AS_UNICODE(v);
1618 while (size-- > 0)
1619 *p++ = (unsigned char)*s++;
1620 return (PyObject *)v;
1622 onError:
1623 Py_XDECREF(v);
1624 return NULL;
1627 static
1628 int latin1_encoding_error(const Py_UNICODE **source,
1629 char **dest,
1630 const char *errors,
1631 const char *details)
1633 if ((errors == NULL) ||
1634 (strcmp(errors,"strict") == 0)) {
1635 PyErr_Format(PyExc_UnicodeError,
1636 "Latin-1 encoding error: %.400s",
1637 details);
1638 return -1;
1640 else if (strcmp(errors,"ignore") == 0) {
1641 return 0;
1643 else if (strcmp(errors,"replace") == 0) {
1644 **dest = '?';
1645 (*dest)++;
1646 return 0;
1648 else {
1649 PyErr_Format(PyExc_ValueError,
1650 "Latin-1 encoding error; "
1651 "unknown error handling code: %.400s",
1652 errors);
1653 return -1;
1657 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658 int size,
1659 const char *errors)
1661 PyObject *repr;
1662 char *s, *start;
1664 repr = PyString_FromStringAndSize(NULL, size);
1665 if (repr == NULL)
1666 return NULL;
1667 if (size == 0)
1668 return repr;
1670 s = PyString_AS_STRING(repr);
1671 start = s;
1672 while (size-- > 0) {
1673 Py_UNICODE ch = *p++;
1674 if (ch >= 256) {
1675 if (latin1_encoding_error(&p, &s, errors,
1676 "ordinal not in range(256)"))
1677 goto onError;
1679 else
1680 *s++ = (char)ch;
1682 /* Resize if error handling skipped some characters */
1683 if (s - start < PyString_GET_SIZE(repr))
1684 if (_PyString_Resize(&repr, s - start))
1685 goto onError;
1686 return repr;
1688 onError:
1689 Py_DECREF(repr);
1690 return NULL;
1693 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1695 if (!PyUnicode_Check(unicode)) {
1696 PyErr_BadArgument();
1697 return NULL;
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700 PyUnicode_GET_SIZE(unicode),
1701 NULL);
1704 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1706 static
1707 int ascii_decoding_error(const char **source,
1708 Py_UNICODE **dest,
1709 const char *errors,
1710 const char *details)
1712 if ((errors == NULL) ||
1713 (strcmp(errors,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError,
1715 "ASCII decoding error: %.400s",
1716 details);
1717 return -1;
1719 else if (strcmp(errors,"ignore") == 0) {
1720 return 0;
1722 else if (strcmp(errors,"replace") == 0) {
1723 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724 (*dest)++;
1725 return 0;
1727 else {
1728 PyErr_Format(PyExc_ValueError,
1729 "ASCII decoding error; "
1730 "unknown error handling code: %.400s",
1731 errors);
1732 return -1;
1736 PyObject *PyUnicode_DecodeASCII(const char *s,
1737 int size,
1738 const char *errors)
1740 PyUnicodeObject *v;
1741 Py_UNICODE *p;
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v = _PyUnicode_New(size);
1745 if (v == NULL)
1746 goto onError;
1747 if (size == 0)
1748 return (PyObject *)v;
1749 p = PyUnicode_AS_UNICODE(v);
1750 while (size-- > 0) {
1751 register unsigned char c;
1753 c = (unsigned char)*s++;
1754 if (c < 128)
1755 *p++ = c;
1756 else if (ascii_decoding_error(&s, &p, errors,
1757 "ordinal not in range(128)"))
1758 goto onError;
1760 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762 goto onError;
1763 return (PyObject *)v;
1765 onError:
1766 Py_XDECREF(v);
1767 return NULL;
1770 static
1771 int ascii_encoding_error(const Py_UNICODE **source,
1772 char **dest,
1773 const char *errors,
1774 const char *details)
1776 if ((errors == NULL) ||
1777 (strcmp(errors,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError,
1779 "ASCII encoding error: %.400s",
1780 details);
1781 return -1;
1783 else if (strcmp(errors,"ignore") == 0) {
1784 return 0;
1786 else if (strcmp(errors,"replace") == 0) {
1787 **dest = '?';
1788 (*dest)++;
1789 return 0;
1791 else {
1792 PyErr_Format(PyExc_ValueError,
1793 "ASCII encoding error; "
1794 "unknown error handling code: %.400s",
1795 errors);
1796 return -1;
1800 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801 int size,
1802 const char *errors)
1804 PyObject *repr;
1805 char *s, *start;
1807 repr = PyString_FromStringAndSize(NULL, size);
1808 if (repr == NULL)
1809 return NULL;
1810 if (size == 0)
1811 return repr;
1813 s = PyString_AS_STRING(repr);
1814 start = s;
1815 while (size-- > 0) {
1816 Py_UNICODE ch = *p++;
1817 if (ch >= 128) {
1818 if (ascii_encoding_error(&p, &s, errors,
1819 "ordinal not in range(128)"))
1820 goto onError;
1822 else
1823 *s++ = (char)ch;
1825 /* Resize if error handling skipped some characters */
1826 if (s - start < PyString_GET_SIZE(repr))
1827 if (_PyString_Resize(&repr, s - start))
1828 goto onError;
1829 return repr;
1831 onError:
1832 Py_DECREF(repr);
1833 return NULL;
1836 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1838 if (!PyUnicode_Check(unicode)) {
1839 PyErr_BadArgument();
1840 return NULL;
1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843 PyUnicode_GET_SIZE(unicode),
1844 NULL);
1847 #ifdef MS_WIN32
1849 /* --- MBCS codecs for Windows -------------------------------------------- */
1851 PyObject *PyUnicode_DecodeMBCS(const char *s,
1852 int size,
1853 const char *errors)
1855 PyUnicodeObject *v;
1856 Py_UNICODE *p;
1858 /* First get the size of the result */
1859 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1860 if (size > 0 && usize==0)
1861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1863 v = _PyUnicode_New(usize);
1864 if (v == NULL)
1865 return NULL;
1866 if (usize == 0)
1867 return (PyObject *)v;
1868 p = PyUnicode_AS_UNICODE(v);
1869 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870 Py_DECREF(v);
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1874 return (PyObject *)v;
1877 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878 int size,
1879 const char *errors)
1881 PyObject *repr;
1882 char *s;
1883 DWORD mbcssize;
1885 /* If there are no characters, bail now! */
1886 if (size==0)
1887 return PyString_FromString("");
1889 /* First get the size of the result */
1890 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1891 if (mbcssize==0)
1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1894 repr = PyString_FromStringAndSize(NULL, mbcssize);
1895 if (repr == NULL)
1896 return NULL;
1897 if (mbcssize == 0)
1898 return repr;
1900 /* Do the conversion */
1901 s = PyString_AS_STRING(repr);
1902 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903 Py_DECREF(repr);
1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1906 return repr;
1909 #endif /* MS_WIN32 */
1911 /* --- Character Mapping Codec -------------------------------------------- */
1913 static
1914 int charmap_decoding_error(const char **source,
1915 Py_UNICODE **dest,
1916 const char *errors,
1917 const char *details)
1919 if ((errors == NULL) ||
1920 (strcmp(errors,"strict") == 0)) {
1921 PyErr_Format(PyExc_UnicodeError,
1922 "charmap decoding error: %.400s",
1923 details);
1924 return -1;
1926 else if (strcmp(errors,"ignore") == 0) {
1927 return 0;
1929 else if (strcmp(errors,"replace") == 0) {
1930 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931 (*dest)++;
1932 return 0;
1934 else {
1935 PyErr_Format(PyExc_ValueError,
1936 "charmap decoding error; "
1937 "unknown error handling code: %.400s",
1938 errors);
1939 return -1;
1943 PyObject *PyUnicode_DecodeCharmap(const char *s,
1944 int size,
1945 PyObject *mapping,
1946 const char *errors)
1948 PyUnicodeObject *v;
1949 Py_UNICODE *p;
1951 /* Default to Latin-1 */
1952 if (mapping == NULL)
1953 return PyUnicode_DecodeLatin1(s, size, errors);
1955 v = _PyUnicode_New(size);
1956 if (v == NULL)
1957 goto onError;
1958 if (size == 0)
1959 return (PyObject *)v;
1960 p = PyUnicode_AS_UNICODE(v);
1961 while (size-- > 0) {
1962 unsigned char ch = *s++;
1963 PyObject *w, *x;
1965 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1966 w = PyInt_FromLong((long)ch);
1967 if (w == NULL)
1968 goto onError;
1969 x = PyObject_GetItem(mapping, w);
1970 Py_DECREF(w);
1971 if (x == NULL) {
1972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1973 /* No mapping found: default to Latin-1 mapping */
1974 PyErr_Clear();
1975 *p++ = (Py_UNICODE)ch;
1976 continue;
1978 goto onError;
1981 /* Apply mapping */
1982 if (PyInt_Check(x)) {
1983 long value = PyInt_AS_LONG(x);
1984 if (value < 0 || value > 65535) {
1985 PyErr_SetString(PyExc_TypeError,
1986 "character mapping must be in range(65536)");
1987 Py_DECREF(x);
1988 goto onError;
1990 *p++ = (Py_UNICODE)value;
1992 else if (x == Py_None) {
1993 /* undefined mapping */
1994 if (charmap_decoding_error(&s, &p, errors,
1995 "character maps to <undefined>")) {
1996 Py_DECREF(x);
1997 goto onError;
2000 else if (PyUnicode_Check(x)) {
2001 if (PyUnicode_GET_SIZE(x) != 1) {
2002 /* 1-n mapping */
2003 PyErr_SetString(PyExc_NotImplementedError,
2004 "1-n mappings are currently not implemented");
2005 Py_DECREF(x);
2006 goto onError;
2008 *p++ = *PyUnicode_AS_UNICODE(x);
2010 else {
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError,
2013 "character mapping must return integer, None or unicode");
2014 Py_DECREF(x);
2015 goto onError;
2017 Py_DECREF(x);
2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2020 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2021 goto onError;
2022 return (PyObject *)v;
2024 onError:
2025 Py_XDECREF(v);
2026 return NULL;
2029 static
2030 int charmap_encoding_error(const Py_UNICODE **source,
2031 char **dest,
2032 const char *errors,
2033 const char *details)
2035 if ((errors == NULL) ||
2036 (strcmp(errors,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError,
2038 "charmap encoding error: %.400s",
2039 details);
2040 return -1;
2042 else if (strcmp(errors,"ignore") == 0) {
2043 return 0;
2045 else if (strcmp(errors,"replace") == 0) {
2046 **dest = '?';
2047 (*dest)++;
2048 return 0;
2050 else {
2051 PyErr_Format(PyExc_ValueError,
2052 "charmap encoding error; "
2053 "unknown error handling code: %.400s",
2054 errors);
2055 return -1;
2059 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060 int size,
2061 PyObject *mapping,
2062 const char *errors)
2064 PyObject *v;
2065 char *s;
2067 /* Default to Latin-1 */
2068 if (mapping == NULL)
2069 return PyUnicode_EncodeLatin1(p, size, errors);
2071 v = PyString_FromStringAndSize(NULL, size);
2072 if (v == NULL)
2073 return NULL;
2074 if (size == 0)
2075 return v;
2076 s = PyString_AS_STRING(v);
2077 while (size-- > 0) {
2078 Py_UNICODE ch = *p++;
2079 PyObject *w, *x;
2081 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2082 w = PyInt_FromLong((long)ch);
2083 if (w == NULL)
2084 goto onError;
2085 x = PyObject_GetItem(mapping, w);
2086 Py_DECREF(w);
2087 if (x == NULL) {
2088 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2089 /* No mapping found: default to Latin-1 mapping if possible */
2090 PyErr_Clear();
2091 if (ch < 256) {
2092 *s++ = (char)ch;
2093 continue;
2095 else if (!charmap_encoding_error(&p, &s, errors,
2096 "missing character mapping"))
2097 continue;
2099 goto onError;
2102 /* Apply mapping */
2103 if (PyInt_Check(x)) {
2104 long value = PyInt_AS_LONG(x);
2105 if (value < 0 || value > 255) {
2106 PyErr_SetString(PyExc_TypeError,
2107 "character mapping must be in range(256)");
2108 Py_DECREF(x);
2109 goto onError;
2111 *s++ = (char)value;
2113 else if (x == Py_None) {
2114 /* undefined mapping */
2115 if (charmap_encoding_error(&p, &s, errors,
2116 "character maps to <undefined>")) {
2117 Py_DECREF(x);
2118 goto onError;
2121 else if (PyString_Check(x)) {
2122 if (PyString_GET_SIZE(x) != 1) {
2123 /* 1-n mapping */
2124 PyErr_SetString(PyExc_NotImplementedError,
2125 "1-n mappings are currently not implemented");
2126 Py_DECREF(x);
2127 goto onError;
2129 *s++ = *PyString_AS_STRING(x);
2131 else {
2132 /* wrong return value */
2133 PyErr_SetString(PyExc_TypeError,
2134 "character mapping must return integer, None or unicode");
2135 Py_DECREF(x);
2136 goto onError;
2138 Py_DECREF(x);
2140 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2141 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2142 goto onError;
2143 return v;
2145 onError:
2146 Py_DECREF(v);
2147 return NULL;
2150 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2151 PyObject *mapping)
2153 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2154 PyErr_BadArgument();
2155 return NULL;
2157 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2158 PyUnicode_GET_SIZE(unicode),
2159 mapping,
2160 NULL);
2163 static
2164 int translate_error(const Py_UNICODE **source,
2165 Py_UNICODE **dest,
2166 const char *errors,
2167 const char *details)
2169 if ((errors == NULL) ||
2170 (strcmp(errors,"strict") == 0)) {
2171 PyErr_Format(PyExc_UnicodeError,
2172 "translate error: %.400s",
2173 details);
2174 return -1;
2176 else if (strcmp(errors,"ignore") == 0) {
2177 return 0;
2179 else if (strcmp(errors,"replace") == 0) {
2180 **dest = '?';
2181 (*dest)++;
2182 return 0;
2184 else {
2185 PyErr_Format(PyExc_ValueError,
2186 "translate error; "
2187 "unknown error handling code: %.400s",
2188 errors);
2189 return -1;
2193 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2194 int size,
2195 PyObject *mapping,
2196 const char *errors)
2198 PyUnicodeObject *v;
2199 Py_UNICODE *p;
2201 if (mapping == NULL) {
2202 PyErr_BadArgument();
2203 return NULL;
2206 /* Output will never be longer than input */
2207 v = _PyUnicode_New(size);
2208 if (v == NULL)
2209 goto onError;
2210 if (size == 0)
2211 goto done;
2212 p = PyUnicode_AS_UNICODE(v);
2213 while (size-- > 0) {
2214 Py_UNICODE ch = *s++;
2215 PyObject *w, *x;
2217 /* Get mapping */
2218 w = PyInt_FromLong(ch);
2219 if (w == NULL)
2220 goto onError;
2221 x = PyObject_GetItem(mapping, w);
2222 Py_DECREF(w);
2223 if (x == NULL) {
2224 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2225 /* No mapping found: default to 1-1 mapping */
2226 PyErr_Clear();
2227 *p++ = ch;
2228 continue;
2230 goto onError;
2233 /* Apply mapping */
2234 if (PyInt_Check(x))
2235 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2236 else if (x == Py_None) {
2237 /* undefined mapping */
2238 if (translate_error(&s, &p, errors,
2239 "character maps to <undefined>")) {
2240 Py_DECREF(x);
2241 goto onError;
2244 else if (PyUnicode_Check(x)) {
2245 if (PyUnicode_GET_SIZE(x) != 1) {
2246 /* 1-n mapping */
2247 PyErr_SetString(PyExc_NotImplementedError,
2248 "1-n mappings are currently not implemented");
2249 Py_DECREF(x);
2250 goto onError;
2252 *p++ = *PyUnicode_AS_UNICODE(x);
2254 else {
2255 /* wrong return value */
2256 PyErr_SetString(PyExc_TypeError,
2257 "translate mapping must return integer, None or unicode");
2258 Py_DECREF(x);
2259 goto onError;
2261 Py_DECREF(x);
2263 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2264 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2265 goto onError;
2267 done:
2268 return (PyObject *)v;
2270 onError:
2271 Py_XDECREF(v);
2272 return NULL;
2275 PyObject *PyUnicode_Translate(PyObject *str,
2276 PyObject *mapping,
2277 const char *errors)
2279 PyObject *result;
2281 str = PyUnicode_FromObject(str);
2282 if (str == NULL)
2283 goto onError;
2284 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2285 PyUnicode_GET_SIZE(str),
2286 mapping,
2287 errors);
2288 Py_DECREF(str);
2289 return result;
2291 onError:
2292 Py_XDECREF(str);
2293 return NULL;
2296 /* --- Decimal Encoder ---------------------------------------------------- */
2298 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2299 int length,
2300 char *output,
2301 const char *errors)
2303 Py_UNICODE *p, *end;
2305 if (output == NULL) {
2306 PyErr_BadArgument();
2307 return -1;
2310 p = s;
2311 end = s + length;
2312 while (p < end) {
2313 register Py_UNICODE ch = *p++;
2314 int decimal;
2316 if (Py_UNICODE_ISSPACE(ch)) {
2317 *output++ = ' ';
2318 continue;
2320 decimal = Py_UNICODE_TODECIMAL(ch);
2321 if (decimal >= 0) {
2322 *output++ = '0' + decimal;
2323 continue;
2325 if (0 < ch && ch < 256) {
2326 *output++ = (char)ch;
2327 continue;
2329 /* All other characters are considered invalid */
2330 if (errors == NULL || strcmp(errors, "strict") == 0) {
2331 PyErr_SetString(PyExc_ValueError,
2332 "invalid decimal Unicode string");
2333 goto onError;
2335 else if (strcmp(errors, "ignore") == 0)
2336 continue;
2337 else if (strcmp(errors, "replace") == 0) {
2338 *output++ = '?';
2339 continue;
2342 /* 0-terminate the output string */
2343 *output++ = '\0';
2344 return 0;
2346 onError:
2347 return -1;
2350 /* --- Helpers ------------------------------------------------------------ */
2352 static
2353 int count(PyUnicodeObject *self,
2354 int start,
2355 int end,
2356 PyUnicodeObject *substring)
2358 int count = 0;
2360 if (substring->length == 0)
2361 return (end - start + 1);
2363 end -= substring->length;
2365 while (start <= end)
2366 if (Py_UNICODE_MATCH(self, start, substring)) {
2367 count++;
2368 start += substring->length;
2369 } else
2370 start++;
2372 return count;
2375 int PyUnicode_Count(PyObject *str,
2376 PyObject *substr,
2377 int start,
2378 int end)
2380 int result;
2382 str = PyUnicode_FromObject(str);
2383 if (str == NULL)
2384 return -1;
2385 substr = PyUnicode_FromObject(substr);
2386 if (substr == NULL) {
2387 Py_DECREF(str);
2388 return -1;
2391 result = count((PyUnicodeObject *)str,
2392 start, end,
2393 (PyUnicodeObject *)substr);
2395 Py_DECREF(str);
2396 Py_DECREF(substr);
2397 return result;
2400 static
2401 int findstring(PyUnicodeObject *self,
2402 PyUnicodeObject *substring,
2403 int start,
2404 int end,
2405 int direction)
2407 if (start < 0)
2408 start += self->length;
2409 if (start < 0)
2410 start = 0;
2412 if (substring->length == 0)
2413 return start;
2415 if (end > self->length)
2416 end = self->length;
2417 if (end < 0)
2418 end += self->length;
2419 if (end < 0)
2420 end = 0;
2422 end -= substring->length;
2424 if (direction < 0) {
2425 for (; end >= start; end--)
2426 if (Py_UNICODE_MATCH(self, end, substring))
2427 return end;
2428 } else {
2429 for (; start <= end; start++)
2430 if (Py_UNICODE_MATCH(self, start, substring))
2431 return start;
2434 return -1;
2437 int PyUnicode_Find(PyObject *str,
2438 PyObject *substr,
2439 int start,
2440 int end,
2441 int direction)
2443 int result;
2445 str = PyUnicode_FromObject(str);
2446 if (str == NULL)
2447 return -1;
2448 substr = PyUnicode_FromObject(substr);
2449 if (substr == NULL) {
2450 Py_DECREF(substr);
2451 return -1;
2454 result = findstring((PyUnicodeObject *)str,
2455 (PyUnicodeObject *)substr,
2456 start, end, direction);
2457 Py_DECREF(str);
2458 Py_DECREF(substr);
2459 return result;
2462 static
2463 int tailmatch(PyUnicodeObject *self,
2464 PyUnicodeObject *substring,
2465 int start,
2466 int end,
2467 int direction)
2469 if (start < 0)
2470 start += self->length;
2471 if (start < 0)
2472 start = 0;
2474 if (substring->length == 0)
2475 return 1;
2477 if (end > self->length)
2478 end = self->length;
2479 if (end < 0)
2480 end += self->length;
2481 if (end < 0)
2482 end = 0;
2484 end -= substring->length;
2485 if (end < start)
2486 return 0;
2488 if (direction > 0) {
2489 if (Py_UNICODE_MATCH(self, end, substring))
2490 return 1;
2491 } else {
2492 if (Py_UNICODE_MATCH(self, start, substring))
2493 return 1;
2496 return 0;
2499 int PyUnicode_Tailmatch(PyObject *str,
2500 PyObject *substr,
2501 int start,
2502 int end,
2503 int direction)
2505 int result;
2507 str = PyUnicode_FromObject(str);
2508 if (str == NULL)
2509 return -1;
2510 substr = PyUnicode_FromObject(substr);
2511 if (substr == NULL) {
2512 Py_DECREF(substr);
2513 return -1;
2516 result = tailmatch((PyUnicodeObject *)str,
2517 (PyUnicodeObject *)substr,
2518 start, end, direction);
2519 Py_DECREF(str);
2520 Py_DECREF(substr);
2521 return result;
2524 static
2525 const Py_UNICODE *findchar(const Py_UNICODE *s,
2526 int size,
2527 Py_UNICODE ch)
2529 /* like wcschr, but doesn't stop at NULL characters */
2531 while (size-- > 0) {
2532 if (*s == ch)
2533 return s;
2534 s++;
2537 return NULL;
2540 /* Apply fixfct filter to the Unicode object self and return a
2541 reference to the modified object */
2543 static
2544 PyObject *fixup(PyUnicodeObject *self,
2545 int (*fixfct)(PyUnicodeObject *s))
2548 PyUnicodeObject *u;
2550 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2551 self->length);
2552 if (u == NULL)
2553 return NULL;
2554 if (!fixfct(u)) {
2555 /* fixfct should return TRUE if it modified the buffer. If
2556 FALSE, return a reference to the original buffer instead
2557 (to save space, not time) */
2558 Py_INCREF(self);
2559 Py_DECREF(u);
2560 return (PyObject*) self;
2562 return (PyObject*) u;
2565 static
2566 int fixupper(PyUnicodeObject *self)
2568 int len = self->length;
2569 Py_UNICODE *s = self->str;
2570 int status = 0;
2572 while (len-- > 0) {
2573 register Py_UNICODE ch;
2575 ch = Py_UNICODE_TOUPPER(*s);
2576 if (ch != *s) {
2577 status = 1;
2578 *s = ch;
2580 s++;
2583 return status;
2586 static
2587 int fixlower(PyUnicodeObject *self)
2589 int len = self->length;
2590 Py_UNICODE *s = self->str;
2591 int status = 0;
2593 while (len-- > 0) {
2594 register Py_UNICODE ch;
2596 ch = Py_UNICODE_TOLOWER(*s);
2597 if (ch != *s) {
2598 status = 1;
2599 *s = ch;
2601 s++;
2604 return status;
2607 static
2608 int fixswapcase(PyUnicodeObject *self)
2610 int len = self->length;
2611 Py_UNICODE *s = self->str;
2612 int status = 0;
2614 while (len-- > 0) {
2615 if (Py_UNICODE_ISUPPER(*s)) {
2616 *s = Py_UNICODE_TOLOWER(*s);
2617 status = 1;
2618 } else if (Py_UNICODE_ISLOWER(*s)) {
2619 *s = Py_UNICODE_TOUPPER(*s);
2620 status = 1;
2622 s++;
2625 return status;
2628 static
2629 int fixcapitalize(PyUnicodeObject *self)
2631 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2632 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2633 return 1;
2635 return 0;
2638 static
2639 int fixtitle(PyUnicodeObject *self)
2641 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2642 register Py_UNICODE *e;
2643 int previous_is_cased;
2645 /* Shortcut for single character strings */
2646 if (PyUnicode_GET_SIZE(self) == 1) {
2647 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2648 if (*p != ch) {
2649 *p = ch;
2650 return 1;
2652 else
2653 return 0;
2656 e = p + PyUnicode_GET_SIZE(self);
2657 previous_is_cased = 0;
2658 for (; p < e; p++) {
2659 register const Py_UNICODE ch = *p;
2661 if (previous_is_cased)
2662 *p = Py_UNICODE_TOLOWER(ch);
2663 else
2664 *p = Py_UNICODE_TOTITLE(ch);
2666 if (Py_UNICODE_ISLOWER(ch) ||
2667 Py_UNICODE_ISUPPER(ch) ||
2668 Py_UNICODE_ISTITLE(ch))
2669 previous_is_cased = 1;
2670 else
2671 previous_is_cased = 0;
2673 return 1;
2676 PyObject *PyUnicode_Join(PyObject *separator,
2677 PyObject *seq)
2679 Py_UNICODE *sep;
2680 int seplen;
2681 PyUnicodeObject *res = NULL;
2682 int reslen = 0;
2683 Py_UNICODE *p;
2684 int seqlen = 0;
2685 int sz = 100;
2686 int i;
2688 seqlen = PySequence_Size(seq);
2689 if (seqlen < 0 && PyErr_Occurred())
2690 return NULL;
2692 if (separator == NULL) {
2693 Py_UNICODE blank = ' ';
2694 sep = &blank;
2695 seplen = 1;
2697 else {
2698 separator = PyUnicode_FromObject(separator);
2699 if (separator == NULL)
2700 return NULL;
2701 sep = PyUnicode_AS_UNICODE(separator);
2702 seplen = PyUnicode_GET_SIZE(separator);
2705 res = _PyUnicode_New(sz);
2706 if (res == NULL)
2707 goto onError;
2708 p = PyUnicode_AS_UNICODE(res);
2709 reslen = 0;
2711 for (i = 0; i < seqlen; i++) {
2712 int itemlen;
2713 PyObject *item;
2715 item = PySequence_GetItem(seq, i);
2716 if (item == NULL)
2717 goto onError;
2718 if (!PyUnicode_Check(item)) {
2719 PyObject *v;
2720 v = PyUnicode_FromObject(item);
2721 Py_DECREF(item);
2722 item = v;
2723 if (item == NULL)
2724 goto onError;
2726 itemlen = PyUnicode_GET_SIZE(item);
2727 while (reslen + itemlen + seplen >= sz) {
2728 if (_PyUnicode_Resize(res, sz*2))
2729 goto onError;
2730 sz *= 2;
2731 p = PyUnicode_AS_UNICODE(res) + reslen;
2733 if (i > 0) {
2734 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2735 p += seplen;
2736 reslen += seplen;
2738 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2739 p += itemlen;
2740 reslen += itemlen;
2741 Py_DECREF(item);
2743 if (_PyUnicode_Resize(res, reslen))
2744 goto onError;
2746 Py_XDECREF(separator);
2747 return (PyObject *)res;
2749 onError:
2750 Py_XDECREF(separator);
2751 Py_DECREF(res);
2752 return NULL;
2755 static
2756 PyUnicodeObject *pad(PyUnicodeObject *self,
2757 int left,
2758 int right,
2759 Py_UNICODE fill)
2761 PyUnicodeObject *u;
2763 if (left < 0)
2764 left = 0;
2765 if (right < 0)
2766 right = 0;
2768 if (left == 0 && right == 0) {
2769 Py_INCREF(self);
2770 return self;
2773 u = _PyUnicode_New(left + self->length + right);
2774 if (u) {
2775 if (left)
2776 Py_UNICODE_FILL(u->str, fill, left);
2777 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2778 if (right)
2779 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2782 return u;
/* Append the slice data[left:right] as a new Unicode object to list.

   The enclosing function must provide a PyObject *str scratch variable,
   a PyObject *list and an onError label; control jumps to onError if
   the object cannot be created or appended. The body is wrapped in
   do { } while (0) so that the macro behaves as a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2796 static
2797 PyObject *split_whitespace(PyUnicodeObject *self,
2798 PyObject *list,
2799 int maxcount)
2801 register int i;
2802 register int j;
2803 int len = self->length;
2804 PyObject *str;
2806 for (i = j = 0; i < len; ) {
2807 /* find a token */
2808 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2809 i++;
2810 j = i;
2811 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2812 i++;
2813 if (j < i) {
2814 if (maxcount-- <= 0)
2815 break;
2816 SPLIT_APPEND(self->str, j, i);
2817 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2818 i++;
2819 j = i;
2822 if (j < len) {
2823 SPLIT_APPEND(self->str, j, len);
2825 return list;
2827 onError:
2828 Py_DECREF(list);
2829 return NULL;
2832 PyObject *PyUnicode_Splitlines(PyObject *string,
2833 int keepends)
2835 register int i;
2836 register int j;
2837 int len;
2838 PyObject *list;
2839 PyObject *str;
2840 Py_UNICODE *data;
2842 string = PyUnicode_FromObject(string);
2843 if (string == NULL)
2844 return NULL;
2845 data = PyUnicode_AS_UNICODE(string);
2846 len = PyUnicode_GET_SIZE(string);
2848 list = PyList_New(0);
2849 if (!list)
2850 goto onError;
2852 for (i = j = 0; i < len; ) {
2853 int eol;
2855 /* Find a line and append it */
2856 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2857 i++;
2859 /* Skip the line break reading CRLF as one line break */
2860 eol = i;
2861 if (i < len) {
2862 if (data[i] == '\r' && i + 1 < len &&
2863 data[i+1] == '\n')
2864 i += 2;
2865 else
2866 i++;
2867 if (keepends)
2868 eol = i;
2870 SPLIT_APPEND(data, j, eol);
2871 j = i;
2873 if (j < len) {
2874 SPLIT_APPEND(data, j, len);
2877 Py_DECREF(string);
2878 return list;
2880 onError:
2881 Py_DECREF(list);
2882 Py_DECREF(string);
2883 return NULL;
2886 static
2887 PyObject *split_char(PyUnicodeObject *self,
2888 PyObject *list,
2889 Py_UNICODE ch,
2890 int maxcount)
2892 register int i;
2893 register int j;
2894 int len = self->length;
2895 PyObject *str;
2897 for (i = j = 0; i < len; ) {
2898 if (self->str[i] == ch) {
2899 if (maxcount-- <= 0)
2900 break;
2901 SPLIT_APPEND(self->str, j, i);
2902 i = j = i + 1;
2903 } else
2904 i++;
2906 if (j <= len) {
2907 SPLIT_APPEND(self->str, j, len);
2909 return list;
2911 onError:
2912 Py_DECREF(list);
2913 return NULL;
2916 static
2917 PyObject *split_substring(PyUnicodeObject *self,
2918 PyObject *list,
2919 PyUnicodeObject *substring,
2920 int maxcount)
2922 register int i;
2923 register int j;
2924 int len = self->length;
2925 int sublen = substring->length;
2926 PyObject *str;
2928 for (i = j = 0; i < len - sublen; ) {
2929 if (Py_UNICODE_MATCH(self, i, substring)) {
2930 if (maxcount-- <= 0)
2931 break;
2932 SPLIT_APPEND(self->str, j, i);
2933 i = j = i + sublen;
2934 } else
2935 i++;
2937 if (j <= len) {
2938 SPLIT_APPEND(self->str, j, len);
2940 return list;
2942 onError:
2943 Py_DECREF(list);
2944 return NULL;
2947 #undef SPLIT_APPEND
2949 static
2950 PyObject *split(PyUnicodeObject *self,
2951 PyUnicodeObject *substring,
2952 int maxcount)
2954 PyObject *list;
2956 if (maxcount < 0)
2957 maxcount = INT_MAX;
2959 list = PyList_New(0);
2960 if (!list)
2961 return NULL;
2963 if (substring == NULL)
2964 return split_whitespace(self,list,maxcount);
2966 else if (substring->length == 1)
2967 return split_char(self,list,substring->str[0],maxcount);
2969 else if (substring->length == 0) {
2970 Py_DECREF(list);
2971 PyErr_SetString(PyExc_ValueError, "empty separator");
2972 return NULL;
2974 else
2975 return split_substring(self,list,substring,maxcount);
2978 static
2979 PyObject *strip(PyUnicodeObject *self,
2980 int left,
2981 int right)
2983 Py_UNICODE *p = self->str;
2984 int start = 0;
2985 int end = self->length;
2987 if (left)
2988 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2989 start++;
2991 if (right)
2992 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2993 end--;
2995 if (start == 0 && end == self->length) {
2996 /* couldn't strip anything off, return original string */
2997 Py_INCREF(self);
2998 return (PyObject*) self;
3001 return (PyObject*) PyUnicode_FromUnicode(
3002 self->str + start,
3003 end - start
3007 static
3008 PyObject *replace(PyUnicodeObject *self,
3009 PyUnicodeObject *str1,
3010 PyUnicodeObject *str2,
3011 int maxcount)
3013 PyUnicodeObject *u;
3015 if (maxcount < 0)
3016 maxcount = INT_MAX;
3018 if (str1->length == 1 && str2->length == 1) {
3019 int i;
3021 /* replace characters */
3022 if (!findchar(self->str, self->length, str1->str[0])) {
3023 /* nothing to replace, return original string */
3024 Py_INCREF(self);
3025 u = self;
3026 } else {
3027 Py_UNICODE u1 = str1->str[0];
3028 Py_UNICODE u2 = str2->str[0];
3030 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3031 self->str,
3032 self->length
3034 if (u)
3035 for (i = 0; i < u->length; i++)
3036 if (u->str[i] == u1) {
3037 if (--maxcount < 0)
3038 break;
3039 u->str[i] = u2;
3043 } else {
3044 int n, i;
3045 Py_UNICODE *p;
3047 /* replace strings */
3048 n = count(self, 0, self->length, str1);
3049 if (n > maxcount)
3050 n = maxcount;
3051 if (n == 0) {
3052 /* nothing to replace, return original string */
3053 Py_INCREF(self);
3054 u = self;
3055 } else {
3056 u = _PyUnicode_New(
3057 self->length + n * (str2->length - str1->length));
3058 if (u) {
3059 i = 0;
3060 p = u->str;
3061 while (i <= self->length - str1->length)
3062 if (Py_UNICODE_MATCH(self, i, str1)) {
3063 /* replace string segment */
3064 Py_UNICODE_COPY(p, str2->str, str2->length);
3065 p += str2->length;
3066 i += str1->length;
3067 if (--n <= 0) {
3068 /* copy remaining part */
3069 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3070 break;
3072 } else
3073 *p++ = self->str[i++];
3078 return (PyObject *) u;
3081 /* --- Unicode Object Methods --------------------------------------------- */
3083 static char title__doc__[] =
3084 "S.title() -> unicode\n\
3086 Return a titlecased version of S, i.e. words start with title case\n\
3087 characters, all remaining cased characters have lower case.";
3089 static PyObject*
3090 unicode_title(PyUnicodeObject *self, PyObject *args)
3092 if (!PyArg_NoArgs(args))
3093 return NULL;
3094 return fixup(self, fixtitle);
3097 static char capitalize__doc__[] =
3098 "S.capitalize() -> unicode\n\
3100 Return a capitalized version of S, i.e. make the first character\n\
3101 have upper case.";
3103 static PyObject*
3104 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3106 if (!PyArg_NoArgs(args))
3107 return NULL;
3108 return fixup(self, fixcapitalize);
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled: split S on whitespace, capitalize each word through
   fixup(), and join the words again with PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

 done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3152 static char center__doc__[] =
3153 "S.center(width) -> unicode\n\
3155 Return S centered in a Unicode string of length width. Padding is done\n\
3156 using spaces.";
3158 static PyObject *
3159 unicode_center(PyUnicodeObject *self, PyObject *args)
3161 int marg, left;
3162 int width;
3164 if (!PyArg_ParseTuple(args, "i:center", &width))
3165 return NULL;
3167 if (self->length >= width) {
3168 Py_INCREF(self);
3169 return (PyObject*) self;
3172 marg = width - self->length;
3173 left = marg / 2 + (marg & width & 1);
3175 return (PyObject*) pad(self, left, marg - left, ' ');
3178 #if 0
3180 /* This code should go into some future Unicode collation support
3181 module. The basic comparison should compare ordinals on a naive
3182 basis (this is what Java does and thus JPython too). */
3184 /* speedy UTF-16 code point order comparison */
3185 /* gleaned from: */
3186 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3188 static short utf16Fixup[32] =
3190 0, 0, 0, 0, 0, 0, 0, 0,
3191 0, 0, 0, 0, 0, 0, 0, 0,
3192 0, 0, 0, 0, 0, 0, 0, 0,
3193 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3196 static int
3197 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3199 int len1, len2;
3201 Py_UNICODE *s1 = str1->str;
3202 Py_UNICODE *s2 = str2->str;
3204 len1 = str1->length;
3205 len2 = str2->length;
3207 while (len1 > 0 && len2 > 0) {
3208 Py_UNICODE c1, c2;
3209 long diff;
3211 c1 = *s1++;
3212 c2 = *s2++;
3213 if (c1 > (1<<11) * 26)
3214 c1 += utf16Fixup[c1>>11];
3215 if (c2 > (1<<11) * 26)
3216 c2 += utf16Fixup[c2>>11];
3218 /* now c1 and c2 are in UTF-32-compatible order */
3219 diff = (long)c1 - (long)c2;
3220 if (diff)
3221 return (diff < 0) ? -1 : (diff != 0);
3222 len1--; len2--;
3225 return (len1 < len2) ? -1 : (len1 != len2);
3228 #else
3230 static int
3231 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3233 register int len1, len2;
3235 Py_UNICODE *s1 = str1->str;
3236 Py_UNICODE *s2 = str2->str;
3238 len1 = str1->length;
3239 len2 = str2->length;
3241 while (len1 > 0 && len2 > 0) {
3242 register long diff;
3244 diff = (long)*s1++ - (long)*s2++;
3245 if (diff)
3246 return (diff < 0) ? -1 : (diff != 0);
3247 len1--; len2--;
3250 return (len1 < len2) ? -1 : (len1 != len2);
3253 #endif
3255 int PyUnicode_Compare(PyObject *left,
3256 PyObject *right)
3258 PyUnicodeObject *u = NULL, *v = NULL;
3259 int result;
3261 /* Coerce the two arguments */
3262 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3263 if (u == NULL)
3264 goto onError;
3265 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3266 if (v == NULL)
3267 goto onError;
3269 /* Shortcut for empty or interned objects */
3270 if (v == u) {
3271 Py_DECREF(u);
3272 Py_DECREF(v);
3273 return 0;
3276 result = unicode_compare(u, v);
3278 Py_DECREF(u);
3279 Py_DECREF(v);
3280 return result;
3282 onError:
3283 Py_XDECREF(u);
3284 Py_XDECREF(v);
3285 return -1;
3288 int PyUnicode_Contains(PyObject *container,
3289 PyObject *element)
3291 PyUnicodeObject *u = NULL, *v = NULL;
3292 int result;
3293 register const Py_UNICODE *p, *e;
3294 register Py_UNICODE ch;
3296 /* Coerce the two arguments */
3297 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3298 if (v == NULL) {
3299 PyErr_SetString(PyExc_TypeError,
3300 "'in <string>' requires character as left operand");
3301 goto onError;
3303 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3304 if (u == NULL) {
3305 Py_DECREF(v);
3306 goto onError;
3309 /* Check v in u */
3310 if (PyUnicode_GET_SIZE(v) != 1) {
3311 PyErr_SetString(PyExc_TypeError,
3312 "'in <string>' requires character as left operand");
3313 goto onError;
3315 ch = *PyUnicode_AS_UNICODE(v);
3316 p = PyUnicode_AS_UNICODE(u);
3317 e = p + PyUnicode_GET_SIZE(u);
3318 result = 0;
3319 while (p < e) {
3320 if (*p++ == ch) {
3321 result = 1;
3322 break;
3326 Py_DECREF(u);
3327 Py_DECREF(v);
3328 return result;
3330 onError:
3331 Py_XDECREF(u);
3332 Py_XDECREF(v);
3333 return -1;
3336 /* Concat to string or Unicode object giving a new Unicode object. */
3338 PyObject *PyUnicode_Concat(PyObject *left,
3339 PyObject *right)
3341 PyUnicodeObject *u = NULL, *v = NULL, *w;
3343 /* Coerce the two arguments */
3344 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3345 if (u == NULL)
3346 goto onError;
3347 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3348 if (v == NULL)
3349 goto onError;
3351 /* Shortcuts */
3352 if (v == unicode_empty) {
3353 Py_DECREF(v);
3354 return (PyObject *)u;
3356 if (u == unicode_empty) {
3357 Py_DECREF(u);
3358 return (PyObject *)v;
3361 /* Concat the two Unicode strings */
3362 w = _PyUnicode_New(u->length + v->length);
3363 if (w == NULL)
3364 goto onError;
3365 Py_UNICODE_COPY(w->str, u->str, u->length);
3366 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3368 Py_DECREF(u);
3369 Py_DECREF(v);
3370 return (PyObject *)w;
3372 onError:
3373 Py_XDECREF(u);
3374 Py_XDECREF(v);
3375 return NULL;
3378 static char count__doc__[] =
3379 "S.count(sub[, start[, end]]) -> int\n\
3381 Return the number of occurrences of substring sub in Unicode string\n\
3382 S[start:end]. Optional arguments start and end are\n\
3383 interpreted as in slice notation.";
3385 static PyObject *
3386 unicode_count(PyUnicodeObject *self, PyObject *args)
3388 PyUnicodeObject *substring;
3389 int start = 0;
3390 int end = INT_MAX;
3391 PyObject *result;
3393 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3394 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3395 return NULL;
3397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3398 (PyObject *)substring);
3399 if (substring == NULL)
3400 return NULL;
3402 if (start < 0)
3403 start += self->length;
3404 if (start < 0)
3405 start = 0;
3406 if (end > self->length)
3407 end = self->length;
3408 if (end < 0)
3409 end += self->length;
3410 if (end < 0)
3411 end = 0;
3413 result = PyInt_FromLong((long) count(self, start, end, substring));
3415 Py_DECREF(substring);
3416 return result;
3419 static char encode__doc__[] =
3420 "S.encode([encoding[,errors]]) -> string\n\
3422 Return an encoded string version of S. Default encoding is the current\n\
3423 default string encoding. errors may be given to set a different error\n\
3424 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3425 a ValueError. Other possible values are 'ignore' and 'replace'.";
3427 static PyObject *
3428 unicode_encode(PyUnicodeObject *self, PyObject *args)
3430 char *encoding = NULL;
3431 char *errors = NULL;
3432 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3433 return NULL;
3434 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3437 static char expandtabs__doc__[] =
3438 "S.expandtabs([tabsize]) -> unicode\n\
3440 Return a copy of S where all tab characters are expanded using spaces.\n\
3441 If tabsize is not given, a tab size of 8 characters is assumed.";
3443 static PyObject*
3444 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3446 Py_UNICODE *e;
3447 Py_UNICODE *p;
3448 Py_UNICODE *q;
3449 int i, j;
3450 PyUnicodeObject *u;
3451 int tabsize = 8;
3453 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3454 return NULL;
3456 /* First pass: determine size of output string */
3457 i = j = 0;
3458 e = self->str + self->length;
3459 for (p = self->str; p < e; p++)
3460 if (*p == '\t') {
3461 if (tabsize > 0)
3462 j += tabsize - (j % tabsize);
3464 else {
3465 j++;
3466 if (*p == '\n' || *p == '\r') {
3467 i += j;
3468 j = 0;
3472 /* Second pass: create output string and fill it */
3473 u = _PyUnicode_New(i + j);
3474 if (!u)
3475 return NULL;
3477 j = 0;
3478 q = u->str;
3480 for (p = self->str; p < e; p++)
3481 if (*p == '\t') {
3482 if (tabsize > 0) {
3483 i = tabsize - (j % tabsize);
3484 j += i;
3485 while (i--)
3486 *q++ = ' ';
3489 else {
3490 j++;
3491 *q++ = *p;
3492 if (*p == '\n' || *p == '\r')
3493 j = 0;
3496 return (PyObject*) u;
3499 static char find__doc__[] =
3500 "S.find(sub [,start [,end]]) -> int\n\
3502 Return the lowest index in S where substring sub is found,\n\
3503 such that sub is contained within s[start,end]. Optional\n\
3504 arguments start and end are interpreted as in slice notation.\n\
3506 Return -1 on failure.";
3508 static PyObject *
3509 unicode_find(PyUnicodeObject *self, PyObject *args)
3511 PyUnicodeObject *substring;
3512 int start = 0;
3513 int end = INT_MAX;
3514 PyObject *result;
3516 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3517 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3518 return NULL;
3519 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3520 (PyObject *)substring);
3521 if (substring == NULL)
3522 return NULL;
3524 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3526 Py_DECREF(substring);
3527 return result;
3530 static PyObject *
3531 unicode_getitem(PyUnicodeObject *self, int index)
3533 if (index < 0 || index >= self->length) {
3534 PyErr_SetString(PyExc_IndexError, "string index out of range");
3535 return NULL;
3538 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3541 static long
3542 unicode_hash(PyUnicodeObject *self)
3544 /* Since Unicode objects compare equal to their ASCII string
3545 counterparts, they should use the individual character values
3546 as basis for their hash value. This is needed to assure that
3547 strings and Unicode objects behave in the same way as
3548 dictionary keys. */
3550 register int len;
3551 register Py_UNICODE *p;
3552 register long x;
3554 if (self->hash != -1)
3555 return self->hash;
3556 len = PyUnicode_GET_SIZE(self);
3557 p = PyUnicode_AS_UNICODE(self);
3558 x = *p << 7;
3559 while (--len >= 0)
3560 x = (1000003*x) ^ *p++;
3561 x ^= PyUnicode_GET_SIZE(self);
3562 if (x == -1)
3563 x = -2;
3564 self->hash = x;
3565 return x;
3568 static char index__doc__[] =
3569 "S.index(sub [,start [,end]]) -> int\n\
3571 Like S.find() but raise ValueError when the substring is not found.";
3573 static PyObject *
3574 unicode_index(PyUnicodeObject *self, PyObject *args)
3576 int result;
3577 PyUnicodeObject *substring;
3578 int start = 0;
3579 int end = INT_MAX;
3581 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3582 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3583 return NULL;
3585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3586 (PyObject *)substring);
3587 if (substring == NULL)
3588 return NULL;
3590 result = findstring(self, substring, start, end, 1);
3592 Py_DECREF(substring);
3593 if (result < 0) {
3594 PyErr_SetString(PyExc_ValueError, "substring not found");
3595 return NULL;
3597 return PyInt_FromLong(result);
3600 static char islower__doc__[] =
3601 "S.islower() -> int\n\
3603 Return 1 if all cased characters in S are lowercase and there is\n\
3604 at least one cased character in S, 0 otherwise.";
3606 static PyObject*
3607 unicode_islower(PyUnicodeObject *self, PyObject *args)
3609 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3610 register const Py_UNICODE *e;
3611 int cased;
3613 if (!PyArg_NoArgs(args))
3614 return NULL;
3616 /* Shortcut for single character strings */
3617 if (PyUnicode_GET_SIZE(self) == 1)
3618 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3620 /* Special case for empty strings */
3621 if (PyString_GET_SIZE(self) == 0)
3622 return PyInt_FromLong(0);
3624 e = p + PyUnicode_GET_SIZE(self);
3625 cased = 0;
3626 for (; p < e; p++) {
3627 register const Py_UNICODE ch = *p;
3629 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3630 return PyInt_FromLong(0);
3631 else if (!cased && Py_UNICODE_ISLOWER(ch))
3632 cased = 1;
3634 return PyInt_FromLong(cased);
3637 static char isupper__doc__[] =
3638 "S.isupper() -> int\n\
3640 Return 1 if all cased characters in S are uppercase and there is\n\
3641 at least one cased character in S, 0 otherwise.";
3643 static PyObject*
3644 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3646 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3647 register const Py_UNICODE *e;
3648 int cased;
3650 if (!PyArg_NoArgs(args))
3651 return NULL;
3653 /* Shortcut for single character strings */
3654 if (PyUnicode_GET_SIZE(self) == 1)
3655 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3657 /* Special case for empty strings */
3658 if (PyString_GET_SIZE(self) == 0)
3659 return PyInt_FromLong(0);
3661 e = p + PyUnicode_GET_SIZE(self);
3662 cased = 0;
3663 for (; p < e; p++) {
3664 register const Py_UNICODE ch = *p;
3666 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3667 return PyInt_FromLong(0);
3668 else if (!cased && Py_UNICODE_ISUPPER(ch))
3669 cased = 1;
3671 return PyInt_FromLong(cased);
3674 static char istitle__doc__[] =
3675 "S.istitle() -> int\n\
3677 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3678 may only follow uncased characters and lowercase characters only cased\n\
3679 ones. Return 0 otherwise.";
3681 static PyObject*
3682 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3684 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3685 register const Py_UNICODE *e;
3686 int cased, previous_is_cased;
3688 if (!PyArg_NoArgs(args))
3689 return NULL;
3691 /* Shortcut for single character strings */
3692 if (PyUnicode_GET_SIZE(self) == 1)
3693 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3694 (Py_UNICODE_ISUPPER(*p) != 0));
3696 /* Special case for empty strings */
3697 if (PyString_GET_SIZE(self) == 0)
3698 return PyInt_FromLong(0);
3700 e = p + PyUnicode_GET_SIZE(self);
3701 cased = 0;
3702 previous_is_cased = 0;
3703 for (; p < e; p++) {
3704 register const Py_UNICODE ch = *p;
3706 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3707 if (previous_is_cased)
3708 return PyInt_FromLong(0);
3709 previous_is_cased = 1;
3710 cased = 1;
3712 else if (Py_UNICODE_ISLOWER(ch)) {
3713 if (!previous_is_cased)
3714 return PyInt_FromLong(0);
3715 previous_is_cased = 1;
3716 cased = 1;
3718 else
3719 previous_is_cased = 0;
3721 return PyInt_FromLong(cased);
3724 static char isspace__doc__[] =
3725 "S.isspace() -> int\n\
3727 Return 1 if there are only whitespace characters in S,\n\
3728 0 otherwise.";
3730 static PyObject*
3731 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3733 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3734 register const Py_UNICODE *e;
3736 if (!PyArg_NoArgs(args))
3737 return NULL;
3739 /* Shortcut for single character strings */
3740 if (PyUnicode_GET_SIZE(self) == 1 &&
3741 Py_UNICODE_ISSPACE(*p))
3742 return PyInt_FromLong(1);
3744 /* Special case for empty strings */
3745 if (PyString_GET_SIZE(self) == 0)
3746 return PyInt_FromLong(0);
3748 e = p + PyUnicode_GET_SIZE(self);
3749 for (; p < e; p++) {
3750 if (!Py_UNICODE_ISSPACE(*p))
3751 return PyInt_FromLong(0);
3753 return PyInt_FromLong(1);
3756 static char isalpha__doc__[] =
3757 "S.isalpha() -> int\n\
3759 Return 1 if all characters in S are alphabetic\n\
3760 and there is at least one character in S, 0 otherwise.";
3762 static PyObject*
3763 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3765 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3766 register const Py_UNICODE *e;
3768 if (!PyArg_NoArgs(args))
3769 return NULL;
3771 /* Shortcut for single character strings */
3772 if (PyUnicode_GET_SIZE(self) == 1 &&
3773 Py_UNICODE_ISALPHA(*p))
3774 return PyInt_FromLong(1);
3776 /* Special case for empty strings */
3777 if (PyString_GET_SIZE(self) == 0)
3778 return PyInt_FromLong(0);
3780 e = p + PyUnicode_GET_SIZE(self);
3781 for (; p < e; p++) {
3782 if (!Py_UNICODE_ISALPHA(*p))
3783 return PyInt_FromLong(0);
3785 return PyInt_FromLong(1);
3788 static char isalnum__doc__[] =
3789 "S.isalnum() -> int\n\
3791 Return 1 if all characters in S are alphanumeric\n\
3792 and there is at least one character in S, 0 otherwise.";
3794 static PyObject*
3795 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3797 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3798 register const Py_UNICODE *e;
3800 if (!PyArg_NoArgs(args))
3801 return NULL;
3803 /* Shortcut for single character strings */
3804 if (PyUnicode_GET_SIZE(self) == 1 &&
3805 Py_UNICODE_ISALNUM(*p))
3806 return PyInt_FromLong(1);
3808 /* Special case for empty strings */
3809 if (PyString_GET_SIZE(self) == 0)
3810 return PyInt_FromLong(0);
3812 e = p + PyUnicode_GET_SIZE(self);
3813 for (; p < e; p++) {
3814 if (!Py_UNICODE_ISALNUM(*p))
3815 return PyInt_FromLong(0);
3817 return PyInt_FromLong(1);
3820 static char isdecimal__doc__[] =
3821 "S.isdecimal() -> int\n\
3823 Return 1 if there are only decimal characters in S,\n\
3824 0 otherwise.";
3826 static PyObject*
3827 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3829 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3830 register const Py_UNICODE *e;
3832 if (!PyArg_NoArgs(args))
3833 return NULL;
3835 /* Shortcut for single character strings */
3836 if (PyUnicode_GET_SIZE(self) == 1 &&
3837 Py_UNICODE_ISDECIMAL(*p))
3838 return PyInt_FromLong(1);
3840 /* Special case for empty strings */
3841 if (PyString_GET_SIZE(self) == 0)
3842 return PyInt_FromLong(0);
3844 e = p + PyUnicode_GET_SIZE(self);
3845 for (; p < e; p++) {
3846 if (!Py_UNICODE_ISDECIMAL(*p))
3847 return PyInt_FromLong(0);
3849 return PyInt_FromLong(1);
3852 static char isdigit__doc__[] =
3853 "S.isdigit() -> int\n\
3855 Return 1 if there are only digit characters in S,\n\
3856 0 otherwise.";
3858 static PyObject*
3859 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862 register const Py_UNICODE *e;
3864 if (!PyArg_NoArgs(args))
3865 return NULL;
3867 /* Shortcut for single character strings */
3868 if (PyUnicode_GET_SIZE(self) == 1 &&
3869 Py_UNICODE_ISDIGIT(*p))
3870 return PyInt_FromLong(1);
3872 /* Special case for empty strings */
3873 if (PyString_GET_SIZE(self) == 0)
3874 return PyInt_FromLong(0);
3876 e = p + PyUnicode_GET_SIZE(self);
3877 for (; p < e; p++) {
3878 if (!Py_UNICODE_ISDIGIT(*p))
3879 return PyInt_FromLong(0);
3881 return PyInt_FromLong(1);
3884 static char isnumeric__doc__[] =
3885 "S.isnumeric() -> int\n\
3887 Return 1 if there are only numeric characters in S,\n\
3888 0 otherwise.";
3890 static PyObject*
3891 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894 register const Py_UNICODE *e;
3896 if (!PyArg_NoArgs(args))
3897 return NULL;
3899 /* Shortcut for single character strings */
3900 if (PyUnicode_GET_SIZE(self) == 1 &&
3901 Py_UNICODE_ISNUMERIC(*p))
3902 return PyInt_FromLong(1);
3904 /* Special case for empty strings */
3905 if (PyString_GET_SIZE(self) == 0)
3906 return PyInt_FromLong(0);
3908 e = p + PyUnicode_GET_SIZE(self);
3909 for (; p < e; p++) {
3910 if (!Py_UNICODE_ISNUMERIC(*p))
3911 return PyInt_FromLong(0);
3913 return PyInt_FromLong(1);
3916 static char join__doc__[] =
3917 "S.join(sequence) -> unicode\n\
3919 Return a string which is the concatenation of the strings in the\n\
3920 sequence. The separator between elements is S.";
3922 static PyObject*
3923 unicode_join(PyUnicodeObject *self, PyObject *args)
3925 PyObject *data;
3926 if (!PyArg_ParseTuple(args, "O:join", &data))
3927 return NULL;
3929 return PyUnicode_Join((PyObject *)self, data);
3932 static int
3933 unicode_length(PyUnicodeObject *self)
3935 return self->length;
3938 static char ljust__doc__[] =
3939 "S.ljust(width) -> unicode\n\
3941 Return S left justified in a Unicode string of length width. Padding is\n\
3942 done using spaces.";
3944 static PyObject *
3945 unicode_ljust(PyUnicodeObject *self, PyObject *args)
3947 int width;
3948 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3949 return NULL;
3951 if (self->length >= width) {
3952 Py_INCREF(self);
3953 return (PyObject*) self;
3956 return (PyObject*) pad(self, 0, width - self->length, ' ');
3959 static char lower__doc__[] =
3960 "S.lower() -> unicode\n\
3962 Return a copy of the string S converted to lowercase.";
3964 static PyObject*
3965 unicode_lower(PyUnicodeObject *self, PyObject *args)
3967 if (!PyArg_NoArgs(args))
3968 return NULL;
3969 return fixup(self, fixlower);
3972 static char lstrip__doc__[] =
3973 "S.lstrip() -> unicode\n\
3975 Return a copy of the string S with leading whitespace removed.";
3977 static PyObject *
3978 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3980 if (!PyArg_NoArgs(args))
3981 return NULL;
3982 return strip(self, 1, 0);
3985 static PyObject*
3986 unicode_repeat(PyUnicodeObject *str, int len)
3988 PyUnicodeObject *u;
3989 Py_UNICODE *p;
3990 int nchars;
3991 size_t nbytes;
3993 if (len < 0)
3994 len = 0;
3996 if (len == 1) {
3997 /* no repeat, return original string */
3998 Py_INCREF(str);
3999 return (PyObject*) str;
4002 /* ensure # of chars needed doesn't overflow int and # of bytes
4003 * needed doesn't overflow size_t
4005 nchars = len * str->length;
4006 if (len && nchars / len != str->length) {
4007 PyErr_SetString(PyExc_OverflowError,
4008 "repeated string is too long");
4009 return NULL;
4011 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4012 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4013 PyErr_SetString(PyExc_OverflowError,
4014 "repeated string is too long");
4015 return NULL;
4017 u = _PyUnicode_New(nchars);
4018 if (!u)
4019 return NULL;
4021 p = u->str;
4023 while (len-- > 0) {
4024 Py_UNICODE_COPY(p, str->str, str->length);
4025 p += str->length;
4028 return (PyObject*) u;
4031 PyObject *PyUnicode_Replace(PyObject *obj,
4032 PyObject *subobj,
4033 PyObject *replobj,
4034 int maxcount)
4036 PyObject *self;
4037 PyObject *str1;
4038 PyObject *str2;
4039 PyObject *result;
4041 self = PyUnicode_FromObject(obj);
4042 if (self == NULL)
4043 return NULL;
4044 str1 = PyUnicode_FromObject(subobj);
4045 if (str1 == NULL) {
4046 Py_DECREF(self);
4047 return NULL;
4049 str2 = PyUnicode_FromObject(replobj);
4050 if (str2 == NULL) {
4051 Py_DECREF(self);
4052 Py_DECREF(str1);
4053 return NULL;
4055 result = replace((PyUnicodeObject *)self,
4056 (PyUnicodeObject *)str1,
4057 (PyUnicodeObject *)str2,
4058 maxcount);
4059 Py_DECREF(self);
4060 Py_DECREF(str1);
4061 Py_DECREF(str2);
4062 return result;
4065 static char replace__doc__[] =
4066 "S.replace (old, new[, maxsplit]) -> unicode\n\
4068 Return a copy of S with all occurrences of substring\n\
4069 old replaced by new. If the optional argument maxsplit is\n\
4070 given, only the first maxsplit occurrences are replaced.";
4072 static PyObject*
4073 unicode_replace(PyUnicodeObject *self, PyObject *args)
4075 PyUnicodeObject *str1;
4076 PyUnicodeObject *str2;
4077 int maxcount = -1;
4078 PyObject *result;
4080 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4081 return NULL;
4082 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4083 if (str1 == NULL)
4084 return NULL;
4085 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4086 if (str2 == NULL)
4087 return NULL;
4089 result = replace(self, str1, str2, maxcount);
4091 Py_DECREF(str1);
4092 Py_DECREF(str2);
4093 return result;
4096 static
4097 PyObject *unicode_repr(PyObject *unicode)
4099 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4100 PyUnicode_GET_SIZE(unicode),
4104 static char rfind__doc__[] =
4105 "S.rfind(sub [,start [,end]]) -> int\n\
4107 Return the highest index in S where substring sub is found,\n\
4108 such that sub is contained within s[start,end]. Optional\n\
4109 arguments start and end are interpreted as in slice notation.\n\
4111 Return -1 on failure.";
4113 static PyObject *
4114 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4116 PyUnicodeObject *substring;
4117 int start = 0;
4118 int end = INT_MAX;
4119 PyObject *result;
4121 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4122 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4123 return NULL;
4124 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4125 (PyObject *)substring);
4126 if (substring == NULL)
4127 return NULL;
4129 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4131 Py_DECREF(substring);
4132 return result;
4135 static char rindex__doc__[] =
4136 "S.rindex(sub [,start [,end]]) -> int\n\
4138 Like S.rfind() but raise ValueError when the substring is not found.";
4140 static PyObject *
4141 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4143 int result;
4144 PyUnicodeObject *substring;
4145 int start = 0;
4146 int end = INT_MAX;
4148 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4149 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4150 return NULL;
4151 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4152 (PyObject *)substring);
4153 if (substring == NULL)
4154 return NULL;
4156 result = findstring(self, substring, start, end, -1);
4158 Py_DECREF(substring);
4159 if (result < 0) {
4160 PyErr_SetString(PyExc_ValueError, "substring not found");
4161 return NULL;
4163 return PyInt_FromLong(result);
4166 static char rjust__doc__[] =
4167 "S.rjust(width) -> unicode\n\
4169 Return S right justified in a Unicode string of length width. Padding is\n\
4170 done using spaces.";
4172 static PyObject *
4173 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4175 int width;
4176 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4177 return NULL;
4179 if (self->length >= width) {
4180 Py_INCREF(self);
4181 return (PyObject*) self;
4184 return (PyObject*) pad(self, width - self->length, 0, ' ');
4187 static char rstrip__doc__[] =
4188 "S.rstrip() -> unicode\n\
4190 Return a copy of the string S with trailing whitespace removed.";
4192 static PyObject *
4193 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4195 if (!PyArg_NoArgs(args))
4196 return NULL;
4197 return strip(self, 0, 1);
4200 static PyObject*
4201 unicode_slice(PyUnicodeObject *self, int start, int end)
4203 /* standard clamping */
4204 if (start < 0)
4205 start = 0;
4206 if (end < 0)
4207 end = 0;
4208 if (end > self->length)
4209 end = self->length;
4210 if (start == 0 && end == self->length) {
4211 /* full slice, return original string */
4212 Py_INCREF(self);
4213 return (PyObject*) self;
4215 if (start > end)
4216 start = end;
4217 /* copy slice */
4218 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4219 end - start);
4222 PyObject *PyUnicode_Split(PyObject *s,
4223 PyObject *sep,
4224 int maxsplit)
4226 PyObject *result;
4228 s = PyUnicode_FromObject(s);
4229 if (s == NULL)
4230 return NULL;
4231 if (sep != NULL) {
4232 sep = PyUnicode_FromObject(sep);
4233 if (sep == NULL) {
4234 Py_DECREF(s);
4235 return NULL;
4239 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4241 Py_DECREF(s);
4242 Py_XDECREF(sep);
4243 return result;
4246 static char split__doc__[] =
4247 "S.split([sep [,maxsplit]]) -> list of strings\n\
4249 Return a list of the words in S, using sep as the\n\
4250 delimiter string. If maxsplit is given, at most maxsplit\n\
4251 splits are done. If sep is not specified, any whitespace string\n\
4252 is a separator.";
4254 static PyObject*
4255 unicode_split(PyUnicodeObject *self, PyObject *args)
4257 PyObject *substring = Py_None;
4258 int maxcount = -1;
4260 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4261 return NULL;
4263 if (substring == Py_None)
4264 return split(self, NULL, maxcount);
4265 else if (PyUnicode_Check(substring))
4266 return split(self, (PyUnicodeObject *)substring, maxcount);
4267 else
4268 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4271 static char splitlines__doc__[] =
4272 "S.splitlines([keepends]]) -> list of strings\n\
4274 Return a list of the lines in S, breaking at line boundaries.\n\
4275 Line breaks are not included in the resulting list unless keepends\n\
4276 is given and true.";
4278 static PyObject*
4279 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4281 int keepends = 0;
4283 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4284 return NULL;
4286 return PyUnicode_Splitlines((PyObject *)self, keepends);
4289 static
4290 PyObject *unicode_str(PyUnicodeObject *self)
4292 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4295 static char strip__doc__[] =
4296 "S.strip() -> unicode\n\
4298 Return a copy of S with leading and trailing whitespace removed.";
4300 static PyObject *
4301 unicode_strip(PyUnicodeObject *self, PyObject *args)
4303 if (!PyArg_NoArgs(args))
4304 return NULL;
4305 return strip(self, 1, 1);
4308 static char swapcase__doc__[] =
4309 "S.swapcase() -> unicode\n\
4311 Return a copy of S with uppercase characters converted to lowercase\n\
4312 and vice versa.";
4314 static PyObject*
4315 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4317 if (!PyArg_NoArgs(args))
4318 return NULL;
4319 return fixup(self, fixswapcase);
4322 static char translate__doc__[] =
4323 "S.translate(table) -> unicode\n\
4325 Return a copy of the string S, where all characters have been mapped\n\
4326 through the given translation table, which must be a mapping of\n\
4327 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4328 are left untouched. Characters mapped to None are deleted.";
4330 static PyObject*
4331 unicode_translate(PyUnicodeObject *self, PyObject *args)
4333 PyObject *table;
4335 if (!PyArg_ParseTuple(args, "O:translate", &table))
4336 return NULL;
4337 return PyUnicode_TranslateCharmap(self->str,
4338 self->length,
4339 table,
4340 "ignore");
4343 static char upper__doc__[] =
4344 "S.upper() -> unicode\n\
4346 Return a copy of S converted to uppercase.";
4348 static PyObject*
4349 unicode_upper(PyUnicodeObject *self, PyObject *args)
4351 if (!PyArg_NoArgs(args))
4352 return NULL;
4353 return fixup(self, fixupper);
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of S.zfill(width) (currently compiled out).

   Returns S itself (new reference) when it is already at least width
   characters long.  Otherwise the string is left-padded with '0' via
   pad(); if the original first character was a sign, it is moved in
   front of the padding.  Returns NULL with an exception set when the
   argument cannot be parsed or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() result was previously dereferenced unchecked */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4402 static char startswith__doc__[] =
4403 "S.startswith(prefix[, start[, end]]) -> int\n\
4405 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4406 optional start, test S beginning at that position. With optional end, stop\n\
4407 comparing S at that position.";
4409 static PyObject *
4410 unicode_startswith(PyUnicodeObject *self,
4411 PyObject *args)
4413 PyUnicodeObject *substring;
4414 int start = 0;
4415 int end = INT_MAX;
4416 PyObject *result;
4418 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4419 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4420 return NULL;
4421 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4422 (PyObject *)substring);
4423 if (substring == NULL)
4424 return NULL;
4426 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4428 Py_DECREF(substring);
4429 return result;
4433 static char endswith__doc__[] =
4434 "S.endswith(suffix[, start[, end]]) -> int\n\
4436 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4437 optional start, test S beginning at that position. With optional end, stop\n\
4438 comparing S at that position.";
4440 static PyObject *
4441 unicode_endswith(PyUnicodeObject *self,
4442 PyObject *args)
4444 PyUnicodeObject *substring;
4445 int start = 0;
4446 int end = INT_MAX;
4447 PyObject *result;
4449 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4450 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4451 return NULL;
4452 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4453 (PyObject *)substring);
4454 if (substring == NULL)
4455 return NULL;
4457 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4459 Py_DECREF(substring);
4460 return result;
4464 static PyMethodDef unicode_methods[] = {
4466 /* Order is according to common usage: often used methods should
4467 appear first, since lookup is done sequentially. */
4469 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4470 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4471 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4472 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4473 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4474 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4475 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4476 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4477 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4478 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4479 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4480 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4481 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4482 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4483 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4484 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4485 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4486 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4487 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4488 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4489 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4490 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4491 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4492 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4493 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4494 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4495 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4496 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4497 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4498 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4499 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4500 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4501 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4502 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4503 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4504 #if 0
4505 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4506 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4507 #endif
4509 #if 0
4510 /* This one is just used for debugging the implementation. */
4511 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4512 #endif
4514 {NULL, NULL}
4517 static PyObject *
4518 unicode_getattr(PyUnicodeObject *self, char *name)
4520 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4523 static PySequenceMethods unicode_as_sequence = {
4524 (inquiry) unicode_length, /* sq_length */
4525 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4526 (intargfunc) unicode_repeat, /* sq_repeat */
4527 (intargfunc) unicode_getitem, /* sq_item */
4528 (intintargfunc) unicode_slice, /* sq_slice */
4529 0, /* sq_ass_item */
4530 0, /* sq_ass_slice */
4531 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4534 static int
4535 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4536 int index,
4537 const void **ptr)
4539 if (index != 0) {
4540 PyErr_SetString(PyExc_SystemError,
4541 "accessing non-existent unicode segment");
4542 return -1;
4544 *ptr = (void *) self->str;
4545 return PyUnicode_GET_DATA_SIZE(self);
4548 static int
4549 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4550 const void **ptr)
4552 PyErr_SetString(PyExc_TypeError,
4553 "cannot use unicode as modifyable buffer");
4554 return -1;
4557 static int
4558 unicode_buffer_getsegcount(PyUnicodeObject *self,
4559 int *lenp)
4561 if (lenp)
4562 *lenp = PyUnicode_GET_DATA_SIZE(self);
4563 return 1;
4566 static int
4567 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4568 int index,
4569 const void **ptr)
4571 PyObject *str;
4573 if (index != 0) {
4574 PyErr_SetString(PyExc_SystemError,
4575 "accessing non-existent unicode segment");
4576 return -1;
4578 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4579 if (str == NULL)
4580 return -1;
4581 *ptr = (void *) PyString_AS_STRING(str);
4582 return PyString_GET_SIZE(str);
4585 /* Helpers for PyUnicode_Format() */
4587 static PyObject *
4588 getnextarg(PyObject *args, int arglen, int *p_argidx)
4590 int argidx = *p_argidx;
4591 if (argidx < arglen) {
4592 (*p_argidx)++;
4593 if (arglen < 0)
4594 return args;
4595 else
4596 return PyTuple_GetItem(args, argidx);
4598 PyErr_SetString(PyExc_TypeError,
4599 "not enough arguments for format string");
4600 return NULL;
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(); one bit per flag character. */
#define F_LJUST (1 << 0)    /* '-' flag, or a negative '*' width */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
4609 static
4610 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4612 register int i;
4613 int len;
4614 va_list va;
4615 char *charbuffer;
4616 va_start(va, format);
4618 /* First, format the string as char array, then expand to Py_UNICODE
4619 array. */
4620 charbuffer = (char *)buffer;
4621 len = vsprintf(charbuffer, format, va);
4622 for (i = len - 1; i >= 0; i--)
4623 buffer[i] = (Py_UNICODE) charbuffer[i];
4625 va_end(va);
4626 return len;
4629 static int
4630 formatfloat(Py_UNICODE *buf,
4631 size_t buflen,
4632 int flags,
4633 int prec,
4634 int type,
4635 PyObject *v)
4637 /* fmt = '%#.' + `prec` + `type`
4638 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4639 char fmt[20];
4640 double x;
4642 x = PyFloat_AsDouble(v);
4643 if (x == -1.0 && PyErr_Occurred())
4644 return -1;
4645 if (prec < 0)
4646 prec = 6;
4647 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4648 type = 'g';
4649 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4650 /* worst case length calc to ensure no buffer overrun:
4651 fmt = %#.<prec>g
4652 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4653 for any double rep.)
4654 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4655 If prec=0 the effective precision is 1 (the leading digit is
4656 always given), therefore increase by one to 10+prec. */
4657 if (buflen <= (size_t)10 + (size_t)prec) {
4658 PyErr_SetString(PyExc_OverflowError,
4659 "formatted float is too long (precision too long?)");
4660 return -1;
4662 return usprintf(buf, fmt, x);
4665 static PyObject*
4666 formatlong(PyObject *val, int flags, int prec, int type)
4668 char *buf;
4669 int i, len;
4670 PyObject *str; /* temporary string object. */
4671 PyUnicodeObject *result;
4673 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4674 if (!str)
4675 return NULL;
4676 result = _PyUnicode_New(len);
4677 for (i = 0; i < len; i++)
4678 result->str[i] = buf[i];
4679 result->str[len] = 0;
4680 Py_DECREF(str);
4681 return (PyObject*)result;
4684 static int
4685 formatint(Py_UNICODE *buf,
4686 size_t buflen,
4687 int flags,
4688 int prec,
4689 int type,
4690 PyObject *v)
4692 /* fmt = '%#.' + `prec` + 'l' + `type`
4693 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4694 + 1 + 1 = 24*/
4695 char fmt[64]; /* plenty big enough! */
4696 long x;
4698 x = PyInt_AsLong(v);
4699 if (x == -1 && PyErr_Occurred())
4700 return -1;
4701 if (prec < 0)
4702 prec = 1;
4703 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4704 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4705 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4706 PyErr_SetString(PyExc_OverflowError,
4707 "formatted integer is too long (precision too long?)");
4708 return -1;
4710 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4711 return usprintf(buf, fmt, x);
4714 static int
4715 formatchar(Py_UNICODE *buf,
4716 size_t buflen,
4717 PyObject *v)
4719 /* presume that the buffer is at least 2 characters long */
4720 if (PyUnicode_Check(v)) {
4721 if (PyUnicode_GET_SIZE(v) != 1)
4722 goto onError;
4723 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4726 else if (PyString_Check(v)) {
4727 if (PyString_GET_SIZE(v) != 1)
4728 goto onError;
4729 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4732 else {
4733 /* Integer input truncated to a character */
4734 long x;
4735 x = PyInt_AsLong(v);
4736 if (x == -1 && PyErr_Occurred())
4737 goto onError;
4738 buf[0] = (char) x;
4740 buf[1] = '\0';
4741 return 1;
4743 onError:
4744 PyErr_SetString(PyExc_TypeError,
4745 "%c requires int or char");
4746 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the scratch buffer in which floats,
   ints and chars are formatted.  XXX This is a magic number.  Each
   formatting routine does its own bounds checking to ensure that the
   buffer does not overflow, but a better solution may be to malloc a
   buffer of appropriate size for each format.  For now, the current
   solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
4759 PyObject *PyUnicode_Format(PyObject *format,
4760 PyObject *args)
4762 Py_UNICODE *fmt, *res;
4763 int fmtcnt, rescnt, reslen, arglen, argidx;
4764 int args_owned = 0;
4765 PyUnicodeObject *result = NULL;
4766 PyObject *dict = NULL;
4767 PyObject *uformat;
4769 if (format == NULL || args == NULL) {
4770 PyErr_BadInternalCall();
4771 return NULL;
4773 uformat = PyUnicode_FromObject(format);
4774 if (uformat == NULL)
4775 return NULL;
4776 fmt = PyUnicode_AS_UNICODE(uformat);
4777 fmtcnt = PyUnicode_GET_SIZE(uformat);
4779 reslen = rescnt = fmtcnt + 100;
4780 result = _PyUnicode_New(reslen);
4781 if (result == NULL)
4782 goto onError;
4783 res = PyUnicode_AS_UNICODE(result);
4785 if (PyTuple_Check(args)) {
4786 arglen = PyTuple_Size(args);
4787 argidx = 0;
4789 else {
4790 arglen = -1;
4791 argidx = -2;
4793 if (args->ob_type->tp_as_mapping)
4794 dict = args;
4796 while (--fmtcnt >= 0) {
4797 if (*fmt != '%') {
4798 if (--rescnt < 0) {
4799 rescnt = fmtcnt + 100;
4800 reslen += rescnt;
4801 if (_PyUnicode_Resize(result, reslen) < 0)
4802 return NULL;
4803 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4804 --rescnt;
4806 *res++ = *fmt++;
4808 else {
4809 /* Got a format specifier */
4810 int flags = 0;
4811 int width = -1;
4812 int prec = -1;
4813 int size = 0;
4814 Py_UNICODE c = '\0';
4815 Py_UNICODE fill;
4816 PyObject *v = NULL;
4817 PyObject *temp = NULL;
4818 Py_UNICODE *pbuf;
4819 Py_UNICODE sign;
4820 int len;
4821 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4823 fmt++;
4824 if (*fmt == '(') {
4825 Py_UNICODE *keystart;
4826 int keylen;
4827 PyObject *key;
4828 int pcount = 1;
4830 if (dict == NULL) {
4831 PyErr_SetString(PyExc_TypeError,
4832 "format requires a mapping");
4833 goto onError;
4835 ++fmt;
4836 --fmtcnt;
4837 keystart = fmt;
4838 /* Skip over balanced parentheses */
4839 while (pcount > 0 && --fmtcnt >= 0) {
4840 if (*fmt == ')')
4841 --pcount;
4842 else if (*fmt == '(')
4843 ++pcount;
4844 fmt++;
4846 keylen = fmt - keystart - 1;
4847 if (fmtcnt < 0 || pcount > 0) {
4848 PyErr_SetString(PyExc_ValueError,
4849 "incomplete format key");
4850 goto onError;
4852 /* keys are converted to strings using UTF-8 and
4853 then looked up since Python uses strings to hold
4854 variables names etc. in its namespaces and we
4855 wouldn't want to break common idioms. */
4856 key = PyUnicode_EncodeUTF8(keystart,
4857 keylen,
4858 NULL);
4859 if (key == NULL)
4860 goto onError;
4861 if (args_owned) {
4862 Py_DECREF(args);
4863 args_owned = 0;
4865 args = PyObject_GetItem(dict, key);
4866 Py_DECREF(key);
4867 if (args == NULL) {
4868 goto onError;
4870 args_owned = 1;
4871 arglen = -1;
4872 argidx = -2;
4874 while (--fmtcnt >= 0) {
4875 switch (c = *fmt++) {
4876 case '-': flags |= F_LJUST; continue;
4877 case '+': flags |= F_SIGN; continue;
4878 case ' ': flags |= F_BLANK; continue;
4879 case '#': flags |= F_ALT; continue;
4880 case '0': flags |= F_ZERO; continue;
4882 break;
4884 if (c == '*') {
4885 v = getnextarg(args, arglen, &argidx);
4886 if (v == NULL)
4887 goto onError;
4888 if (!PyInt_Check(v)) {
4889 PyErr_SetString(PyExc_TypeError,
4890 "* wants int");
4891 goto onError;
4893 width = PyInt_AsLong(v);
4894 if (width < 0) {
4895 flags |= F_LJUST;
4896 width = -width;
4898 if (--fmtcnt >= 0)
4899 c = *fmt++;
4901 else if (c >= '0' && c <= '9') {
4902 width = c - '0';
4903 while (--fmtcnt >= 0) {
4904 c = *fmt++;
4905 if (c < '0' || c > '9')
4906 break;
4907 if ((width*10) / 10 != width) {
4908 PyErr_SetString(PyExc_ValueError,
4909 "width too big");
4910 goto onError;
4912 width = width*10 + (c - '0');
4915 if (c == '.') {
4916 prec = 0;
4917 if (--fmtcnt >= 0)
4918 c = *fmt++;
4919 if (c == '*') {
4920 v = getnextarg(args, arglen, &argidx);
4921 if (v == NULL)
4922 goto onError;
4923 if (!PyInt_Check(v)) {
4924 PyErr_SetString(PyExc_TypeError,
4925 "* wants int");
4926 goto onError;
4928 prec = PyInt_AsLong(v);
4929 if (prec < 0)
4930 prec = 0;
4931 if (--fmtcnt >= 0)
4932 c = *fmt++;
4934 else if (c >= '0' && c <= '9') {
4935 prec = c - '0';
4936 while (--fmtcnt >= 0) {
4937 c = Py_CHARMASK(*fmt++);
4938 if (c < '0' || c > '9')
4939 break;
4940 if ((prec*10) / 10 != prec) {
4941 PyErr_SetString(PyExc_ValueError,
4942 "prec too big");
4943 goto onError;
4945 prec = prec*10 + (c - '0');
4948 } /* prec */
4949 if (fmtcnt >= 0) {
4950 if (c == 'h' || c == 'l' || c == 'L') {
4951 size = c;
4952 if (--fmtcnt >= 0)
4953 c = *fmt++;
4956 if (fmtcnt < 0) {
4957 PyErr_SetString(PyExc_ValueError,
4958 "incomplete format");
4959 goto onError;
4961 if (c != '%') {
4962 v = getnextarg(args, arglen, &argidx);
4963 if (v == NULL)
4964 goto onError;
4966 sign = 0;
4967 fill = ' ';
4968 switch (c) {
4970 case '%':
4971 pbuf = formatbuf;
4972 /* presume that buffer length is at least 1 */
4973 pbuf[0] = '%';
4974 len = 1;
4975 break;
4977 case 's':
4978 case 'r':
4979 if (PyUnicode_Check(v) && c == 's') {
4980 temp = v;
4981 Py_INCREF(temp);
4983 else {
4984 PyObject *unicode;
4985 if (c == 's')
4986 temp = PyObject_Str(v);
4987 else
4988 temp = PyObject_Repr(v);
4989 if (temp == NULL)
4990 goto onError;
4991 if (!PyString_Check(temp)) {
4992 /* XXX Note: this should never happen, since
4993 PyObject_Repr() and PyObject_Str() assure
4994 this */
4995 Py_DECREF(temp);
4996 PyErr_SetString(PyExc_TypeError,
4997 "%s argument has non-string str()");
4998 goto onError;
5000 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5001 PyString_GET_SIZE(temp),
5002 NULL,
5003 "strict");
5004 Py_DECREF(temp);
5005 temp = unicode;
5006 if (temp == NULL)
5007 goto onError;
5009 pbuf = PyUnicode_AS_UNICODE(temp);
5010 len = PyUnicode_GET_SIZE(temp);
5011 if (prec >= 0 && len > prec)
5012 len = prec;
5013 break;
5015 case 'i':
5016 case 'd':
5017 case 'u':
5018 case 'o':
5019 case 'x':
5020 case 'X':
5021 if (c == 'i')
5022 c = 'd';
5023 if (PyLong_Check(v) && PyLong_AsLong(v) == -1
5024 && PyErr_Occurred()) {
5025 PyErr_Clear();
5026 temp = formatlong(v, flags, prec, c);
5027 if (!temp)
5028 goto onError;
5029 pbuf = PyUnicode_AS_UNICODE(temp);
5030 len = PyUnicode_GET_SIZE(temp);
5031 /* unbounded ints can always produce
5032 a sign character! */
5033 sign = 1;
5035 else {
5036 pbuf = formatbuf;
5037 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5038 flags, prec, c, v);
5039 if (len < 0)
5040 goto onError;
5041 /* only d conversion is signed */
5042 sign = c == 'd';
5044 if (flags & F_ZERO)
5045 fill = '0';
5046 break;
5048 case 'e':
5049 case 'E':
5050 case 'f':
5051 case 'g':
5052 case 'G':
5053 pbuf = formatbuf;
5054 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5055 flags, prec, c, v);
5056 if (len < 0)
5057 goto onError;
5058 sign = 1;
5059 if (flags & F_ZERO)
5060 fill = '0';
5061 break;
5063 case 'c':
5064 pbuf = formatbuf;
5065 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5066 if (len < 0)
5067 goto onError;
5068 break;
5070 default:
5071 PyErr_Format(PyExc_ValueError,
5072 "unsupported format character '%c' (0x%x)",
5073 c, c);
5074 goto onError;
5076 if (sign) {
5077 if (*pbuf == '-' || *pbuf == '+') {
5078 sign = *pbuf++;
5079 len--;
5081 else if (flags & F_SIGN)
5082 sign = '+';
5083 else if (flags & F_BLANK)
5084 sign = ' ';
5085 else
5086 sign = 0;
5088 if (width < len)
5089 width = len;
5090 if (rescnt < width + (sign != 0)) {
5091 reslen -= rescnt;
5092 rescnt = width + fmtcnt + 100;
5093 reslen += rescnt;
5094 if (_PyUnicode_Resize(result, reslen) < 0)
5095 return NULL;
5096 res = PyUnicode_AS_UNICODE(result)
5097 + reslen - rescnt;
5099 if (sign) {
5100 if (fill != ' ')
5101 *res++ = sign;
5102 rescnt--;
5103 if (width > len)
5104 width--;
5106 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5107 assert(pbuf[0] == '0');
5108 assert(pbuf[1] == c);
5109 if (fill != ' ') {
5110 *res++ = *pbuf++;
5111 *res++ = *pbuf++;
5113 rescnt -= 2;
5114 width -= 2;
5115 if (width < 0)
5116 width = 0;
5117 len -= 2;
5119 if (width > len && !(flags & F_LJUST)) {
5120 do {
5121 --rescnt;
5122 *res++ = fill;
5123 } while (--width > len);
5125 if (fill == ' ') {
5126 if (sign)
5127 *res++ = sign;
5128 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5129 assert(pbuf[0] == '0');
5130 assert(pbuf[1] == c);
5131 *res++ = *pbuf++;
5132 *res++ = *pbuf++;
5135 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5136 res += len;
5137 rescnt -= len;
5138 while (--width >= len) {
5139 --rescnt;
5140 *res++ = ' ';
5142 if (dict && (argidx < arglen) && c != '%') {
5143 PyErr_SetString(PyExc_TypeError,
5144 "not all arguments converted");
5145 goto onError;
5147 Py_XDECREF(temp);
5148 } /* '%' */
5149 } /* until end */
5150 if (argidx < arglen && !dict) {
5151 PyErr_SetString(PyExc_TypeError,
5152 "not all arguments converted");
5153 goto onError;
5156 if (args_owned) {
5157 Py_DECREF(args);
5159 Py_DECREF(uformat);
5160 if (_PyUnicode_Resize(result, reslen - rescnt))
5161 goto onError;
5162 return (PyObject *)result;
5164 onError:
5165 Py_XDECREF(result);
5166 Py_DECREF(uformat);
5167 if (args_owned) {
5168 Py_DECREF(args);
5170 return NULL;
5173 static PyBufferProcs unicode_as_buffer = {
5174 (getreadbufferproc) unicode_buffer_getreadbuf,
5175 (getwritebufferproc) unicode_buffer_getwritebuf,
5176 (getsegcountproc) unicode_buffer_getsegcount,
5177 (getcharbufferproc) unicode_buffer_getcharbuf,
5180 PyTypeObject PyUnicode_Type = {
5181 PyObject_HEAD_INIT(&PyType_Type)
5182 0, /* ob_size */
5183 "unicode", /* tp_name */
5184 sizeof(PyUnicodeObject), /* tp_size */
5185 0, /* tp_itemsize */
5186 /* Slots */
5187 (destructor)_PyUnicode_Free, /* tp_dealloc */
5188 0, /* tp_print */
5189 (getattrfunc)unicode_getattr, /* tp_getattr */
5190 0, /* tp_setattr */
5191 (cmpfunc) unicode_compare, /* tp_compare */
5192 (reprfunc) unicode_repr, /* tp_repr */
5193 0, /* tp_as_number */
5194 &unicode_as_sequence, /* tp_as_sequence */
5195 0, /* tp_as_mapping */
5196 (hashfunc) unicode_hash, /* tp_hash*/
5197 0, /* tp_call*/
5198 (reprfunc) unicode_str, /* tp_str */
5199 (getattrofunc) NULL, /* tp_getattro */
5200 (setattrofunc) NULL, /* tp_setattro */
5201 &unicode_as_buffer, /* tp_as_buffer */
5202 Py_TPFLAGS_DEFAULT, /* tp_flags */
5205 /* Initialize the Unicode implementation */
5207 void _PyUnicode_Init(void)
5209 /* Doublecheck the configuration... */
5210 if (sizeof(Py_UNICODE) != 2)
5211 Py_FatalError("Unicode configuration error: "
5212 "sizeof(Py_UNICODE) != 2 bytes");
5214 /* Init the implementation */
5215 unicode_freelist = NULL;
5216 unicode_freelist_size = 0;
5217 unicode_empty = _PyUnicode_New(0);
5218 strcpy(unicode_default_encoding, "ascii");
5221 /* Finalize the Unicode implementation */
5223 void
5224 _PyUnicode_Fini(void)
5226 PyUnicodeObject *u = unicode_freelist;
5228 while (u != NULL) {
5229 PyUnicodeObject *v = u;
5230 u = *(PyUnicodeObject **)u;
5231 if (v->str)
5232 PyMem_DEL(v->str);
5233 Py_XDECREF(v->defenc);
5234 PyObject_DEL(v);
5236 unicode_freelist = NULL;
5237 unicode_freelist_size = 0;
5238 Py_XDECREF(unicode_empty);
5239 unicode_empty = NULL;