This commit was manufactured by cvs2svn to create tag 'r22a4-fork'.
[python/dscho.git] / Objects / unicodeobject.c
blobdf8592d55e89b9ab43a0ee615ada345cfb2e4253
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WIN32
45 #include <windows.h>
46 #endif
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
75 #else
76 # define BYTEORDER_IS_LITTLE_ENDIAN
77 #endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject *unicode_freelist;
88 static int unicode_freelist_size;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject *unicode_empty;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject *unicode_latin1[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
137 PyErr_SetString(PyExc_SystemError,
138 "can't resize shared unicode objects");
139 return -1;
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
151 unicode->str[length] = 0;
152 unicode->length = length;
154 reset:
155 /* Reset the object caches */
156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
160 unicode->hash = -1;
162 return 0;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
173 static
174 PyUnicodeObject *_PyUnicode_New(int length)
176 register PyUnicodeObject *unicode;
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
187 unicode_freelist = *(PyUnicodeObject **)unicode;
188 unicode_freelist_size--;
189 if (unicode->str) {
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
193 unicode_resize(unicode, length)) {
194 PyMem_DEL(unicode->str);
195 goto onError;
198 else {
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
201 PyObject_INIT(unicode, &PyUnicode_Type);
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
210 if (!unicode->str) {
211 PyErr_NoMemory();
212 goto onError;
214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
217 unicode->defenc = NULL;
218 return unicode;
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
222 PyObject_DEL(unicode);
223 return NULL;
226 static
227 void _PyUnicode_Free(register PyUnicodeObject *unicode)
229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
232 PyMem_DEL(unicode->str);
233 unicode->str = NULL;
234 unicode->length = 0;
236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
240 /* Add to free list */
241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
245 else {
246 PyMem_DEL(unicode->str);
247 Py_XDECREF(unicode->defenc);
248 PyObject_DEL(unicode);
252 int PyUnicode_Resize(PyObject **unicode,
253 int length)
255 register PyUnicodeObject *v;
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
/* File-private convenience wrapper around PyUnicode_Resize(): lets the
   caller pass the address of a PyUnicodeObject* variable without
   writing the cast.  Internal API for use in unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
294 PyUnicodeObject *unicode;
296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
312 if (!unicode)
313 return NULL;
314 unicode->str[0] = *u;
315 unicode_latin1[*u] = unicode;
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
328 Py_UNICODE_COPY(unicode->str, u, size);
330 return (PyObject *)unicode;
333 #ifdef HAVE_WCHAR_H
335 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
338 PyUnicodeObject *unicode;
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
349 /* Copy the wchar_t data into the new object */
350 #ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352 #else
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
360 #endif
362 return (PyObject *)unicode;
365 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375 #ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377 #else
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
385 #endif
387 return size;
390 #endif
392 PyObject *PyUnicode_FromObject(register PyObject *obj)
394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
397 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
401 const char *s = NULL;
402 int len;
403 int owned = 0;
404 PyObject *v;
405 int reclevel;
407 if (obj == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
412 /* Coerce object */
413 for (reclevel = 0; reclevel < 2; reclevel++) {
415 if (PyUnicode_Check(obj)) {
416 if (encoding) {
417 PyErr_SetString(PyExc_TypeError,
418 "decoding Unicode is not supported");
419 goto onError;
421 if (PyUnicode_CheckExact(obj)) {
422 Py_INCREF(obj);
423 v = obj;
425 else {
426 /* For a subclass of unicode, return a true unicode object
427 with the same string value. */
428 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
429 PyUnicode_GET_SIZE(obj));
431 goto done;
433 else if (PyString_Check(obj)) {
434 s = PyString_AS_STRING(obj);
435 len = PyString_GET_SIZE(obj);
436 break;
438 else {
439 PyObject *w;
441 /* Try char buffer interface */
442 if (PyObject_AsCharBuffer(obj, &s, &len))
443 PyErr_Clear();
444 else
445 break;
447 /* Mimic the behaviour of str(object) if everything else
448 fails (see PyObject_Str()); this also covers instances
449 which implement __str__. */
450 if (obj->ob_type->tp_str == NULL)
451 w = PyObject_Repr(obj);
452 else
453 w = (*obj->ob_type->tp_str)(obj);
454 if (w == NULL)
455 goto onError;
456 if (owned) {
457 Py_DECREF(obj);
459 obj = w;
460 owned = 1;
464 if (s == NULL) {
465 PyErr_Format(PyExc_TypeError,
466 "coercing to Unicode: __str__ recursion limit exceeded "
467 "(last type: %.80s)",
468 obj->ob_type->tp_name);
469 goto onError;
472 /* Convert to Unicode */
473 if (len == 0) {
474 Py_INCREF(unicode_empty);
475 v = (PyObject *)unicode_empty;
477 else
478 v = PyUnicode_Decode(s, len, encoding, errors);
480 done:
481 if (owned) {
482 Py_DECREF(obj);
484 return v;
486 onError:
487 if (owned) {
488 Py_DECREF(obj);
490 return NULL;
493 PyObject *PyUnicode_Decode(const char *s,
494 int size,
495 const char *encoding,
496 const char *errors)
498 PyObject *buffer = NULL, *unicode;
500 if (encoding == NULL)
501 encoding = PyUnicode_GetDefaultEncoding();
503 /* Shortcuts for common default encodings */
504 if (strcmp(encoding, "utf-8") == 0)
505 return PyUnicode_DecodeUTF8(s, size, errors);
506 else if (strcmp(encoding, "latin-1") == 0)
507 return PyUnicode_DecodeLatin1(s, size, errors);
508 else if (strcmp(encoding, "ascii") == 0)
509 return PyUnicode_DecodeASCII(s, size, errors);
511 /* Decode via the codec registry */
512 buffer = PyBuffer_FromMemory((void *)s, size);
513 if (buffer == NULL)
514 goto onError;
515 unicode = PyCodec_Decode(buffer, encoding, errors);
516 if (unicode == NULL)
517 goto onError;
518 if (!PyUnicode_Check(unicode)) {
519 PyErr_Format(PyExc_TypeError,
520 "decoder did not return an unicode object (type=%.400s)",
521 unicode->ob_type->tp_name);
522 Py_DECREF(unicode);
523 goto onError;
525 Py_DECREF(buffer);
526 return unicode;
528 onError:
529 Py_XDECREF(buffer);
530 return NULL;
533 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
534 int size,
535 const char *encoding,
536 const char *errors)
538 PyObject *v, *unicode;
540 unicode = PyUnicode_FromUnicode(s, size);
541 if (unicode == NULL)
542 return NULL;
543 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
544 Py_DECREF(unicode);
545 return v;
548 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
549 const char *encoding,
550 const char *errors)
552 PyObject *v;
554 if (!PyUnicode_Check(unicode)) {
555 PyErr_BadArgument();
556 goto onError;
559 if (encoding == NULL)
560 encoding = PyUnicode_GetDefaultEncoding();
562 /* Shortcuts for common default encodings */
563 if (errors == NULL) {
564 if (strcmp(encoding, "utf-8") == 0)
565 return PyUnicode_AsUTF8String(unicode);
566 else if (strcmp(encoding, "latin-1") == 0)
567 return PyUnicode_AsLatin1String(unicode);
568 else if (strcmp(encoding, "ascii") == 0)
569 return PyUnicode_AsASCIIString(unicode);
572 /* Encode via the codec registry */
573 v = PyCodec_Encode(unicode, encoding, errors);
574 if (v == NULL)
575 goto onError;
576 /* XXX Should we really enforce this ? */
577 if (!PyString_Check(v)) {
578 PyErr_Format(PyExc_TypeError,
579 "encoder did not return a string object (type=%.400s)",
580 v->ob_type->tp_name);
581 Py_DECREF(v);
582 goto onError;
584 return v;
586 onError:
587 return NULL;
590 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
591 const char *errors)
593 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
595 if (v)
596 return v;
597 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
598 if (v && errors == NULL)
599 ((PyUnicodeObject *)unicode)->defenc = v;
600 return v;
603 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
609 return PyUnicode_AS_UNICODE(unicode);
611 onError:
612 return NULL;
615 int PyUnicode_GetSize(PyObject *unicode)
617 if (!PyUnicode_Check(unicode)) {
618 PyErr_BadArgument();
619 goto onError;
621 return PyUnicode_GET_SIZE(unicode);
623 onError:
624 return -1;
627 const char *PyUnicode_GetDefaultEncoding(void)
629 return unicode_default_encoding;
632 int PyUnicode_SetDefaultEncoding(const char *encoding)
634 PyObject *v;
636 /* Make sure the encoding is valid. As side effect, this also
637 loads the encoding into the codec registry cache. */
638 v = _PyCodec_Lookup(encoding);
639 if (v == NULL)
640 goto onError;
641 Py_DECREF(v);
642 strncpy(unicode_default_encoding,
643 encoding,
644 sizeof(unicode_default_encoding));
645 return 0;
647 onError:
648 return -1;
651 /* --- UTF-7 Codec -------------------------------------------------------- */
653 /* see RFC2152 for details */
/* Classification of the 128 ASCII characters for the UTF-7 codec:
   whether a character is special, i.e. cannot be directly encoded.

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True when character c has to be base64 encoded.  encodeO and
   encodeWS select whether Set O characters and whitespace are treated
   as special as well. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

/* Map a 6-bit value to its base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a character of the base64 alphabet.  The range test
   comes first because isalnum() is only defined for values
   representable as unsigned char (and EOF), while callers pass
   Py_UNICODE values that may be far larger. */
#define B64CHAR(c) \
        (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
         (c) == '+' || (c) == '/')

/* Map a base64 character back to its 6-bit value. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for as long as at least 6 bits are
   pending in ch; bits is decremented accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least 16 bits are
   pending in ch.  Relies on a local `errmsg` variable and a
   `utf7Error` label in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
709 static
710 int utf7_decoding_error(Py_UNICODE **dest,
711 const char *errors,
712 const char *details)
714 if ((errors == NULL) ||
715 (strcmp(errors,"strict") == 0)) {
716 PyErr_Format(PyExc_UnicodeError,
717 "UTF-7 decoding error: %.400s",
718 details);
719 return -1;
721 else if (strcmp(errors,"ignore") == 0) {
722 return 0;
724 else if (strcmp(errors,"replace") == 0) {
725 if (dest != NULL) {
726 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
727 (*dest)++;
729 return 0;
731 else {
732 PyErr_Format(PyExc_ValueError,
733 "UTF-7 decoding error; unknown error handling code: %.400s",
734 errors);
735 return -1;
739 PyObject *PyUnicode_DecodeUTF7(const char *s,
740 int size,
741 const char *errors)
743 const char *e;
744 PyUnicodeObject *unicode;
745 Py_UNICODE *p;
746 const char *errmsg = "";
747 int inShift = 0;
748 unsigned int bitsleft = 0;
749 unsigned long charsleft = 0;
750 int surrogate = 0;
752 unicode = _PyUnicode_New(size);
753 if (!unicode)
754 return NULL;
755 if (size == 0)
756 return (PyObject *)unicode;
758 p = unicode->str;
759 e = s + size;
761 while (s < e) {
762 Py_UNICODE ch = *s;
764 if (inShift) {
765 if ((ch == '-') || !B64CHAR(ch)) {
766 inShift = 0;
767 s++;
769 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
770 if (bitsleft >= 6) {
771 /* The shift sequence has a partial character in it. If
772 bitsleft < 6 then we could just classify it as padding
773 but that is not the case here */
775 errmsg = "partial character in shift sequence";
776 goto utf7Error;
778 /* According to RFC2152 the remaining bits should be zero. We
779 choose to signal an error/insert a replacement character
780 here so indicate the potential of a misencoded character. */
782 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
783 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
784 errmsg = "non-zero padding bits in shift sequence";
785 goto utf7Error;
788 if (ch == '-') {
789 if ((s < e) && (*(s) == '-')) {
790 *p++ = '-';
791 inShift = 1;
793 } else if (SPECIAL(ch,0,0)) {
794 errmsg = "unexpected special character";
795 goto utf7Error;
796 } else {
797 *p++ = ch;
799 } else {
800 charsleft = (charsleft << 6) | UB64(ch);
801 bitsleft += 6;
802 s++;
803 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
806 else if ( ch == '+' ) {
807 s++;
808 if (s < e && *s == '-') {
809 s++;
810 *p++ = '+';
811 } else
813 inShift = 1;
814 bitsleft = 0;
817 else if (SPECIAL(ch,0,0)) {
818 errmsg = "unexpected special character";
819 s++;
820 goto utf7Error;
822 else {
823 *p++ = ch;
824 s++;
826 continue;
827 utf7Error:
828 if (utf7_decoding_error(&p, errors, errmsg))
829 goto onError;
832 if (inShift) {
833 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
834 goto onError;
837 if (_PyUnicode_Resize(&unicode, p - unicode->str))
838 goto onError;
840 return (PyObject *)unicode;
842 onError:
843 Py_DECREF(unicode);
844 return NULL;
848 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
849 int size,
850 int encodeSetO,
851 int encodeWhiteSpace,
852 const char *errors)
854 PyObject *v;
855 /* It might be possible to tighten this worst case */
856 unsigned int cbAllocated = 5 * size;
857 int inShift = 0;
858 int i = 0;
859 unsigned int bitsleft = 0;
860 unsigned long charsleft = 0;
861 char * out;
862 char * start;
864 if (size == 0)
865 return PyString_FromStringAndSize(NULL, 0);
867 v = PyString_FromStringAndSize(NULL, cbAllocated);
868 if (v == NULL)
869 return NULL;
871 start = out = PyString_AS_STRING(v);
872 for (;i < size; ++i) {
873 Py_UNICODE ch = s[i];
875 if (!inShift) {
876 if (ch == '+') {
877 *out++ = '+';
878 *out++ = '-';
879 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
880 charsleft = ch;
881 bitsleft = 16;
882 *out++ = '+';
883 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
884 inShift = bitsleft > 0;
885 } else {
886 *out++ = (char) ch;
888 } else {
889 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
890 *out++ = B64(charsleft << (6-bitsleft));
891 charsleft = 0;
892 bitsleft = 0;
893 /* Characters not in the BASE64 set implicitly unshift the sequence
894 so no '-' is required, except if the character is itself a '-' */
895 if (B64CHAR(ch) || ch == '-') {
896 *out++ = '-';
898 inShift = 0;
899 *out++ = (char) ch;
900 } else {
901 bitsleft += 16;
902 charsleft = (charsleft << 16) | ch;
903 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
905 /* If the next character is special then we dont' need to terminate
906 the shift sequence. If the next character is not a BASE64 character
907 or '-' then the shift sequence will be terminated implicitly and we
908 don't have to insert a '-'. */
910 if (bitsleft == 0) {
911 if (i + 1 < size) {
912 Py_UNICODE ch2 = s[i+1];
914 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
916 } else if (B64CHAR(ch2) || ch2 == '-') {
917 *out++ = '-';
918 inShift = 0;
919 } else {
920 inShift = 0;
924 else {
925 *out++ = '-';
926 inShift = 0;
932 if (bitsleft) {
933 *out++= B64(charsleft << (6-bitsleft) );
934 *out++ = '-';
937 if (_PyString_Resize(&v, out - start)) {
938 Py_DECREF(v);
939 return NULL;
941 return v;
/* The UTF-7 helper macros are private to the codec above; retire them
   so they cannot leak into the rest of the file. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
951 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   introduces.  Zero means illegal prefix; see RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
975 static
976 int utf8_decoding_error(const char **source,
977 Py_UNICODE **dest,
978 const char *errors,
979 const char *details)
981 if ((errors == NULL) ||
982 (strcmp(errors,"strict") == 0)) {
983 PyErr_Format(PyExc_UnicodeError,
984 "UTF-8 decoding error: %.400s",
985 details);
986 return -1;
988 else if (strcmp(errors,"ignore") == 0) {
989 (*source)++;
990 return 0;
992 else if (strcmp(errors,"replace") == 0) {
993 (*source)++;
994 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
995 (*dest)++;
996 return 0;
998 else {
999 PyErr_Format(PyExc_ValueError,
1000 "UTF-8 decoding error; unknown error handling code: %.400s",
1001 errors);
1002 return -1;
1006 PyObject *PyUnicode_DecodeUTF8(const char *s,
1007 int size,
1008 const char *errors)
1010 int n;
1011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
1014 const char *errmsg = "";
1016 /* Note: size will always be longer than the resulting Unicode
1017 character count */
1018 unicode = _PyUnicode_New(size);
1019 if (!unicode)
1020 return NULL;
1021 if (size == 0)
1022 return (PyObject *)unicode;
1024 /* Unpack UTF-8 encoded data */
1025 p = unicode->str;
1026 e = s + size;
1028 while (s < e) {
1029 Py_UCS4 ch = (unsigned char)*s;
1031 if (ch < 0x80) {
1032 *p++ = (Py_UNICODE)ch;
1033 s++;
1034 continue;
1037 n = utf8_code_length[ch];
1039 if (s + n > e) {
1040 errmsg = "unexpected end of data";
1041 goto utf8Error;
1044 switch (n) {
1046 case 0:
1047 errmsg = "unexpected code byte";
1048 goto utf8Error;
1050 case 1:
1051 errmsg = "internal error";
1052 goto utf8Error;
1054 case 2:
1055 if ((s[1] & 0xc0) != 0x80) {
1056 errmsg = "invalid data";
1057 goto utf8Error;
1059 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1060 if (ch < 0x80) {
1061 errmsg = "illegal encoding";
1062 goto utf8Error;
1064 else
1065 *p++ = (Py_UNICODE)ch;
1066 break;
1068 case 3:
1069 if ((s[1] & 0xc0) != 0x80 ||
1070 (s[2] & 0xc0) != 0x80) {
1071 errmsg = "invalid data";
1072 goto utf8Error;
1074 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1075 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1076 errmsg = "illegal encoding";
1077 goto utf8Error;
1079 else
1080 *p++ = (Py_UNICODE)ch;
1081 break;
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
1086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
1093 if ((ch < 0x10000) /* minimum value allowed for 4
1094 byte encoding */
1095 || (ch > 0x10ffff)) /* maximum value allowed for
1096 UTF-16 */
1098 errmsg = "illegal encoding";
1099 goto utf8Error;
1101 #ifdef Py_UNICODE_WIDE
1102 *p++ = (Py_UNICODE)ch;
1103 #else
1104 /* compute and append the two surrogates: */
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1112 /* low surrogate = bottom 10 bits added to DC00 */
1113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1114 #endif
1115 break;
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
1119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
1122 s += n;
1123 continue;
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
1130 /* Adjust length */
1131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1132 goto onError;
1134 return (PyObject *)unicode;
1136 onError:
1137 Py_DECREF(unicode);
1138 return NULL;
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept compiled out. */
#if 0
/* Apply the error handling scheme `errors` to a UTF-8 encoding problem:
   NULL / "strict" raise UnicodeError, "ignore" does nothing, "replace"
   stores '?' at **dest and advances *dest, anything else raises
   ValueError.  Returns 0 to continue, -1 when an exception was set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1175 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1179 PyObject *v;
1180 char *p;
1181 char *q;
1182 Py_UCS4 ch2;
1183 unsigned int cbAllocated = 3 * size;
1184 unsigned int cbWritten = 0;
1185 int i = 0;
1187 v = PyString_FromStringAndSize(NULL, cbAllocated);
1188 if (v == NULL)
1189 return NULL;
1190 if (size == 0)
1191 return v;
1193 p = q = PyString_AS_STRING(v);
1194 while (i < size) {
1195 Py_UCS4 ch = s[i++];
1196 if (ch < 0x80) {
1197 *p++ = (char) ch;
1198 cbWritten++;
1200 else if (ch < 0x0800) {
1201 *p++ = 0xc0 | (ch >> 6);
1202 *p++ = 0x80 | (ch & 0x3f);
1203 cbWritten += 2;
1205 else if (ch < 0x10000) {
1206 /* Check for high surrogate */
1207 if (0xD800 <= ch && ch <= 0xDBFF) {
1208 if (i != size) {
1209 ch2 = s[i];
1210 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1212 if (cbWritten >= (cbAllocated - 4)) {
1213 /* Provide enough room for some more
1214 surrogates */
1215 cbAllocated += 4*10;
1216 if (_PyString_Resize(&v, cbAllocated))
1217 goto onError;
1220 /* combine the two values */
1221 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1223 *p++ = (char)((ch >> 18) | 0xf0);
1224 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1225 i++;
1226 cbWritten += 4;
1230 else {
1231 *p++ = (char)(0xe0 | (ch >> 12));
1232 cbWritten += 3;
1234 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1235 *p++ = (char)(0x80 | (ch & 0x3f));
1236 } else {
1237 *p++ = 0xf0 | (ch>>18);
1238 *p++ = 0x80 | ((ch>>12) & 0x3f);
1239 *p++ = 0x80 | ((ch>>6) & 0x3f);
1240 *p++ = 0x80 | (ch & 0x3f);
1241 cbWritten += 4;
1244 *p = '\0';
1245 if (_PyString_Resize(&v, p - q))
1246 goto onError;
1247 return v;
1249 onError:
1250 Py_DECREF(v);
1251 return NULL;
1254 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 return NULL;
1260 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1261 PyUnicode_GET_SIZE(unicode),
1262 NULL);
1265 /* --- UTF-16 Codec ------------------------------------------------------- */
1267 static
1268 int utf16_decoding_error(Py_UNICODE **dest,
1269 const char *errors,
1270 const char *details)
1272 if ((errors == NULL) ||
1273 (strcmp(errors,"strict") == 0)) {
1274 PyErr_Format(PyExc_UnicodeError,
1275 "UTF-16 decoding error: %.400s",
1276 details);
1277 return -1;
1279 else if (strcmp(errors,"ignore") == 0) {
1280 return 0;
1282 else if (strcmp(errors,"replace") == 0) {
1283 if (dest) {
1284 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1285 (*dest)++;
1287 return 0;
1289 else {
1290 PyErr_Format(PyExc_ValueError,
1291 "UTF-16 decoding error; "
1292 "unknown error handling code: %.400s",
1293 errors);
1294 return -1;
1298 PyObject *
1299 PyUnicode_DecodeUTF16(const char *s,
1300 int size,
1301 const char *errors,
1302 int *byteorder)
1304 PyUnicodeObject *unicode;
1305 Py_UNICODE *p;
1306 const unsigned char *q, *e;
1307 int bo = 0; /* assume native ordering by default */
1308 const char *errmsg = "";
1309 /* Offsets from q for retrieving byte pairs in the right order. */
1310 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1311 int ihi = 1, ilo = 0;
1312 #else
1313 int ihi = 0, ilo = 1;
1314 #endif
1316 /* size should be an even number */
1317 if (size & 1) {
1318 if (utf16_decoding_error(NULL, errors, "truncated data"))
1319 return NULL;
1320 --size; /* else ignore the oddball byte */
1323 /* Note: size will always be longer than the resulting Unicode
1324 character count */
1325 unicode = _PyUnicode_New(size);
1326 if (!unicode)
1327 return NULL;
1328 if (size == 0)
1329 return (PyObject *)unicode;
1331 /* Unpack UTF-16 encoded data */
1332 p = unicode->str;
1333 q = (unsigned char *)s;
1334 e = q + size;
1336 if (byteorder)
1337 bo = *byteorder;
1339 /* Check for BOM marks (U+FEFF) in the input and adjust current
1340 byte order setting accordingly. In native mode, the leading BOM
1341 mark is skipped, in all other modes, it is copied to the output
1342 stream as-is (giving a ZWNBSP character). */
1343 if (bo == 0) {
1344 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1345 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1346 if (bom == 0xFEFF) {
1347 q += 2;
1348 bo = -1;
1350 else if (bom == 0xFFFE) {
1351 q += 2;
1352 bo = 1;
1354 #else
1355 if (bom == 0xFEFF) {
1356 q += 2;
1357 bo = 1;
1359 else if (bom == 0xFFFE) {
1360 q += 2;
1361 bo = -1;
1363 #endif
1366 if (bo == -1) {
1367 /* force LE */
1368 ihi = 1;
1369 ilo = 0;
1371 else if (bo == 1) {
1372 /* force BE */
1373 ihi = 0;
1374 ilo = 1;
1377 while (q < e) {
1378 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1379 q += 2;
1381 if (ch < 0xD800 || ch > 0xDFFF) {
1382 *p++ = ch;
1383 continue;
1386 /* UTF-16 code pair: */
1387 if (q >= e) {
1388 errmsg = "unexpected end of data";
1389 goto utf16Error;
1391 if (0xD800 <= ch && ch <= 0xDBFF) {
1392 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1393 q += 2;
1394 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1395 #ifndef Py_UNICODE_WIDE
1396 *p++ = ch;
1397 *p++ = ch2;
1398 #else
1399 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1400 #endif
1401 continue;
1403 else {
1404 errmsg = "illegal UTF-16 surrogate";
1405 goto utf16Error;
1409 errmsg = "illegal encoding";
1410 /* Fall through to report the error */
1412 utf16Error:
1413 if (utf16_decoding_error(&p, errors, errmsg))
1414 goto onError;
1417 if (byteorder)
1418 *byteorder = bo;
1420 /* Adjust length */
1421 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1422 goto onError;
1424 return (PyObject *)unicode;
1426 onError:
1427 Py_DECREF(unicode);
1428 return NULL;
1431 PyObject *
1432 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1433 int size,
1434 const char *errors,
1435 int byteorder)
1437 PyObject *v;
1438 unsigned char *p;
1439 int i, pairs;
1440 /* Offsets from p for storing byte pairs in the right order. */
1441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443 #else
1444 int ihi = 0, ilo = 1;
1445 #endif
1447 #define STORECHAR(CH) \
1448 do { \
1449 p[ihi] = ((CH) >> 8) & 0xff; \
1450 p[ilo] = (CH) & 0xff; \
1451 p += 2; \
1452 } while(0)
1454 for (i = pairs = 0; i < size; i++)
1455 if (s[i] >= 0x10000)
1456 pairs++;
1457 v = PyString_FromStringAndSize(NULL,
1458 2 * (size + pairs + (byteorder == 0)));
1459 if (v == NULL)
1460 return NULL;
1462 p = (unsigned char *)PyString_AS_STRING(v);
1463 if (byteorder == 0)
1464 STORECHAR(0xFEFF);
1465 if (size == 0)
1466 return v;
1468 if (byteorder == -1) {
1469 /* force LE */
1470 ihi = 1;
1471 ilo = 0;
1473 else if (byteorder == 1) {
1474 /* force BE */
1475 ihi = 0;
1476 ilo = 1;
1479 while (size-- > 0) {
1480 Py_UNICODE ch = *s++;
1481 Py_UNICODE ch2 = 0;
1482 if (ch >= 0x10000) {
1483 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1484 ch = 0xD800 | ((ch-0x10000) >> 10);
1486 STORECHAR(ch);
1487 if (ch2)
1488 STORECHAR(ch2);
1490 return v;
1491 #undef STORECHAR
1494 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_BadArgument();
1498 return NULL;
1500 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1501 PyUnicode_GET_SIZE(unicode),
1502 NULL,
1506 /* --- Unicode Escape Codec ----------------------------------------------- */
1508 static
1509 int unicodeescape_decoding_error(const char **source,
1510 Py_UNICODE *x,
1511 const char *errors,
1512 const char *details)
1514 if ((errors == NULL) ||
1515 (strcmp(errors,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError,
1517 "Unicode-Escape decoding error: %.400s",
1518 details);
1519 return -1;
1521 else if (strcmp(errors,"ignore") == 0) {
1522 return 0;
1524 else if (strcmp(errors,"replace") == 0) {
1525 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1526 return 0;
1528 else {
1529 PyErr_Format(PyExc_ValueError,
1530 "Unicode-Escape decoding error; "
1531 "unknown error handling code: %.400s",
1532 errors);
1533 return -1;
1537 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1539 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1540 int size,
1541 const char *errors)
1543 PyUnicodeObject *v;
1544 Py_UNICODE *p, *buf;
1545 const char *end;
1546 char* message;
1547 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1549 /* Escaped strings will always be longer than the resulting
1550 Unicode string, so we start with size here and then reduce the
1551 length after conversion to the true value. */
1552 v = _PyUnicode_New(size);
1553 if (v == NULL)
1554 goto onError;
1555 if (size == 0)
1556 return (PyObject *)v;
1558 p = buf = PyUnicode_AS_UNICODE(v);
1559 end = s + size;
1561 while (s < end) {
1562 unsigned char c;
1563 Py_UNICODE x;
1564 int i, digits;
1566 /* Non-escape characters are interpreted as Unicode ordinals */
1567 if (*s != '\\') {
1568 *p++ = (unsigned char) *s++;
1569 continue;
1572 /* \ - Escapes */
1573 s++;
1574 switch (*s++) {
1576 /* \x escapes */
1577 case '\n': break;
1578 case '\\': *p++ = '\\'; break;
1579 case '\'': *p++ = '\''; break;
1580 case '\"': *p++ = '\"'; break;
1581 case 'b': *p++ = '\b'; break;
1582 case 'f': *p++ = '\014'; break; /* FF */
1583 case 't': *p++ = '\t'; break;
1584 case 'n': *p++ = '\n'; break;
1585 case 'r': *p++ = '\r'; break;
1586 case 'v': *p++ = '\013'; break; /* VT */
1587 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1589 /* \OOO (octal) escapes */
1590 case '0': case '1': case '2': case '3':
1591 case '4': case '5': case '6': case '7':
1592 x = s[-1] - '0';
1593 if ('0' <= *s && *s <= '7') {
1594 x = (x<<3) + *s++ - '0';
1595 if ('0' <= *s && *s <= '7')
1596 x = (x<<3) + *s++ - '0';
1598 *p++ = x;
1599 break;
1601 /* hex escapes */
1602 /* \xXX */
1603 case 'x':
1604 digits = 2;
1605 message = "truncated \\xXX escape";
1606 goto hexescape;
1608 /* \uXXXX */
1609 case 'u':
1610 digits = 4;
1611 message = "truncated \\uXXXX escape";
1612 goto hexescape;
1614 /* \UXXXXXXXX */
1615 case 'U':
1616 digits = 8;
1617 message = "truncated \\UXXXXXXXX escape";
1618 hexescape:
1619 chr = 0;
1620 for (i = 0; i < digits; i++) {
1621 c = (unsigned char) s[i];
1622 if (!isxdigit(c)) {
1623 if (unicodeescape_decoding_error(&s, &x, errors, message))
1624 goto onError;
1625 chr = x;
1626 i++;
1627 break;
1629 chr = (chr<<4) & ~0xF;
1630 if (c >= '0' && c <= '9')
1631 chr += c - '0';
1632 else if (c >= 'a' && c <= 'f')
1633 chr += 10 + c - 'a';
1634 else
1635 chr += 10 + c - 'A';
1637 s += i;
1638 store:
1639 /* when we get here, chr is a 32-bit unicode character */
1640 if (chr <= 0xffff)
1641 /* UCS-2 character */
1642 *p++ = (Py_UNICODE) chr;
1643 else if (chr <= 0x10ffff) {
1644 /* UCS-4 character. Either store directly, or as
1645 surrogate pair. */
1646 #ifdef Py_UNICODE_WIDE
1647 *p++ = chr;
1648 #else
1649 chr -= 0x10000L;
1650 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1651 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1652 #endif
1653 } else {
1654 if (unicodeescape_decoding_error(
1655 &s, &x, errors,
1656 "illegal Unicode character")
1658 goto onError;
1659 *p++ = x; /* store replacement character */
1661 break;
1663 /* \N{name} */
1664 case 'N':
1665 message = "malformed \\N character escape";
1666 if (ucnhash_CAPI == NULL) {
1667 /* load the unicode data module */
1668 PyObject *m, *v;
1669 m = PyImport_ImportModule("unicodedata");
1670 if (m == NULL)
1671 goto ucnhashError;
1672 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1673 Py_DECREF(m);
1674 if (v == NULL)
1675 goto ucnhashError;
1676 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1677 Py_DECREF(v);
1678 if (ucnhash_CAPI == NULL)
1679 goto ucnhashError;
1681 if (*s == '{') {
1682 const char *start = s+1;
1683 /* look for the closing brace */
1684 while (*s != '}' && s < end)
1685 s++;
1686 if (s > start && s < end && *s == '}') {
1687 /* found a name. look it up in the unicode database */
1688 message = "unknown Unicode character name";
1689 s++;
1690 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1691 goto store;
1694 if (unicodeescape_decoding_error(&s, &x, errors, message))
1695 goto onError;
1696 *p++ = x;
1697 break;
1699 default:
1700 *p++ = '\\';
1701 *p++ = (unsigned char)s[-1];
1702 break;
1705 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1706 goto onError;
1707 return (PyObject *)v;
1709 ucnhashError:
1710 PyErr_SetString(
1711 PyExc_UnicodeError,
1712 "\\N escapes not supported (can't load unicodedata module)"
1714 return NULL;
1716 onError:
1717 Py_XDECREF(v);
1718 return NULL;
1721 /* Return a Unicode-Escape string version of the Unicode object.
1723 If quotes is true, the string is enclosed in u"" or u'' quotes as
1724 appropriate.
1728 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1729 int size,
1730 Py_UNICODE ch);
1732 static
1733 PyObject *unicodeescape_string(const Py_UNICODE *s,
1734 int size,
1735 int quotes)
1737 PyObject *repr;
1738 char *p;
1740 static const char *hexdigit = "0123456789abcdef";
1742 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1743 if (repr == NULL)
1744 return NULL;
1746 p = PyString_AS_STRING(repr);
1748 if (quotes) {
1749 *p++ = 'u';
1750 *p++ = (findchar(s, size, '\'') &&
1751 !findchar(s, size, '"')) ? '"' : '\'';
1753 while (size-- > 0) {
1754 Py_UNICODE ch = *s++;
1756 /* Escape quotes */
1757 if (quotes &&
1758 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1759 *p++ = '\\';
1760 *p++ = (char) ch;
1761 continue;
1764 #ifdef Py_UNICODE_WIDE
1765 /* Map 21-bit characters to '\U00xxxxxx' */
1766 else if (ch >= 0x10000) {
1767 int offset = p - PyString_AS_STRING(repr);
1769 /* Resize the string if necessary */
1770 if (offset + 12 > PyString_GET_SIZE(repr)) {
1771 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1772 goto onError;
1773 p = PyString_AS_STRING(repr) + offset;
1776 *p++ = '\\';
1777 *p++ = 'U';
1778 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1780 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1781 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1782 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1783 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1784 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1785 *p++ = hexdigit[ch & 0x0000000F];
1786 continue;
1788 #endif
1789 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1790 else if (ch >= 0xD800 && ch < 0xDC00) {
1791 Py_UNICODE ch2;
1792 Py_UCS4 ucs;
1794 ch2 = *s++;
1795 size--;
1796 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1797 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1798 *p++ = '\\';
1799 *p++ = 'U';
1800 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1802 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1803 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1804 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1805 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1806 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1807 *p++ = hexdigit[ucs & 0x0000000F];
1808 continue;
1810 /* Fall through: isolated surrogates are copied as-is */
1811 s--;
1812 size++;
1815 /* Map 16-bit characters to '\uxxxx' */
1816 if (ch >= 256) {
1817 *p++ = '\\';
1818 *p++ = 'u';
1819 *p++ = hexdigit[(ch >> 12) & 0x000F];
1820 *p++ = hexdigit[(ch >> 8) & 0x000F];
1821 *p++ = hexdigit[(ch >> 4) & 0x000F];
1822 *p++ = hexdigit[ch & 0x000F];
1825 /* Map special whitespace to '\t', \n', '\r' */
1826 else if (ch == '\t') {
1827 *p++ = '\\';
1828 *p++ = 't';
1830 else if (ch == '\n') {
1831 *p++ = '\\';
1832 *p++ = 'n';
1834 else if (ch == '\r') {
1835 *p++ = '\\';
1836 *p++ = 'r';
1839 /* Map non-printable US ASCII to '\xhh' */
1840 else if (ch < ' ' || ch >= 128) {
1841 *p++ = '\\';
1842 *p++ = 'x';
1843 *p++ = hexdigit[(ch >> 4) & 0x000F];
1844 *p++ = hexdigit[ch & 0x000F];
1847 /* Copy everything else as-is */
1848 else
1849 *p++ = (char) ch;
1851 if (quotes)
1852 *p++ = PyString_AS_STRING(repr)[1];
1854 *p = '\0';
1855 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1856 goto onError;
1858 return repr;
1860 onError:
1861 Py_DECREF(repr);
1862 return NULL;
1865 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1866 int size)
1868 return unicodeescape_string(s, size, 0);
1871 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1873 if (!PyUnicode_Check(unicode)) {
1874 PyErr_BadArgument();
1875 return NULL;
1877 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1878 PyUnicode_GET_SIZE(unicode));
1881 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1883 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1884 int size,
1885 const char *errors)
1887 PyUnicodeObject *v;
1888 Py_UNICODE *p, *buf;
1889 const char *end;
1890 const char *bs;
1892 /* Escaped strings will always be longer than the resulting
1893 Unicode string, so we start with size here and then reduce the
1894 length after conversion to the true value. */
1895 v = _PyUnicode_New(size);
1896 if (v == NULL)
1897 goto onError;
1898 if (size == 0)
1899 return (PyObject *)v;
1900 p = buf = PyUnicode_AS_UNICODE(v);
1901 end = s + size;
1902 while (s < end) {
1903 unsigned char c;
1904 Py_UNICODE x;
1905 int i;
1907 /* Non-escape characters are interpreted as Unicode ordinals */
1908 if (*s != '\\') {
1909 *p++ = (unsigned char)*s++;
1910 continue;
1913 /* \u-escapes are only interpreted iff the number of leading
1914 backslashes if odd */
1915 bs = s;
1916 for (;s < end;) {
1917 if (*s != '\\')
1918 break;
1919 *p++ = (unsigned char)*s++;
1921 if (((s - bs) & 1) == 0 ||
1922 s >= end ||
1923 *s != 'u') {
1924 continue;
1926 p--;
1927 s++;
1929 /* \uXXXX with 4 hex digits */
1930 for (x = 0, i = 0; i < 4; i++) {
1931 c = (unsigned char)s[i];
1932 if (!isxdigit(c)) {
1933 if (unicodeescape_decoding_error(&s, &x, errors,
1934 "truncated \\uXXXX"))
1935 goto onError;
1936 i++;
1937 break;
1939 x = (x<<4) & ~0xF;
1940 if (c >= '0' && c <= '9')
1941 x += c - '0';
1942 else if (c >= 'a' && c <= 'f')
1943 x += 10 + c - 'a';
1944 else
1945 x += 10 + c - 'A';
1947 s += i;
1948 *p++ = x;
1950 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1951 goto onError;
1952 return (PyObject *)v;
1954 onError:
1955 Py_XDECREF(v);
1956 return NULL;
1959 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1960 int size)
1962 PyObject *repr;
1963 char *p;
1964 char *q;
1966 static const char *hexdigit = "0123456789abcdef";
1968 repr = PyString_FromStringAndSize(NULL, 6 * size);
1969 if (repr == NULL)
1970 return NULL;
1971 if (size == 0)
1972 return repr;
1974 p = q = PyString_AS_STRING(repr);
1975 while (size-- > 0) {
1976 Py_UNICODE ch = *s++;
1977 /* Map 16-bit characters to '\uxxxx' */
1978 if (ch >= 256) {
1979 *p++ = '\\';
1980 *p++ = 'u';
1981 *p++ = hexdigit[(ch >> 12) & 0xf];
1982 *p++ = hexdigit[(ch >> 8) & 0xf];
1983 *p++ = hexdigit[(ch >> 4) & 0xf];
1984 *p++ = hexdigit[ch & 15];
1986 /* Copy everything else as-is */
1987 else
1988 *p++ = (char) ch;
1990 *p = '\0';
1991 if (_PyString_Resize(&repr, p - q))
1992 goto onError;
1994 return repr;
1996 onError:
1997 Py_DECREF(repr);
1998 return NULL;
2001 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2003 if (!PyUnicode_Check(unicode)) {
2004 PyErr_BadArgument();
2005 return NULL;
2007 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2008 PyUnicode_GET_SIZE(unicode));
2011 /* --- Latin-1 Codec ------------------------------------------------------ */
2013 PyObject *PyUnicode_DecodeLatin1(const char *s,
2014 int size,
2015 const char *errors)
2017 PyUnicodeObject *v;
2018 Py_UNICODE *p;
2020 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2021 if (size == 1 && *(unsigned char*)s < 256) {
2022 Py_UNICODE r = *(unsigned char*)s;
2023 return PyUnicode_FromUnicode(&r, 1);
2026 v = _PyUnicode_New(size);
2027 if (v == NULL)
2028 goto onError;
2029 if (size == 0)
2030 return (PyObject *)v;
2031 p = PyUnicode_AS_UNICODE(v);
2032 while (size-- > 0)
2033 *p++ = (unsigned char)*s++;
2034 return (PyObject *)v;
2036 onError:
2037 Py_XDECREF(v);
2038 return NULL;
2041 static
2042 int latin1_encoding_error(const Py_UNICODE **source,
2043 char **dest,
2044 const char *errors,
2045 const char *details)
2047 if ((errors == NULL) ||
2048 (strcmp(errors,"strict") == 0)) {
2049 PyErr_Format(PyExc_UnicodeError,
2050 "Latin-1 encoding error: %.400s",
2051 details);
2052 return -1;
2054 else if (strcmp(errors,"ignore") == 0) {
2055 return 0;
2057 else if (strcmp(errors,"replace") == 0) {
2058 **dest = '?';
2059 (*dest)++;
2060 return 0;
2062 else {
2063 PyErr_Format(PyExc_ValueError,
2064 "Latin-1 encoding error; "
2065 "unknown error handling code: %.400s",
2066 errors);
2067 return -1;
2071 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2072 int size,
2073 const char *errors)
2075 PyObject *repr;
2076 char *s, *start;
2078 repr = PyString_FromStringAndSize(NULL, size);
2079 if (repr == NULL)
2080 return NULL;
2081 if (size == 0)
2082 return repr;
2084 s = PyString_AS_STRING(repr);
2085 start = s;
2086 while (size-- > 0) {
2087 Py_UNICODE ch = *p++;
2088 if (ch >= 256) {
2089 if (latin1_encoding_error(&p, &s, errors,
2090 "ordinal not in range(256)"))
2091 goto onError;
2093 else
2094 *s++ = (char)ch;
2096 /* Resize if error handling skipped some characters */
2097 if (s - start < PyString_GET_SIZE(repr))
2098 if (_PyString_Resize(&repr, s - start))
2099 goto onError;
2100 return repr;
2102 onError:
2103 Py_DECREF(repr);
2104 return NULL;
2107 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2109 if (!PyUnicode_Check(unicode)) {
2110 PyErr_BadArgument();
2111 return NULL;
2113 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2114 PyUnicode_GET_SIZE(unicode),
2115 NULL);
2118 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2120 static
2121 int ascii_decoding_error(const char **source,
2122 Py_UNICODE **dest,
2123 const char *errors,
2124 const char *details)
2126 if ((errors == NULL) ||
2127 (strcmp(errors,"strict") == 0)) {
2128 PyErr_Format(PyExc_UnicodeError,
2129 "ASCII decoding error: %.400s",
2130 details);
2131 return -1;
2133 else if (strcmp(errors,"ignore") == 0) {
2134 return 0;
2136 else if (strcmp(errors,"replace") == 0) {
2137 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2138 (*dest)++;
2139 return 0;
2141 else {
2142 PyErr_Format(PyExc_ValueError,
2143 "ASCII decoding error; "
2144 "unknown error handling code: %.400s",
2145 errors);
2146 return -1;
2150 PyObject *PyUnicode_DecodeASCII(const char *s,
2151 int size,
2152 const char *errors)
2154 PyUnicodeObject *v;
2155 Py_UNICODE *p;
2157 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2158 if (size == 1 && *(unsigned char*)s < 128) {
2159 Py_UNICODE r = *(unsigned char*)s;
2160 return PyUnicode_FromUnicode(&r, 1);
2163 v = _PyUnicode_New(size);
2164 if (v == NULL)
2165 goto onError;
2166 if (size == 0)
2167 return (PyObject *)v;
2168 p = PyUnicode_AS_UNICODE(v);
2169 while (size-- > 0) {
2170 register unsigned char c;
2172 c = (unsigned char)*s++;
2173 if (c < 128)
2174 *p++ = c;
2175 else if (ascii_decoding_error(&s, &p, errors,
2176 "ordinal not in range(128)"))
2177 goto onError;
2179 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2180 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2181 goto onError;
2182 return (PyObject *)v;
2184 onError:
2185 Py_XDECREF(v);
2186 return NULL;
2189 static
2190 int ascii_encoding_error(const Py_UNICODE **source,
2191 char **dest,
2192 const char *errors,
2193 const char *details)
2195 if ((errors == NULL) ||
2196 (strcmp(errors,"strict") == 0)) {
2197 PyErr_Format(PyExc_UnicodeError,
2198 "ASCII encoding error: %.400s",
2199 details);
2200 return -1;
2202 else if (strcmp(errors,"ignore") == 0) {
2203 return 0;
2205 else if (strcmp(errors,"replace") == 0) {
2206 **dest = '?';
2207 (*dest)++;
2208 return 0;
2210 else {
2211 PyErr_Format(PyExc_ValueError,
2212 "ASCII encoding error; "
2213 "unknown error handling code: %.400s",
2214 errors);
2215 return -1;
2219 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2220 int size,
2221 const char *errors)
2223 PyObject *repr;
2224 char *s, *start;
2226 repr = PyString_FromStringAndSize(NULL, size);
2227 if (repr == NULL)
2228 return NULL;
2229 if (size == 0)
2230 return repr;
2232 s = PyString_AS_STRING(repr);
2233 start = s;
2234 while (size-- > 0) {
2235 Py_UNICODE ch = *p++;
2236 if (ch >= 128) {
2237 if (ascii_encoding_error(&p, &s, errors,
2238 "ordinal not in range(128)"))
2239 goto onError;
2241 else
2242 *s++ = (char)ch;
2244 /* Resize if error handling skipped some characters */
2245 if (s - start < PyString_GET_SIZE(repr))
2246 if (_PyString_Resize(&repr, s - start))
2247 goto onError;
2248 return repr;
2250 onError:
2251 Py_DECREF(repr);
2252 return NULL;
2255 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2257 if (!PyUnicode_Check(unicode)) {
2258 PyErr_BadArgument();
2259 return NULL;
2261 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2262 PyUnicode_GET_SIZE(unicode),
2263 NULL);
2266 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2268 /* --- MBCS codecs for Windows -------------------------------------------- */
2270 PyObject *PyUnicode_DecodeMBCS(const char *s,
2271 int size,
2272 const char *errors)
2274 PyUnicodeObject *v;
2275 Py_UNICODE *p;
2277 /* First get the size of the result */
2278 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2279 if (size > 0 && usize==0)
2280 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2282 v = _PyUnicode_New(usize);
2283 if (v == NULL)
2284 return NULL;
2285 if (usize == 0)
2286 return (PyObject *)v;
2287 p = PyUnicode_AS_UNICODE(v);
2288 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2289 Py_DECREF(v);
2290 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2293 return (PyObject *)v;
2296 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2297 int size,
2298 const char *errors)
2300 PyObject *repr;
2301 char *s;
2302 DWORD mbcssize;
2304 /* If there are no characters, bail now! */
2305 if (size==0)
2306 return PyString_FromString("");
2308 /* First get the size of the result */
2309 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2310 if (mbcssize==0)
2311 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2313 repr = PyString_FromStringAndSize(NULL, mbcssize);
2314 if (repr == NULL)
2315 return NULL;
2316 if (mbcssize == 0)
2317 return repr;
2319 /* Do the conversion */
2320 s = PyString_AS_STRING(repr);
2321 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2322 Py_DECREF(repr);
2323 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2325 return repr;
2328 #endif /* MS_WIN32 */
2330 /* --- Character Mapping Codec -------------------------------------------- */
2332 static
2333 int charmap_decoding_error(const char **source,
2334 Py_UNICODE **dest,
2335 const char *errors,
2336 const char *details)
2338 if ((errors == NULL) ||
2339 (strcmp(errors,"strict") == 0)) {
2340 PyErr_Format(PyExc_UnicodeError,
2341 "charmap decoding error: %.400s",
2342 details);
2343 return -1;
2345 else if (strcmp(errors,"ignore") == 0) {
2346 return 0;
2348 else if (strcmp(errors,"replace") == 0) {
2349 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2350 (*dest)++;
2351 return 0;
2353 else {
2354 PyErr_Format(PyExc_ValueError,
2355 "charmap decoding error; "
2356 "unknown error handling code: %.400s",
2357 errors);
2358 return -1;
2362 PyObject *PyUnicode_DecodeCharmap(const char *s,
2363 int size,
2364 PyObject *mapping,
2365 const char *errors)
2367 PyUnicodeObject *v;
2368 Py_UNICODE *p;
2369 int extrachars = 0;
2371 /* Default to Latin-1 */
2372 if (mapping == NULL)
2373 return PyUnicode_DecodeLatin1(s, size, errors);
2375 v = _PyUnicode_New(size);
2376 if (v == NULL)
2377 goto onError;
2378 if (size == 0)
2379 return (PyObject *)v;
2380 p = PyUnicode_AS_UNICODE(v);
2381 while (size-- > 0) {
2382 unsigned char ch = *s++;
2383 PyObject *w, *x;
2385 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2386 w = PyInt_FromLong((long)ch);
2387 if (w == NULL)
2388 goto onError;
2389 x = PyObject_GetItem(mapping, w);
2390 Py_DECREF(w);
2391 if (x == NULL) {
2392 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2393 /* No mapping found means: mapping is undefined. */
2394 PyErr_Clear();
2395 x = Py_None;
2396 Py_INCREF(x);
2397 } else
2398 goto onError;
2401 /* Apply mapping */
2402 if (PyInt_Check(x)) {
2403 long value = PyInt_AS_LONG(x);
2404 if (value < 0 || value > 65535) {
2405 PyErr_SetString(PyExc_TypeError,
2406 "character mapping must be in range(65536)");
2407 Py_DECREF(x);
2408 goto onError;
2410 *p++ = (Py_UNICODE)value;
2412 else if (x == Py_None) {
2413 /* undefined mapping */
2414 if (charmap_decoding_error(&s, &p, errors,
2415 "character maps to <undefined>")) {
2416 Py_DECREF(x);
2417 goto onError;
2420 else if (PyUnicode_Check(x)) {
2421 int targetsize = PyUnicode_GET_SIZE(x);
2423 if (targetsize == 1)
2424 /* 1-1 mapping */
2425 *p++ = *PyUnicode_AS_UNICODE(x);
2427 else if (targetsize > 1) {
2428 /* 1-n mapping */
2429 if (targetsize > extrachars) {
2430 /* resize first */
2431 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2432 int needed = (targetsize - extrachars) + \
2433 (targetsize << 2);
2434 extrachars += needed;
2435 if (_PyUnicode_Resize(&v,
2436 PyUnicode_GET_SIZE(v) + needed)) {
2437 Py_DECREF(x);
2438 goto onError;
2440 p = PyUnicode_AS_UNICODE(v) + oldpos;
2442 Py_UNICODE_COPY(p,
2443 PyUnicode_AS_UNICODE(x),
2444 targetsize);
2445 p += targetsize;
2446 extrachars -= targetsize;
2448 /* 1-0 mapping: skip the character */
2450 else {
2451 /* wrong return value */
2452 PyErr_SetString(PyExc_TypeError,
2453 "character mapping must return integer, None or unicode");
2454 Py_DECREF(x);
2455 goto onError;
2457 Py_DECREF(x);
2459 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2460 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2461 goto onError;
2462 return (PyObject *)v;
2464 onError:
2465 Py_XDECREF(v);
2466 return NULL;
2469 static
2470 int charmap_encoding_error(const Py_UNICODE **source,
2471 char **dest,
2472 const char *errors,
2473 const char *details)
2475 if ((errors == NULL) ||
2476 (strcmp(errors,"strict") == 0)) {
2477 PyErr_Format(PyExc_UnicodeError,
2478 "charmap encoding error: %.400s",
2479 details);
2480 return -1;
2482 else if (strcmp(errors,"ignore") == 0) {
2483 return 0;
2485 else if (strcmp(errors,"replace") == 0) {
2486 **dest = '?';
2487 (*dest)++;
2488 return 0;
2490 else {
2491 PyErr_Format(PyExc_ValueError,
2492 "charmap encoding error; "
2493 "unknown error handling code: %.400s",
2494 errors);
2495 return -1;
2499 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2500 int size,
2501 PyObject *mapping,
2502 const char *errors)
2504 PyObject *v;
2505 char *s;
2506 int extrachars = 0;
2508 /* Default to Latin-1 */
2509 if (mapping == NULL)
2510 return PyUnicode_EncodeLatin1(p, size, errors);
2512 v = PyString_FromStringAndSize(NULL, size);
2513 if (v == NULL)
2514 return NULL;
2515 if (size == 0)
2516 return v;
2517 s = PyString_AS_STRING(v);
2518 while (size-- > 0) {
2519 Py_UNICODE ch = *p++;
2520 PyObject *w, *x;
2522 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2523 w = PyInt_FromLong((long)ch);
2524 if (w == NULL)
2525 goto onError;
2526 x = PyObject_GetItem(mapping, w);
2527 Py_DECREF(w);
2528 if (x == NULL) {
2529 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2530 /* No mapping found means: mapping is undefined. */
2531 PyErr_Clear();
2532 x = Py_None;
2533 Py_INCREF(x);
2534 } else
2535 goto onError;
2538 /* Apply mapping */
2539 if (PyInt_Check(x)) {
2540 long value = PyInt_AS_LONG(x);
2541 if (value < 0 || value > 255) {
2542 PyErr_SetString(PyExc_TypeError,
2543 "character mapping must be in range(256)");
2544 Py_DECREF(x);
2545 goto onError;
2547 *s++ = (char)value;
2549 else if (x == Py_None) {
2550 /* undefined mapping */
2551 if (charmap_encoding_error(&p, &s, errors,
2552 "character maps to <undefined>")) {
2553 Py_DECREF(x);
2554 goto onError;
2557 else if (PyString_Check(x)) {
2558 int targetsize = PyString_GET_SIZE(x);
2560 if (targetsize == 1)
2561 /* 1-1 mapping */
2562 *s++ = *PyString_AS_STRING(x);
2564 else if (targetsize > 1) {
2565 /* 1-n mapping */
2566 if (targetsize > extrachars) {
2567 /* resize first */
2568 int oldpos = (int)(s - PyString_AS_STRING(v));
2569 int needed = (targetsize - extrachars) + \
2570 (targetsize << 2);
2571 extrachars += needed;
2572 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2573 Py_DECREF(x);
2574 goto onError;
2576 s = PyString_AS_STRING(v) + oldpos;
2578 memcpy(s, PyString_AS_STRING(x), targetsize);
2579 s += targetsize;
2580 extrachars -= targetsize;
2582 /* 1-0 mapping: skip the character */
2584 else {
2585 /* wrong return value */
2586 PyErr_SetString(PyExc_TypeError,
2587 "character mapping must return integer, None or unicode");
2588 Py_DECREF(x);
2589 goto onError;
2591 Py_DECREF(x);
2593 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2594 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2595 goto onError;
2596 return v;
2598 onError:
2599 Py_DECREF(v);
2600 return NULL;
2603 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2604 PyObject *mapping)
2606 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2607 PyErr_BadArgument();
2608 return NULL;
2610 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2611 PyUnicode_GET_SIZE(unicode),
2612 mapping,
2613 NULL);
2616 static
2617 int translate_error(const Py_UNICODE **source,
2618 Py_UNICODE **dest,
2619 const char *errors,
2620 const char *details)
2622 if ((errors == NULL) ||
2623 (strcmp(errors,"strict") == 0)) {
2624 PyErr_Format(PyExc_UnicodeError,
2625 "translate error: %.400s",
2626 details);
2627 return -1;
2629 else if (strcmp(errors,"ignore") == 0) {
2630 return 0;
2632 else if (strcmp(errors,"replace") == 0) {
2633 **dest = '?';
2634 (*dest)++;
2635 return 0;
2637 else {
2638 PyErr_Format(PyExc_ValueError,
2639 "translate error; "
2640 "unknown error handling code: %.400s",
2641 errors);
2642 return -1;
2646 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2647 int size,
2648 PyObject *mapping,
2649 const char *errors)
2651 PyUnicodeObject *v;
2652 Py_UNICODE *p;
2654 if (mapping == NULL) {
2655 PyErr_BadArgument();
2656 return NULL;
2659 /* Output will never be longer than input */
2660 v = _PyUnicode_New(size);
2661 if (v == NULL)
2662 goto onError;
2663 if (size == 0)
2664 goto done;
2665 p = PyUnicode_AS_UNICODE(v);
2666 while (size-- > 0) {
2667 Py_UNICODE ch = *s++;
2668 PyObject *w, *x;
2670 /* Get mapping */
2671 w = PyInt_FromLong(ch);
2672 if (w == NULL)
2673 goto onError;
2674 x = PyObject_GetItem(mapping, w);
2675 Py_DECREF(w);
2676 if (x == NULL) {
2677 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2678 /* No mapping found: default to 1-1 mapping */
2679 PyErr_Clear();
2680 *p++ = ch;
2681 continue;
2683 goto onError;
2686 /* Apply mapping */
2687 if (PyInt_Check(x))
2688 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2689 else if (x == Py_None) {
2690 /* undefined mapping */
2691 if (translate_error(&s, &p, errors,
2692 "character maps to <undefined>")) {
2693 Py_DECREF(x);
2694 goto onError;
2697 else if (PyUnicode_Check(x)) {
2698 if (PyUnicode_GET_SIZE(x) != 1) {
2699 /* 1-n mapping */
2700 PyErr_SetString(PyExc_NotImplementedError,
2701 "1-n mappings are currently not implemented");
2702 Py_DECREF(x);
2703 goto onError;
2705 *p++ = *PyUnicode_AS_UNICODE(x);
2707 else {
2708 /* wrong return value */
2709 PyErr_SetString(PyExc_TypeError,
2710 "translate mapping must return integer, None or unicode");
2711 Py_DECREF(x);
2712 goto onError;
2714 Py_DECREF(x);
2716 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2717 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2718 goto onError;
2720 done:
2721 return (PyObject *)v;
2723 onError:
2724 Py_XDECREF(v);
2725 return NULL;
2728 PyObject *PyUnicode_Translate(PyObject *str,
2729 PyObject *mapping,
2730 const char *errors)
2732 PyObject *result;
2734 str = PyUnicode_FromObject(str);
2735 if (str == NULL)
2736 goto onError;
2737 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2738 PyUnicode_GET_SIZE(str),
2739 mapping,
2740 errors);
2741 Py_DECREF(str);
2742 return result;
2744 onError:
2745 Py_XDECREF(str);
2746 return NULL;
2749 /* --- Decimal Encoder ---------------------------------------------------- */
2751 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2752 int length,
2753 char *output,
2754 const char *errors)
2756 Py_UNICODE *p, *end;
2758 if (output == NULL) {
2759 PyErr_BadArgument();
2760 return -1;
2763 p = s;
2764 end = s + length;
2765 while (p < end) {
2766 register Py_UNICODE ch = *p++;
2767 int decimal;
2769 if (Py_UNICODE_ISSPACE(ch)) {
2770 *output++ = ' ';
2771 continue;
2773 decimal = Py_UNICODE_TODECIMAL(ch);
2774 if (decimal >= 0) {
2775 *output++ = '0' + decimal;
2776 continue;
2778 if (0 < ch && ch < 256) {
2779 *output++ = (char)ch;
2780 continue;
2782 /* All other characters are considered invalid */
2783 if (errors == NULL || strcmp(errors, "strict") == 0) {
2784 PyErr_SetString(PyExc_ValueError,
2785 "invalid decimal Unicode string");
2786 goto onError;
2788 else if (strcmp(errors, "ignore") == 0)
2789 continue;
2790 else if (strcmp(errors, "replace") == 0) {
2791 *output++ = '?';
2792 continue;
2795 /* 0-terminate the output string */
2796 *output++ = '\0';
2797 return 0;
2799 onError:
2800 return -1;
2803 /* --- Helpers ------------------------------------------------------------ */
2805 static
2806 int count(PyUnicodeObject *self,
2807 int start,
2808 int end,
2809 PyUnicodeObject *substring)
2811 int count = 0;
2813 if (start < 0)
2814 start += self->length;
2815 if (start < 0)
2816 start = 0;
2817 if (end > self->length)
2818 end = self->length;
2819 if (end < 0)
2820 end += self->length;
2821 if (end < 0)
2822 end = 0;
2824 if (substring->length == 0)
2825 return (end - start + 1);
2827 end -= substring->length;
2829 while (start <= end)
2830 if (Py_UNICODE_MATCH(self, start, substring)) {
2831 count++;
2832 start += substring->length;
2833 } else
2834 start++;
2836 return count;
2839 int PyUnicode_Count(PyObject *str,
2840 PyObject *substr,
2841 int start,
2842 int end)
2844 int result;
2846 str = PyUnicode_FromObject(str);
2847 if (str == NULL)
2848 return -1;
2849 substr = PyUnicode_FromObject(substr);
2850 if (substr == NULL) {
2851 Py_DECREF(str);
2852 return -1;
2855 result = count((PyUnicodeObject *)str,
2856 start, end,
2857 (PyUnicodeObject *)substr);
2859 Py_DECREF(str);
2860 Py_DECREF(substr);
2861 return result;
2864 static
2865 int findstring(PyUnicodeObject *self,
2866 PyUnicodeObject *substring,
2867 int start,
2868 int end,
2869 int direction)
2871 if (start < 0)
2872 start += self->length;
2873 if (start < 0)
2874 start = 0;
2876 if (substring->length == 0)
2877 return start;
2879 if (end > self->length)
2880 end = self->length;
2881 if (end < 0)
2882 end += self->length;
2883 if (end < 0)
2884 end = 0;
2886 end -= substring->length;
2888 if (direction < 0) {
2889 for (; end >= start; end--)
2890 if (Py_UNICODE_MATCH(self, end, substring))
2891 return end;
2892 } else {
2893 for (; start <= end; start++)
2894 if (Py_UNICODE_MATCH(self, start, substring))
2895 return start;
2898 return -1;
2901 int PyUnicode_Find(PyObject *str,
2902 PyObject *substr,
2903 int start,
2904 int end,
2905 int direction)
2907 int result;
2909 str = PyUnicode_FromObject(str);
2910 if (str == NULL)
2911 return -1;
2912 substr = PyUnicode_FromObject(substr);
2913 if (substr == NULL) {
2914 Py_DECREF(substr);
2915 return -1;
2918 result = findstring((PyUnicodeObject *)str,
2919 (PyUnicodeObject *)substr,
2920 start, end, direction);
2921 Py_DECREF(str);
2922 Py_DECREF(substr);
2923 return result;
2926 static
2927 int tailmatch(PyUnicodeObject *self,
2928 PyUnicodeObject *substring,
2929 int start,
2930 int end,
2931 int direction)
2933 if (start < 0)
2934 start += self->length;
2935 if (start < 0)
2936 start = 0;
2938 if (substring->length == 0)
2939 return 1;
2941 if (end > self->length)
2942 end = self->length;
2943 if (end < 0)
2944 end += self->length;
2945 if (end < 0)
2946 end = 0;
2948 end -= substring->length;
2949 if (end < start)
2950 return 0;
2952 if (direction > 0) {
2953 if (Py_UNICODE_MATCH(self, end, substring))
2954 return 1;
2955 } else {
2956 if (Py_UNICODE_MATCH(self, start, substring))
2957 return 1;
2960 return 0;
2963 int PyUnicode_Tailmatch(PyObject *str,
2964 PyObject *substr,
2965 int start,
2966 int end,
2967 int direction)
2969 int result;
2971 str = PyUnicode_FromObject(str);
2972 if (str == NULL)
2973 return -1;
2974 substr = PyUnicode_FromObject(substr);
2975 if (substr == NULL) {
2976 Py_DECREF(substr);
2977 return -1;
2980 result = tailmatch((PyUnicodeObject *)str,
2981 (PyUnicodeObject *)substr,
2982 start, end, direction);
2983 Py_DECREF(str);
2984 Py_DECREF(substr);
2985 return result;
2988 static
2989 const Py_UNICODE *findchar(const Py_UNICODE *s,
2990 int size,
2991 Py_UNICODE ch)
2993 /* like wcschr, but doesn't stop at NULL characters */
2995 while (size-- > 0) {
2996 if (*s == ch)
2997 return s;
2998 s++;
3001 return NULL;
3004 /* Apply fixfct filter to the Unicode object self and return a
3005 reference to the modified object */
3007 static
3008 PyObject *fixup(PyUnicodeObject *self,
3009 int (*fixfct)(PyUnicodeObject *s))
3012 PyUnicodeObject *u;
3014 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3015 if (u == NULL)
3016 return NULL;
3018 Py_UNICODE_COPY(u->str, self->str, self->length);
3020 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3021 /* fixfct should return TRUE if it modified the buffer. If
3022 FALSE, return a reference to the original buffer instead
3023 (to save space, not time) */
3024 Py_INCREF(self);
3025 Py_DECREF(u);
3026 return (PyObject*) self;
3028 return (PyObject*) u;
3031 static
3032 int fixupper(PyUnicodeObject *self)
3034 int len = self->length;
3035 Py_UNICODE *s = self->str;
3036 int status = 0;
3038 while (len-- > 0) {
3039 register Py_UNICODE ch;
3041 ch = Py_UNICODE_TOUPPER(*s);
3042 if (ch != *s) {
3043 status = 1;
3044 *s = ch;
3046 s++;
3049 return status;
3052 static
3053 int fixlower(PyUnicodeObject *self)
3055 int len = self->length;
3056 Py_UNICODE *s = self->str;
3057 int status = 0;
3059 while (len-- > 0) {
3060 register Py_UNICODE ch;
3062 ch = Py_UNICODE_TOLOWER(*s);
3063 if (ch != *s) {
3064 status = 1;
3065 *s = ch;
3067 s++;
3070 return status;
3073 static
3074 int fixswapcase(PyUnicodeObject *self)
3076 int len = self->length;
3077 Py_UNICODE *s = self->str;
3078 int status = 0;
3080 while (len-- > 0) {
3081 if (Py_UNICODE_ISUPPER(*s)) {
3082 *s = Py_UNICODE_TOLOWER(*s);
3083 status = 1;
3084 } else if (Py_UNICODE_ISLOWER(*s)) {
3085 *s = Py_UNICODE_TOUPPER(*s);
3086 status = 1;
3088 s++;
3091 return status;
3094 static
3095 int fixcapitalize(PyUnicodeObject *self)
3097 int len = self->length;
3098 Py_UNICODE *s = self->str;
3099 int status = 0;
3101 if (len == 0)
3102 return 0;
3103 if (Py_UNICODE_ISLOWER(*s)) {
3104 *s = Py_UNICODE_TOUPPER(*s);
3105 status = 1;
3107 s++;
3108 while (--len > 0) {
3109 if (Py_UNICODE_ISUPPER(*s)) {
3110 *s = Py_UNICODE_TOLOWER(*s);
3111 status = 1;
3113 s++;
3115 return status;
3118 static
3119 int fixtitle(PyUnicodeObject *self)
3121 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3122 register Py_UNICODE *e;
3123 int previous_is_cased;
3125 /* Shortcut for single character strings */
3126 if (PyUnicode_GET_SIZE(self) == 1) {
3127 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3128 if (*p != ch) {
3129 *p = ch;
3130 return 1;
3132 else
3133 return 0;
3136 e = p + PyUnicode_GET_SIZE(self);
3137 previous_is_cased = 0;
3138 for (; p < e; p++) {
3139 register const Py_UNICODE ch = *p;
3141 if (previous_is_cased)
3142 *p = Py_UNICODE_TOLOWER(ch);
3143 else
3144 *p = Py_UNICODE_TOTITLE(ch);
3146 if (Py_UNICODE_ISLOWER(ch) ||
3147 Py_UNICODE_ISUPPER(ch) ||
3148 Py_UNICODE_ISTITLE(ch))
3149 previous_is_cased = 1;
3150 else
3151 previous_is_cased = 0;
3153 return 1;
3156 PyObject *PyUnicode_Join(PyObject *separator,
3157 PyObject *seq)
3159 Py_UNICODE *sep;
3160 int seplen;
3161 PyUnicodeObject *res = NULL;
3162 int reslen = 0;
3163 Py_UNICODE *p;
3164 int sz = 100;
3165 int i;
3166 PyObject *it;
3168 it = PyObject_GetIter(seq);
3169 if (it == NULL)
3170 return NULL;
3172 if (separator == NULL) {
3173 Py_UNICODE blank = ' ';
3174 sep = &blank;
3175 seplen = 1;
3177 else {
3178 separator = PyUnicode_FromObject(separator);
3179 if (separator == NULL)
3180 goto onError;
3181 sep = PyUnicode_AS_UNICODE(separator);
3182 seplen = PyUnicode_GET_SIZE(separator);
3185 res = _PyUnicode_New(sz);
3186 if (res == NULL)
3187 goto onError;
3188 p = PyUnicode_AS_UNICODE(res);
3189 reslen = 0;
3191 for (i = 0; ; ++i) {
3192 int itemlen;
3193 PyObject *item = PyIter_Next(it);
3194 if (item == NULL) {
3195 if (PyErr_Occurred())
3196 goto onError;
3197 break;
3199 if (!PyUnicode_Check(item)) {
3200 PyObject *v;
3201 if (!PyString_Check(item)) {
3202 PyErr_Format(PyExc_TypeError,
3203 "sequence item %i: expected string or Unicode,"
3204 " %.80s found",
3205 i, item->ob_type->tp_name);
3206 Py_DECREF(item);
3207 goto onError;
3209 v = PyUnicode_FromObject(item);
3210 Py_DECREF(item);
3211 item = v;
3212 if (item == NULL)
3213 goto onError;
3215 itemlen = PyUnicode_GET_SIZE(item);
3216 while (reslen + itemlen + seplen >= sz) {
3217 if (_PyUnicode_Resize(&res, sz*2)) {
3218 Py_DECREF(item);
3219 goto onError;
3221 sz *= 2;
3222 p = PyUnicode_AS_UNICODE(res) + reslen;
3224 if (i > 0) {
3225 Py_UNICODE_COPY(p, sep, seplen);
3226 p += seplen;
3227 reslen += seplen;
3229 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3230 p += itemlen;
3231 reslen += itemlen;
3232 Py_DECREF(item);
3234 if (_PyUnicode_Resize(&res, reslen))
3235 goto onError;
3237 Py_XDECREF(separator);
3238 Py_DECREF(it);
3239 return (PyObject *)res;
3241 onError:
3242 Py_XDECREF(separator);
3243 Py_XDECREF(res);
3244 Py_DECREF(it);
3245 return NULL;
3248 static
3249 PyUnicodeObject *pad(PyUnicodeObject *self,
3250 int left,
3251 int right,
3252 Py_UNICODE fill)
3254 PyUnicodeObject *u;
3256 if (left < 0)
3257 left = 0;
3258 if (right < 0)
3259 right = 0;
3261 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3262 Py_INCREF(self);
3263 return self;
3266 u = _PyUnicode_New(left + self->length + right);
3267 if (u) {
3268 if (left)
3269 Py_UNICODE_FILL(u->str, fill, left);
3270 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3271 if (right)
3272 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3275 return u;
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str`
   scratch variable, a `PyObject *list` target and an `onError` label,
   which is jumped to when the slice cannot be created or appended.
   Wrapped in do { } while (0) so that it behaves as a single statement;
   invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3289 static
3290 PyObject *split_whitespace(PyUnicodeObject *self,
3291 PyObject *list,
3292 int maxcount)
3294 register int i;
3295 register int j;
3296 int len = self->length;
3297 PyObject *str;
3299 for (i = j = 0; i < len; ) {
3300 /* find a token */
3301 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3302 i++;
3303 j = i;
3304 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3305 i++;
3306 if (j < i) {
3307 if (maxcount-- <= 0)
3308 break;
3309 SPLIT_APPEND(self->str, j, i);
3310 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3311 i++;
3312 j = i;
3315 if (j < len) {
3316 SPLIT_APPEND(self->str, j, len);
3318 return list;
3320 onError:
3321 Py_DECREF(list);
3322 return NULL;
3325 PyObject *PyUnicode_Splitlines(PyObject *string,
3326 int keepends)
3328 register int i;
3329 register int j;
3330 int len;
3331 PyObject *list;
3332 PyObject *str;
3333 Py_UNICODE *data;
3335 string = PyUnicode_FromObject(string);
3336 if (string == NULL)
3337 return NULL;
3338 data = PyUnicode_AS_UNICODE(string);
3339 len = PyUnicode_GET_SIZE(string);
3341 list = PyList_New(0);
3342 if (!list)
3343 goto onError;
3345 for (i = j = 0; i < len; ) {
3346 int eol;
3348 /* Find a line and append it */
3349 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3350 i++;
3352 /* Skip the line break reading CRLF as one line break */
3353 eol = i;
3354 if (i < len) {
3355 if (data[i] == '\r' && i + 1 < len &&
3356 data[i+1] == '\n')
3357 i += 2;
3358 else
3359 i++;
3360 if (keepends)
3361 eol = i;
3363 SPLIT_APPEND(data, j, eol);
3364 j = i;
3366 if (j < len) {
3367 SPLIT_APPEND(data, j, len);
3370 Py_DECREF(string);
3371 return list;
3373 onError:
3374 Py_DECREF(list);
3375 Py_DECREF(string);
3376 return NULL;
3379 static
3380 PyObject *split_char(PyUnicodeObject *self,
3381 PyObject *list,
3382 Py_UNICODE ch,
3383 int maxcount)
3385 register int i;
3386 register int j;
3387 int len = self->length;
3388 PyObject *str;
3390 for (i = j = 0; i < len; ) {
3391 if (self->str[i] == ch) {
3392 if (maxcount-- <= 0)
3393 break;
3394 SPLIT_APPEND(self->str, j, i);
3395 i = j = i + 1;
3396 } else
3397 i++;
3399 if (j <= len) {
3400 SPLIT_APPEND(self->str, j, len);
3402 return list;
3404 onError:
3405 Py_DECREF(list);
3406 return NULL;
3409 static
3410 PyObject *split_substring(PyUnicodeObject *self,
3411 PyObject *list,
3412 PyUnicodeObject *substring,
3413 int maxcount)
3415 register int i;
3416 register int j;
3417 int len = self->length;
3418 int sublen = substring->length;
3419 PyObject *str;
3421 for (i = j = 0; i <= len - sublen; ) {
3422 if (Py_UNICODE_MATCH(self, i, substring)) {
3423 if (maxcount-- <= 0)
3424 break;
3425 SPLIT_APPEND(self->str, j, i);
3426 i = j = i + sublen;
3427 } else
3428 i++;
3430 if (j <= len) {
3431 SPLIT_APPEND(self->str, j, len);
3433 return list;
3435 onError:
3436 Py_DECREF(list);
3437 return NULL;
3440 #undef SPLIT_APPEND
3442 static
3443 PyObject *split(PyUnicodeObject *self,
3444 PyUnicodeObject *substring,
3445 int maxcount)
3447 PyObject *list;
3449 if (maxcount < 0)
3450 maxcount = INT_MAX;
3452 list = PyList_New(0);
3453 if (!list)
3454 return NULL;
3456 if (substring == NULL)
3457 return split_whitespace(self,list,maxcount);
3459 else if (substring->length == 1)
3460 return split_char(self,list,substring->str[0],maxcount);
3462 else if (substring->length == 0) {
3463 Py_DECREF(list);
3464 PyErr_SetString(PyExc_ValueError, "empty separator");
3465 return NULL;
3467 else
3468 return split_substring(self,list,substring,maxcount);
3471 static
3472 PyObject *strip(PyUnicodeObject *self,
3473 int left,
3474 int right)
3476 Py_UNICODE *p = self->str;
3477 int start = 0;
3478 int end = self->length;
3480 if (left)
3481 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3482 start++;
3484 if (right)
3485 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3486 end--;
3488 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3489 /* couldn't strip anything off, return original string */
3490 Py_INCREF(self);
3491 return (PyObject*) self;
3494 return (PyObject*) PyUnicode_FromUnicode(
3495 self->str + start,
3496 end - start
3500 static
3501 PyObject *replace(PyUnicodeObject *self,
3502 PyUnicodeObject *str1,
3503 PyUnicodeObject *str2,
3504 int maxcount)
3506 PyUnicodeObject *u;
3508 if (maxcount < 0)
3509 maxcount = INT_MAX;
3511 if (str1->length == 1 && str2->length == 1) {
3512 int i;
3514 /* replace characters */
3515 if (!findchar(self->str, self->length, str1->str[0]) &&
3516 PyUnicode_CheckExact(self)) {
3517 /* nothing to replace, return original string */
3518 Py_INCREF(self);
3519 u = self;
3520 } else {
3521 Py_UNICODE u1 = str1->str[0];
3522 Py_UNICODE u2 = str2->str[0];
3524 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3525 NULL,
3526 self->length
3528 if (u != NULL) {
3529 Py_UNICODE_COPY(u->str, self->str,
3530 self->length);
3531 for (i = 0; i < u->length; i++)
3532 if (u->str[i] == u1) {
3533 if (--maxcount < 0)
3534 break;
3535 u->str[i] = u2;
3540 } else {
3541 int n, i;
3542 Py_UNICODE *p;
3544 /* replace strings */
3545 n = count(self, 0, self->length, str1);
3546 if (n > maxcount)
3547 n = maxcount;
3548 if (n == 0 && PyUnicode_CheckExact(self)) {
3549 /* nothing to replace, return original string */
3550 Py_INCREF(self);
3551 u = self;
3552 } else {
3553 u = _PyUnicode_New(
3554 self->length + n * (str2->length - str1->length));
3555 if (u) {
3556 i = 0;
3557 p = u->str;
3558 while (i <= self->length - str1->length)
3559 if (Py_UNICODE_MATCH(self, i, str1)) {
3560 /* replace string segment */
3561 Py_UNICODE_COPY(p, str2->str, str2->length);
3562 p += str2->length;
3563 i += str1->length;
3564 if (--n <= 0) {
3565 /* copy remaining part */
3566 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3567 break;
3569 } else
3570 *p++ = self->str[i++];
3575 return (PyObject *) u;
3578 /* --- Unicode Object Methods --------------------------------------------- */
3580 static char title__doc__[] =
3581 "S.title() -> unicode\n\
3583 Return a titlecased version of S, i.e. words start with title case\n\
3584 characters, all remaining cased characters have lower case.";
3586 static PyObject*
3587 unicode_title(PyUnicodeObject *self)
3589 return fixup(self, fixtitle);
3592 static char capitalize__doc__[] =
3593 "S.capitalize() -> unicode\n\
3595 Return a capitalized version of S, i.e. make the first character\n\
3596 have upper case.";
3598 static PyObject*
3599 unicode_capitalize(PyUnicodeObject *self)
3601 return fixup(self, fixcapitalize);
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (compiled out): split S at whitespace,
   capitalize every word and join the words with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when a word could not be capitalized */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3642 static char center__doc__[] =
3643 "S.center(width) -> unicode\n\
3645 Return S centered in a Unicode string of length width. Padding is done\n\
3646 using spaces.";
3648 static PyObject *
3649 unicode_center(PyUnicodeObject *self, PyObject *args)
3651 int marg, left;
3652 int width;
3654 if (!PyArg_ParseTuple(args, "i:center", &width))
3655 return NULL;
3657 if (self->length >= width && PyUnicode_CheckExact(self)) {
3658 Py_INCREF(self);
3659 return (PyObject*) self;
3662 marg = width - self->length;
3663 left = marg / 2 + (marg & width & 1);
3665 return (PyObject*) pad(self, left, marg - left, ' ');
3668 #if 0
3670 /* This code should go into some future Unicode collation support
3671 module. The basic comparison should compare ordinals on a naive
3672 basis (this is what Java does and thus JPython too). */
3674 /* speedy UTF-16 code point order comparison */
3675 /* gleaned from: */
3676 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3678 static short utf16Fixup[32] =
3680 0, 0, 0, 0, 0, 0, 0, 0,
3681 0, 0, 0, 0, 0, 0, 0, 0,
3682 0, 0, 0, 0, 0, 0, 0, 0,
3683 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3686 static int
3687 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3689 int len1, len2;
3691 Py_UNICODE *s1 = str1->str;
3692 Py_UNICODE *s2 = str2->str;
3694 len1 = str1->length;
3695 len2 = str2->length;
3697 while (len1 > 0 && len2 > 0) {
3698 Py_UNICODE c1, c2;
3700 c1 = *s1++;
3701 c2 = *s2++;
3703 if (c1 > (1<<11) * 26)
3704 c1 += utf16Fixup[c1>>11];
3705 if (c2 > (1<<11) * 26)
3706 c2 += utf16Fixup[c2>>11];
3707 /* now c1 and c2 are in UTF-32-compatible order */
3709 if (c1 != c2)
3710 return (c1 < c2) ? -1 : 1;
3712 len1--; len2--;
3715 return (len1 < len2) ? -1 : (len1 != len2);
3718 #else
3720 static int
3721 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3723 register int len1, len2;
3725 Py_UNICODE *s1 = str1->str;
3726 Py_UNICODE *s2 = str2->str;
3728 len1 = str1->length;
3729 len2 = str2->length;
3731 while (len1 > 0 && len2 > 0) {
3732 Py_UNICODE c1, c2;
3734 c1 = *s1++;
3735 c2 = *s2++;
3737 if (c1 != c2)
3738 return (c1 < c2) ? -1 : 1;
3740 len1--; len2--;
3743 return (len1 < len2) ? -1 : (len1 != len2);
3746 #endif
3748 int PyUnicode_Compare(PyObject *left,
3749 PyObject *right)
3751 PyUnicodeObject *u = NULL, *v = NULL;
3752 int result;
3754 /* Coerce the two arguments */
3755 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3756 if (u == NULL)
3757 goto onError;
3758 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3759 if (v == NULL)
3760 goto onError;
3762 /* Shortcut for empty or interned objects */
3763 if (v == u) {
3764 Py_DECREF(u);
3765 Py_DECREF(v);
3766 return 0;
3769 result = unicode_compare(u, v);
3771 Py_DECREF(u);
3772 Py_DECREF(v);
3773 return result;
3775 onError:
3776 Py_XDECREF(u);
3777 Py_XDECREF(v);
3778 return -1;
3781 int PyUnicode_Contains(PyObject *container,
3782 PyObject *element)
3784 PyUnicodeObject *u = NULL, *v = NULL;
3785 int result;
3786 register const Py_UNICODE *p, *e;
3787 register Py_UNICODE ch;
3789 /* Coerce the two arguments */
3790 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3791 if (v == NULL) {
3792 PyErr_SetString(PyExc_TypeError,
3793 "'in <string>' requires character as left operand");
3794 goto onError;
3796 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3797 if (u == NULL) {
3798 Py_DECREF(v);
3799 goto onError;
3802 /* Check v in u */
3803 if (PyUnicode_GET_SIZE(v) != 1) {
3804 PyErr_SetString(PyExc_TypeError,
3805 "'in <string>' requires character as left operand");
3806 goto onError;
3808 ch = *PyUnicode_AS_UNICODE(v);
3809 p = PyUnicode_AS_UNICODE(u);
3810 e = p + PyUnicode_GET_SIZE(u);
3811 result = 0;
3812 while (p < e) {
3813 if (*p++ == ch) {
3814 result = 1;
3815 break;
3819 Py_DECREF(u);
3820 Py_DECREF(v);
3821 return result;
3823 onError:
3824 Py_XDECREF(u);
3825 Py_XDECREF(v);
3826 return -1;
3829 /* Concat to string or Unicode object giving a new Unicode object. */
3831 PyObject *PyUnicode_Concat(PyObject *left,
3832 PyObject *right)
3834 PyUnicodeObject *u = NULL, *v = NULL, *w;
3836 /* Coerce the two arguments */
3837 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3838 if (u == NULL)
3839 goto onError;
3840 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3841 if (v == NULL)
3842 goto onError;
3844 /* Shortcuts */
3845 if (v == unicode_empty) {
3846 Py_DECREF(v);
3847 return (PyObject *)u;
3849 if (u == unicode_empty) {
3850 Py_DECREF(u);
3851 return (PyObject *)v;
3854 /* Concat the two Unicode strings */
3855 w = _PyUnicode_New(u->length + v->length);
3856 if (w == NULL)
3857 goto onError;
3858 Py_UNICODE_COPY(w->str, u->str, u->length);
3859 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3861 Py_DECREF(u);
3862 Py_DECREF(v);
3863 return (PyObject *)w;
3865 onError:
3866 Py_XDECREF(u);
3867 Py_XDECREF(v);
3868 return NULL;
3871 static char count__doc__[] =
3872 "S.count(sub[, start[, end]]) -> int\n\
3874 Return the number of occurrences of substring sub in Unicode string\n\
3875 S[start:end]. Optional arguments start and end are\n\
3876 interpreted as in slice notation.";
3878 static PyObject *
3879 unicode_count(PyUnicodeObject *self, PyObject *args)
3881 PyUnicodeObject *substring;
3882 int start = 0;
3883 int end = INT_MAX;
3884 PyObject *result;
3886 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3887 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3888 return NULL;
3890 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3891 (PyObject *)substring);
3892 if (substring == NULL)
3893 return NULL;
3895 if (start < 0)
3896 start += self->length;
3897 if (start < 0)
3898 start = 0;
3899 if (end > self->length)
3900 end = self->length;
3901 if (end < 0)
3902 end += self->length;
3903 if (end < 0)
3904 end = 0;
3906 result = PyInt_FromLong((long) count(self, start, end, substring));
3908 Py_DECREF(substring);
3909 return result;
3912 static char encode__doc__[] =
3913 "S.encode([encoding[,errors]]) -> string\n\
3915 Return an encoded string version of S. Default encoding is the current\n\
3916 default string encoding. errors may be given to set a different error\n\
3917 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3918 a ValueError. Other possible values are 'ignore' and 'replace'.";
3920 static PyObject *
3921 unicode_encode(PyUnicodeObject *self, PyObject *args)
3923 char *encoding = NULL;
3924 char *errors = NULL;
3925 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3926 return NULL;
3927 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3930 static char expandtabs__doc__[] =
3931 "S.expandtabs([tabsize]) -> unicode\n\
3933 Return a copy of S where all tab characters are expanded using spaces.\n\
3934 If tabsize is not given, a tab size of 8 characters is assumed.";
3936 static PyObject*
3937 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3939 Py_UNICODE *e;
3940 Py_UNICODE *p;
3941 Py_UNICODE *q;
3942 int i, j;
3943 PyUnicodeObject *u;
3944 int tabsize = 8;
3946 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3947 return NULL;
3949 /* First pass: determine size of output string */
3950 i = j = 0;
3951 e = self->str + self->length;
3952 for (p = self->str; p < e; p++)
3953 if (*p == '\t') {
3954 if (tabsize > 0)
3955 j += tabsize - (j % tabsize);
3957 else {
3958 j++;
3959 if (*p == '\n' || *p == '\r') {
3960 i += j;
3961 j = 0;
3965 /* Second pass: create output string and fill it */
3966 u = _PyUnicode_New(i + j);
3967 if (!u)
3968 return NULL;
3970 j = 0;
3971 q = u->str;
3973 for (p = self->str; p < e; p++)
3974 if (*p == '\t') {
3975 if (tabsize > 0) {
3976 i = tabsize - (j % tabsize);
3977 j += i;
3978 while (i--)
3979 *q++ = ' ';
3982 else {
3983 j++;
3984 *q++ = *p;
3985 if (*p == '\n' || *p == '\r')
3986 j = 0;
3989 return (PyObject*) u;
3992 static char find__doc__[] =
3993 "S.find(sub [,start [,end]]) -> int\n\
3995 Return the lowest index in S where substring sub is found,\n\
3996 such that sub is contained within s[start,end]. Optional\n\
3997 arguments start and end are interpreted as in slice notation.\n\
3999 Return -1 on failure.";
4001 static PyObject *
4002 unicode_find(PyUnicodeObject *self, PyObject *args)
4004 PyUnicodeObject *substring;
4005 int start = 0;
4006 int end = INT_MAX;
4007 PyObject *result;
4009 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4010 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4011 return NULL;
4012 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4013 (PyObject *)substring);
4014 if (substring == NULL)
4015 return NULL;
4017 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4019 Py_DECREF(substring);
4020 return result;
4023 static PyObject *
4024 unicode_getitem(PyUnicodeObject *self, int index)
4026 if (index < 0 || index >= self->length) {
4027 PyErr_SetString(PyExc_IndexError, "string index out of range");
4028 return NULL;
4031 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4034 static long
4035 unicode_hash(PyUnicodeObject *self)
4037 /* Since Unicode objects compare equal to their ASCII string
4038 counterparts, they should use the individual character values
4039 as basis for their hash value. This is needed to assure that
4040 strings and Unicode objects behave in the same way as
4041 dictionary keys. */
4043 register int len;
4044 register Py_UNICODE *p;
4045 register long x;
4047 if (self->hash != -1)
4048 return self->hash;
4049 len = PyUnicode_GET_SIZE(self);
4050 p = PyUnicode_AS_UNICODE(self);
4051 x = *p << 7;
4052 while (--len >= 0)
4053 x = (1000003*x) ^ *p++;
4054 x ^= PyUnicode_GET_SIZE(self);
4055 if (x == -1)
4056 x = -2;
4057 self->hash = x;
4058 return x;
4061 static char index__doc__[] =
4062 "S.index(sub [,start [,end]]) -> int\n\
4064 Like S.find() but raise ValueError when the substring is not found.";
4066 static PyObject *
4067 unicode_index(PyUnicodeObject *self, PyObject *args)
4069 int result;
4070 PyUnicodeObject *substring;
4071 int start = 0;
4072 int end = INT_MAX;
4074 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4075 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4076 return NULL;
4078 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4079 (PyObject *)substring);
4080 if (substring == NULL)
4081 return NULL;
4083 result = findstring(self, substring, start, end, 1);
4085 Py_DECREF(substring);
4086 if (result < 0) {
4087 PyErr_SetString(PyExc_ValueError, "substring not found");
4088 return NULL;
4090 return PyInt_FromLong(result);
4093 static char islower__doc__[] =
4094 "S.islower() -> int\n\
4096 Return 1 if all cased characters in S are lowercase and there is\n\
4097 at least one cased character in S, 0 otherwise.";
4099 static PyObject*
4100 unicode_islower(PyUnicodeObject *self)
4102 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4103 register const Py_UNICODE *e;
4104 int cased;
4106 /* Shortcut for single character strings */
4107 if (PyUnicode_GET_SIZE(self) == 1)
4108 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4110 /* Special case for empty strings */
4111 if (PyString_GET_SIZE(self) == 0)
4112 return PyInt_FromLong(0);
4114 e = p + PyUnicode_GET_SIZE(self);
4115 cased = 0;
4116 for (; p < e; p++) {
4117 register const Py_UNICODE ch = *p;
4119 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4120 return PyInt_FromLong(0);
4121 else if (!cased && Py_UNICODE_ISLOWER(ch))
4122 cased = 1;
4124 return PyInt_FromLong(cased);
4127 static char isupper__doc__[] =
4128 "S.isupper() -> int\n\
4130 Return 1 if all cased characters in S are uppercase and there is\n\
4131 at least one cased character in S, 0 otherwise.";
4133 static PyObject*
4134 unicode_isupper(PyUnicodeObject *self)
4136 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4137 register const Py_UNICODE *e;
4138 int cased;
4140 /* Shortcut for single character strings */
4141 if (PyUnicode_GET_SIZE(self) == 1)
4142 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4144 /* Special case for empty strings */
4145 if (PyString_GET_SIZE(self) == 0)
4146 return PyInt_FromLong(0);
4148 e = p + PyUnicode_GET_SIZE(self);
4149 cased = 0;
4150 for (; p < e; p++) {
4151 register const Py_UNICODE ch = *p;
4153 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4154 return PyInt_FromLong(0);
4155 else if (!cased && Py_UNICODE_ISUPPER(ch))
4156 cased = 1;
4158 return PyInt_FromLong(cased);
4161 static char istitle__doc__[] =
4162 "S.istitle() -> int\n\
4164 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4165 may only follow uncased characters and lowercase characters only cased\n\
4166 ones. Return 0 otherwise.";
4168 static PyObject*
4169 unicode_istitle(PyUnicodeObject *self)
4171 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4172 register const Py_UNICODE *e;
4173 int cased, previous_is_cased;
4175 /* Shortcut for single character strings */
4176 if (PyUnicode_GET_SIZE(self) == 1)
4177 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4178 (Py_UNICODE_ISUPPER(*p) != 0));
4180 /* Special case for empty strings */
4181 if (PyString_GET_SIZE(self) == 0)
4182 return PyInt_FromLong(0);
4184 e = p + PyUnicode_GET_SIZE(self);
4185 cased = 0;
4186 previous_is_cased = 0;
4187 for (; p < e; p++) {
4188 register const Py_UNICODE ch = *p;
4190 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4191 if (previous_is_cased)
4192 return PyInt_FromLong(0);
4193 previous_is_cased = 1;
4194 cased = 1;
4196 else if (Py_UNICODE_ISLOWER(ch)) {
4197 if (!previous_is_cased)
4198 return PyInt_FromLong(0);
4199 previous_is_cased = 1;
4200 cased = 1;
4202 else
4203 previous_is_cased = 0;
4205 return PyInt_FromLong(cased);
4208 static char isspace__doc__[] =
4209 "S.isspace() -> int\n\
4211 Return 1 if there are only whitespace characters in S,\n\
4212 0 otherwise.";
4214 static PyObject*
4215 unicode_isspace(PyUnicodeObject *self)
4217 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4218 register const Py_UNICODE *e;
4220 /* Shortcut for single character strings */
4221 if (PyUnicode_GET_SIZE(self) == 1 &&
4222 Py_UNICODE_ISSPACE(*p))
4223 return PyInt_FromLong(1);
4225 /* Special case for empty strings */
4226 if (PyString_GET_SIZE(self) == 0)
4227 return PyInt_FromLong(0);
4229 e = p + PyUnicode_GET_SIZE(self);
4230 for (; p < e; p++) {
4231 if (!Py_UNICODE_ISSPACE(*p))
4232 return PyInt_FromLong(0);
4234 return PyInt_FromLong(1);
4237 static char isalpha__doc__[] =
4238 "S.isalpha() -> int\n\
4240 Return 1 if all characters in S are alphabetic\n\
4241 and there is at least one character in S, 0 otherwise.";
4243 static PyObject*
4244 unicode_isalpha(PyUnicodeObject *self)
4246 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4247 register const Py_UNICODE *e;
4249 /* Shortcut for single character strings */
4250 if (PyUnicode_GET_SIZE(self) == 1 &&
4251 Py_UNICODE_ISALPHA(*p))
4252 return PyInt_FromLong(1);
4254 /* Special case for empty strings */
4255 if (PyString_GET_SIZE(self) == 0)
4256 return PyInt_FromLong(0);
4258 e = p + PyUnicode_GET_SIZE(self);
4259 for (; p < e; p++) {
4260 if (!Py_UNICODE_ISALPHA(*p))
4261 return PyInt_FromLong(0);
4263 return PyInt_FromLong(1);
4266 static char isalnum__doc__[] =
4267 "S.isalnum() -> int\n\
4269 Return 1 if all characters in S are alphanumeric\n\
4270 and there is at least one character in S, 0 otherwise.";
4272 static PyObject*
4273 unicode_isalnum(PyUnicodeObject *self)
4275 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4276 register const Py_UNICODE *e;
4278 /* Shortcut for single character strings */
4279 if (PyUnicode_GET_SIZE(self) == 1 &&
4280 Py_UNICODE_ISALNUM(*p))
4281 return PyInt_FromLong(1);
4283 /* Special case for empty strings */
4284 if (PyString_GET_SIZE(self) == 0)
4285 return PyInt_FromLong(0);
4287 e = p + PyUnicode_GET_SIZE(self);
4288 for (; p < e; p++) {
4289 if (!Py_UNICODE_ISALNUM(*p))
4290 return PyInt_FromLong(0);
4292 return PyInt_FromLong(1);
4295 static char isdecimal__doc__[] =
4296 "S.isdecimal() -> int\n\
4298 Return 1 if there are only decimal characters in S,\n\
4299 0 otherwise.";
4301 static PyObject*
4302 unicode_isdecimal(PyUnicodeObject *self)
4304 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4305 register const Py_UNICODE *e;
4307 /* Shortcut for single character strings */
4308 if (PyUnicode_GET_SIZE(self) == 1 &&
4309 Py_UNICODE_ISDECIMAL(*p))
4310 return PyInt_FromLong(1);
4312 /* Special case for empty strings */
4313 if (PyString_GET_SIZE(self) == 0)
4314 return PyInt_FromLong(0);
4316 e = p + PyUnicode_GET_SIZE(self);
4317 for (; p < e; p++) {
4318 if (!Py_UNICODE_ISDECIMAL(*p))
4319 return PyInt_FromLong(0);
4321 return PyInt_FromLong(1);
4324 static char isdigit__doc__[] =
4325 "S.isdigit() -> int\n\
4327 Return 1 if there are only digit characters in S,\n\
4328 0 otherwise.";
4330 static PyObject*
4331 unicode_isdigit(PyUnicodeObject *self)
4333 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4334 register const Py_UNICODE *e;
4336 /* Shortcut for single character strings */
4337 if (PyUnicode_GET_SIZE(self) == 1 &&
4338 Py_UNICODE_ISDIGIT(*p))
4339 return PyInt_FromLong(1);
4341 /* Special case for empty strings */
4342 if (PyString_GET_SIZE(self) == 0)
4343 return PyInt_FromLong(0);
4345 e = p + PyUnicode_GET_SIZE(self);
4346 for (; p < e; p++) {
4347 if (!Py_UNICODE_ISDIGIT(*p))
4348 return PyInt_FromLong(0);
4350 return PyInt_FromLong(1);
4353 static char isnumeric__doc__[] =
4354 "S.isnumeric() -> int\n\
4356 Return 1 if there are only numeric characters in S,\n\
4357 0 otherwise.";
4359 static PyObject*
4360 unicode_isnumeric(PyUnicodeObject *self)
4362 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4363 register const Py_UNICODE *e;
4365 /* Shortcut for single character strings */
4366 if (PyUnicode_GET_SIZE(self) == 1 &&
4367 Py_UNICODE_ISNUMERIC(*p))
4368 return PyInt_FromLong(1);
4370 /* Special case for empty strings */
4371 if (PyString_GET_SIZE(self) == 0)
4372 return PyInt_FromLong(0);
4374 e = p + PyUnicode_GET_SIZE(self);
4375 for (; p < e; p++) {
4376 if (!Py_UNICODE_ISNUMERIC(*p))
4377 return PyInt_FromLong(0);
4379 return PyInt_FromLong(1);
4382 static char join__doc__[] =
4383 "S.join(sequence) -> unicode\n\
4385 Return a string which is the concatenation of the strings in the\n\
4386 sequence. The separator between elements is S.";
4388 static PyObject*
4389 unicode_join(PyObject *self, PyObject *data)
4391 return PyUnicode_Join(self, data);
4394 static int
4395 unicode_length(PyUnicodeObject *self)
4397 return self->length;
4400 static char ljust__doc__[] =
4401 "S.ljust(width) -> unicode\n\
4403 Return S left justified in a Unicode string of length width. Padding is\n\
4404 done using spaces.";
4406 static PyObject *
4407 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4409 int width;
4410 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4411 return NULL;
4413 if (self->length >= width && PyUnicode_CheckExact(self)) {
4414 Py_INCREF(self);
4415 return (PyObject*) self;
4418 return (PyObject*) pad(self, 0, width - self->length, ' ');
4421 static char lower__doc__[] =
4422 "S.lower() -> unicode\n\
4424 Return a copy of the string S converted to lowercase.";
4426 static PyObject*
4427 unicode_lower(PyUnicodeObject *self)
4429 return fixup(self, fixlower);
4432 static char lstrip__doc__[] =
4433 "S.lstrip() -> unicode\n\
4435 Return a copy of the string S with leading whitespace removed.";
4437 static PyObject *
4438 unicode_lstrip(PyUnicodeObject *self)
4440 return strip(self, 1, 0);
4443 static PyObject*
4444 unicode_repeat(PyUnicodeObject *str, int len)
4446 PyUnicodeObject *u;
4447 Py_UNICODE *p;
4448 int nchars;
4449 size_t nbytes;
4451 if (len < 0)
4452 len = 0;
4454 if (len == 1 && PyUnicode_CheckExact(str)) {
4455 /* no repeat, return original string */
4456 Py_INCREF(str);
4457 return (PyObject*) str;
4460 /* ensure # of chars needed doesn't overflow int and # of bytes
4461 * needed doesn't overflow size_t
4463 nchars = len * str->length;
4464 if (len && nchars / len != str->length) {
4465 PyErr_SetString(PyExc_OverflowError,
4466 "repeated string is too long");
4467 return NULL;
4469 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4470 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4471 PyErr_SetString(PyExc_OverflowError,
4472 "repeated string is too long");
4473 return NULL;
4475 u = _PyUnicode_New(nchars);
4476 if (!u)
4477 return NULL;
4479 p = u->str;
4481 while (len-- > 0) {
4482 Py_UNICODE_COPY(p, str->str, str->length);
4483 p += str->length;
4486 return (PyObject*) u;
4489 PyObject *PyUnicode_Replace(PyObject *obj,
4490 PyObject *subobj,
4491 PyObject *replobj,
4492 int maxcount)
4494 PyObject *self;
4495 PyObject *str1;
4496 PyObject *str2;
4497 PyObject *result;
4499 self = PyUnicode_FromObject(obj);
4500 if (self == NULL)
4501 return NULL;
4502 str1 = PyUnicode_FromObject(subobj);
4503 if (str1 == NULL) {
4504 Py_DECREF(self);
4505 return NULL;
4507 str2 = PyUnicode_FromObject(replobj);
4508 if (str2 == NULL) {
4509 Py_DECREF(self);
4510 Py_DECREF(str1);
4511 return NULL;
4513 result = replace((PyUnicodeObject *)self,
4514 (PyUnicodeObject *)str1,
4515 (PyUnicodeObject *)str2,
4516 maxcount);
4517 Py_DECREF(self);
4518 Py_DECREF(str1);
4519 Py_DECREF(str2);
4520 return result;
4523 static char replace__doc__[] =
4524 "S.replace (old, new[, maxsplit]) -> unicode\n\
4526 Return a copy of S with all occurrences of substring\n\
4527 old replaced by new. If the optional argument maxsplit is\n\
4528 given, only the first maxsplit occurrences are replaced.";
4530 static PyObject*
4531 unicode_replace(PyUnicodeObject *self, PyObject *args)
4533 PyUnicodeObject *str1;
4534 PyUnicodeObject *str2;
4535 int maxcount = -1;
4536 PyObject *result;
4538 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4539 return NULL;
4540 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4541 if (str1 == NULL)
4542 return NULL;
4543 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4544 if (str2 == NULL)
4545 return NULL;
4547 result = replace(self, str1, str2, maxcount);
4549 Py_DECREF(str1);
4550 Py_DECREF(str2);
4551 return result;
4554 static
4555 PyObject *unicode_repr(PyObject *unicode)
4557 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4558 PyUnicode_GET_SIZE(unicode),
4562 static char rfind__doc__[] =
4563 "S.rfind(sub [,start [,end]]) -> int\n\
4565 Return the highest index in S where substring sub is found,\n\
4566 such that sub is contained within s[start,end]. Optional\n\
4567 arguments start and end are interpreted as in slice notation.\n\
4569 Return -1 on failure.";
4571 static PyObject *
4572 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4574 PyUnicodeObject *substring;
4575 int start = 0;
4576 int end = INT_MAX;
4577 PyObject *result;
4579 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4580 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4581 return NULL;
4582 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4583 (PyObject *)substring);
4584 if (substring == NULL)
4585 return NULL;
4587 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4589 Py_DECREF(substring);
4590 return result;
4593 static char rindex__doc__[] =
4594 "S.rindex(sub [,start [,end]]) -> int\n\
4596 Like S.rfind() but raise ValueError when the substring is not found.";
4598 static PyObject *
4599 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4601 int result;
4602 PyUnicodeObject *substring;
4603 int start = 0;
4604 int end = INT_MAX;
4606 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4607 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4608 return NULL;
4609 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4610 (PyObject *)substring);
4611 if (substring == NULL)
4612 return NULL;
4614 result = findstring(self, substring, start, end, -1);
4616 Py_DECREF(substring);
4617 if (result < 0) {
4618 PyErr_SetString(PyExc_ValueError, "substring not found");
4619 return NULL;
4621 return PyInt_FromLong(result);
4624 static char rjust__doc__[] =
4625 "S.rjust(width) -> unicode\n\
4627 Return S right justified in a Unicode string of length width. Padding is\n\
4628 done using spaces.";
4630 static PyObject *
4631 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4633 int width;
4634 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4635 return NULL;
4637 if (self->length >= width && PyUnicode_CheckExact(self)) {
4638 Py_INCREF(self);
4639 return (PyObject*) self;
4642 return (PyObject*) pad(self, width - self->length, 0, ' ');
4645 static char rstrip__doc__[] =
4646 "S.rstrip() -> unicode\n\
4648 Return a copy of the string S with trailing whitespace removed.";
4650 static PyObject *
4651 unicode_rstrip(PyUnicodeObject *self)
4653 return strip(self, 0, 1);
4656 static PyObject*
4657 unicode_slice(PyUnicodeObject *self, int start, int end)
4659 /* standard clamping */
4660 if (start < 0)
4661 start = 0;
4662 if (end < 0)
4663 end = 0;
4664 if (end > self->length)
4665 end = self->length;
4666 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4667 /* full slice, return original string */
4668 Py_INCREF(self);
4669 return (PyObject*) self;
4671 if (start > end)
4672 start = end;
4673 /* copy slice */
4674 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4675 end - start);
4678 PyObject *PyUnicode_Split(PyObject *s,
4679 PyObject *sep,
4680 int maxsplit)
4682 PyObject *result;
4684 s = PyUnicode_FromObject(s);
4685 if (s == NULL)
4686 return NULL;
4687 if (sep != NULL) {
4688 sep = PyUnicode_FromObject(sep);
4689 if (sep == NULL) {
4690 Py_DECREF(s);
4691 return NULL;
4695 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4697 Py_DECREF(s);
4698 Py_XDECREF(sep);
4699 return result;
4702 static char split__doc__[] =
4703 "S.split([sep [,maxsplit]]) -> list of strings\n\
4705 Return a list of the words in S, using sep as the\n\
4706 delimiter string. If maxsplit is given, at most maxsplit\n\
4707 splits are done. If sep is not specified, any whitespace string\n\
4708 is a separator.";
4710 static PyObject*
4711 unicode_split(PyUnicodeObject *self, PyObject *args)
4713 PyObject *substring = Py_None;
4714 int maxcount = -1;
4716 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4717 return NULL;
4719 if (substring == Py_None)
4720 return split(self, NULL, maxcount);
4721 else if (PyUnicode_Check(substring))
4722 return split(self, (PyUnicodeObject *)substring, maxcount);
4723 else
4724 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4727 static char splitlines__doc__[] =
4728 "S.splitlines([keepends]]) -> list of strings\n\
4730 Return a list of the lines in S, breaking at line boundaries.\n\
4731 Line breaks are not included in the resulting list unless keepends\n\
4732 is given and true.";
4734 static PyObject*
4735 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4737 int keepends = 0;
4739 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4740 return NULL;
4742 return PyUnicode_Splitlines((PyObject *)self, keepends);
4745 static
4746 PyObject *unicode_str(PyUnicodeObject *self)
4748 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4751 static char strip__doc__[] =
4752 "S.strip() -> unicode\n\
4754 Return a copy of S with leading and trailing whitespace removed.";
4756 static PyObject *
4757 unicode_strip(PyUnicodeObject *self)
4759 return strip(self, 1, 1);
4762 static char swapcase__doc__[] =
4763 "S.swapcase() -> unicode\n\
4765 Return a copy of S with uppercase characters converted to lowercase\n\
4766 and vice versa.";
4768 static PyObject*
4769 unicode_swapcase(PyUnicodeObject *self)
4771 return fixup(self, fixswapcase);
4774 static char translate__doc__[] =
4775 "S.translate(table) -> unicode\n\
4777 Return a copy of the string S, where all characters have been mapped\n\
4778 through the given translation table, which must be a mapping of\n\
4779 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4780 are left untouched. Characters mapped to None are deleted.";
4782 static PyObject*
4783 unicode_translate(PyUnicodeObject *self, PyObject *table)
4785 return PyUnicode_TranslateCharmap(self->str,
4786 self->length,
4787 table,
4788 "ignore");
4791 static char upper__doc__[] =
4792 "S.upper() -> unicode\n\
4794 Return a copy of S converted to uppercase.";
4796 static PyObject*
4797 unicode_upper(PyUnicodeObject *self)
4799 return fixup(self, fixupper);
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string x with zeros on the left, to fill a field\n"
"of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).
 *
 * args must hold a single int.  A string that is already at least
 * `width` long is returned unpadded: the object itself for an exact
 * unicode object, a fresh copy otherwise.  Shorter strings are
 * left-padded with '0' through pad(); a leading '+' or '-' is moved in
 * front of the padding.  Returns NULL if parsing or pad() failed.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        if (PyUnicode_CheckExact(self)) {
            Py_INCREF(self);
            return (PyObject*) self;
        }
        /* Not an exact unicode object: hand back a plain copy, in line
           with the PyUnicode_CheckExact() handling in ljust/rjust. */
        return PyUnicode_FromUnicode(self->str, self->length);
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
#if 0
/* Debugging aid (compiled out): reports unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4846 static char startswith__doc__[] =
4847 "S.startswith(prefix[, start[, end]]) -> int\n\
4849 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4850 optional start, test S beginning at that position. With optional end, stop\n\
4851 comparing S at that position.";
4853 static PyObject *
4854 unicode_startswith(PyUnicodeObject *self,
4855 PyObject *args)
4857 PyUnicodeObject *substring;
4858 int start = 0;
4859 int end = INT_MAX;
4860 PyObject *result;
4862 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4863 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4864 return NULL;
4865 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4866 (PyObject *)substring);
4867 if (substring == NULL)
4868 return NULL;
4870 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4872 Py_DECREF(substring);
4873 return result;
4877 static char endswith__doc__[] =
4878 "S.endswith(suffix[, start[, end]]) -> int\n\
4880 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4881 optional start, test S beginning at that position. With optional end, stop\n\
4882 comparing S at that position.";
4884 static PyObject *
4885 unicode_endswith(PyUnicodeObject *self,
4886 PyObject *args)
4888 PyUnicodeObject *substring;
4889 int start = 0;
4890 int end = INT_MAX;
4891 PyObject *result;
4893 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4894 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4895 return NULL;
4896 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4897 (PyObject *)substring);
4898 if (substring == NULL)
4899 return NULL;
4901 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4903 Py_DECREF(substring);
4904 return result;
4908 static PyMethodDef unicode_methods[] = {
4910 /* Order is according to common usage: often used methods should
4911 appear first, since lookup is done sequentially. */
4913 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4914 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4915 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4916 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4917 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4918 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4919 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4920 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4921 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4922 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4923 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4924 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4925 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4926 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4927 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4928 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4929 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4930 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4931 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4932 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4933 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4934 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4935 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4936 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4937 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4938 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4939 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4940 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4941 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4942 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4943 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4944 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4945 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4946 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4947 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
4948 #if 0
4949 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4950 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
4951 #endif
4953 #if 0
4954 /* This one is just used for debugging the implementation. */
4955 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
4956 #endif
4958 {NULL, NULL}
4961 static PySequenceMethods unicode_as_sequence = {
4962 (inquiry) unicode_length, /* sq_length */
4963 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4964 (intargfunc) unicode_repeat, /* sq_repeat */
4965 (intargfunc) unicode_getitem, /* sq_item */
4966 (intintargfunc) unicode_slice, /* sq_slice */
4967 0, /* sq_ass_item */
4968 0, /* sq_ass_slice */
4969 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4972 static int
4973 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4974 int index,
4975 const void **ptr)
4977 if (index != 0) {
4978 PyErr_SetString(PyExc_SystemError,
4979 "accessing non-existent unicode segment");
4980 return -1;
4982 *ptr = (void *) self->str;
4983 return PyUnicode_GET_DATA_SIZE(self);
4986 static int
4987 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4988 const void **ptr)
4990 PyErr_SetString(PyExc_TypeError,
4991 "cannot use unicode as modifyable buffer");
4992 return -1;
4995 static int
4996 unicode_buffer_getsegcount(PyUnicodeObject *self,
4997 int *lenp)
4999 if (lenp)
5000 *lenp = PyUnicode_GET_DATA_SIZE(self);
5001 return 1;
5004 static int
5005 unicode_buffer_getcharbuf(PyUnicodeObject *self,
5006 int index,
5007 const void **ptr)
5009 PyObject *str;
5011 if (index != 0) {
5012 PyErr_SetString(PyExc_SystemError,
5013 "accessing non-existent unicode segment");
5014 return -1;
5016 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5017 if (str == NULL)
5018 return -1;
5019 *ptr = (void *) PyString_AS_STRING(str);
5020 return PyString_GET_SIZE(str);
5023 /* Helpers for PyUnicode_Format() */
5025 static PyObject *
5026 getnextarg(PyObject *args, int arglen, int *p_argidx)
5028 int argidx = *p_argidx;
5029 if (argidx < arglen) {
5030 (*p_argidx)++;
5031 if (arglen < 0)
5032 return args;
5033 else
5034 return PyTuple_GetItem(args, argidx);
5036 PyErr_SetString(PyExc_TypeError,
5037 "not enough arguments for format string");
5038 return NULL;
/* Conversion flags collected while parsing a format specifier; each is
   a distinct bit so they can be OR-ed together. */
#define F_LJUST     (1<<0)      /* '-' */
#define F_SIGN      (1<<1)      /* '+' */
#define F_BLANK     (1<<2)      /* ' ' */
#define F_ALT       (1<<3)      /* '#' */
#define F_ZERO      (1<<4)      /* '0' */
5047 static
5048 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5050 register int i;
5051 int len;
5052 va_list va;
5053 char *charbuffer;
5054 va_start(va, format);
5056 /* First, format the string as char array, then expand to Py_UNICODE
5057 array. */
5058 charbuffer = (char *)buffer;
5059 len = vsprintf(charbuffer, format, va);
5060 for (i = len - 1; i >= 0; i--)
5061 buffer[i] = (Py_UNICODE) charbuffer[i];
5063 va_end(va);
5064 return len;
5067 static int
5068 formatfloat(Py_UNICODE *buf,
5069 size_t buflen,
5070 int flags,
5071 int prec,
5072 int type,
5073 PyObject *v)
5075 /* fmt = '%#.' + `prec` + `type`
5076 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5077 char fmt[20];
5078 double x;
5080 x = PyFloat_AsDouble(v);
5081 if (x == -1.0 && PyErr_Occurred())
5082 return -1;
5083 if (prec < 0)
5084 prec = 6;
5085 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5086 type = 'g';
5087 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
5088 /* worst case length calc to ensure no buffer overrun:
5089 fmt = %#.<prec>g
5090 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5091 for any double rep.)
5092 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5093 If prec=0 the effective precision is 1 (the leading digit is
5094 always given), therefore increase by one to 10+prec. */
5095 if (buflen <= (size_t)10 + (size_t)prec) {
5096 PyErr_SetString(PyExc_OverflowError,
5097 "formatted float is too long (precision too long?)");
5098 return -1;
5100 return usprintf(buf, fmt, x);
5103 static PyObject*
5104 formatlong(PyObject *val, int flags, int prec, int type)
5106 char *buf;
5107 int i, len;
5108 PyObject *str; /* temporary string object. */
5109 PyUnicodeObject *result;
5111 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5112 if (!str)
5113 return NULL;
5114 result = _PyUnicode_New(len);
5115 for (i = 0; i < len; i++)
5116 result->str[i] = buf[i];
5117 result->str[len] = 0;
5118 Py_DECREF(str);
5119 return (PyObject*)result;
5122 static int
5123 formatint(Py_UNICODE *buf,
5124 size_t buflen,
5125 int flags,
5126 int prec,
5127 int type,
5128 PyObject *v)
5130 /* fmt = '%#.' + `prec` + 'l' + `type`
5131 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5132 + 1 + 1 = 24*/
5133 char fmt[64]; /* plenty big enough! */
5134 long x;
5135 int use_native_c_format = 1;
5137 x = PyInt_AsLong(v);
5138 if (x == -1 && PyErr_Occurred())
5139 return -1;
5140 if (prec < 0)
5141 prec = 1;
5142 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5143 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5144 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5145 PyErr_SetString(PyExc_OverflowError,
5146 "formatted integer is too long (precision too long?)");
5147 return -1;
5149 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5150 * but we want it (for consistency with other %#x conversions, and
5151 * for consistency with Python's hex() function).
5152 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5153 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5154 * So add it only if the platform doesn't already.
5156 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5157 /* Only way to know what the platform does is to try it. */
5158 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5159 if (fmt[1] != (char)type) {
5160 /* Supply our own leading 0x/0X -- needed under std C */
5161 use_native_c_format = 0;
5162 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5165 if (use_native_c_format)
5166 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
5167 return usprintf(buf, fmt, x);
5170 static int
5171 formatchar(Py_UNICODE *buf,
5172 size_t buflen,
5173 PyObject *v)
5175 /* presume that the buffer is at least 2 characters long */
5176 if (PyUnicode_Check(v)) {
5177 if (PyUnicode_GET_SIZE(v) != 1)
5178 goto onError;
5179 buf[0] = PyUnicode_AS_UNICODE(v)[0];
5182 else if (PyString_Check(v)) {
5183 if (PyString_GET_SIZE(v) != 1)
5184 goto onError;
5185 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5188 else {
5189 /* Integer input truncated to a character */
5190 long x;
5191 x = PyInt_AsLong(v);
5192 if (x == -1 && PyErr_Occurred())
5193 goto onError;
5194 buf[0] = (char) x;
5196 buf[1] = '\0';
5197 return 1;
5199 onError:
5200 PyErr_SetString(PyExc_TypeError,
5201 "%c requires int or char");
5202 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it stays a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
5215 PyObject *PyUnicode_Format(PyObject *format,
5216 PyObject *args)
5218 Py_UNICODE *fmt, *res;
5219 int fmtcnt, rescnt, reslen, arglen, argidx;
5220 int args_owned = 0;
5221 PyUnicodeObject *result = NULL;
5222 PyObject *dict = NULL;
5223 PyObject *uformat;
5225 if (format == NULL || args == NULL) {
5226 PyErr_BadInternalCall();
5227 return NULL;
5229 uformat = PyUnicode_FromObject(format);
5230 if (uformat == NULL)
5231 return NULL;
5232 fmt = PyUnicode_AS_UNICODE(uformat);
5233 fmtcnt = PyUnicode_GET_SIZE(uformat);
5235 reslen = rescnt = fmtcnt + 100;
5236 result = _PyUnicode_New(reslen);
5237 if (result == NULL)
5238 goto onError;
5239 res = PyUnicode_AS_UNICODE(result);
5241 if (PyTuple_Check(args)) {
5242 arglen = PyTuple_Size(args);
5243 argidx = 0;
5245 else {
5246 arglen = -1;
5247 argidx = -2;
5249 if (args->ob_type->tp_as_mapping)
5250 dict = args;
5252 while (--fmtcnt >= 0) {
5253 if (*fmt != '%') {
5254 if (--rescnt < 0) {
5255 rescnt = fmtcnt + 100;
5256 reslen += rescnt;
5257 if (_PyUnicode_Resize(&result, reslen) < 0)
5258 return NULL;
5259 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5260 --rescnt;
5262 *res++ = *fmt++;
5264 else {
5265 /* Got a format specifier */
5266 int flags = 0;
5267 int width = -1;
5268 int prec = -1;
5269 Py_UNICODE c = '\0';
5270 Py_UNICODE fill;
5271 PyObject *v = NULL;
5272 PyObject *temp = NULL;
5273 Py_UNICODE *pbuf;
5274 Py_UNICODE sign;
5275 int len;
5276 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5278 fmt++;
5279 if (*fmt == '(') {
5280 Py_UNICODE *keystart;
5281 int keylen;
5282 PyObject *key;
5283 int pcount = 1;
5285 if (dict == NULL) {
5286 PyErr_SetString(PyExc_TypeError,
5287 "format requires a mapping");
5288 goto onError;
5290 ++fmt;
5291 --fmtcnt;
5292 keystart = fmt;
5293 /* Skip over balanced parentheses */
5294 while (pcount > 0 && --fmtcnt >= 0) {
5295 if (*fmt == ')')
5296 --pcount;
5297 else if (*fmt == '(')
5298 ++pcount;
5299 fmt++;
5301 keylen = fmt - keystart - 1;
5302 if (fmtcnt < 0 || pcount > 0) {
5303 PyErr_SetString(PyExc_ValueError,
5304 "incomplete format key");
5305 goto onError;
5307 /* keys are converted to strings using UTF-8 and
5308 then looked up since Python uses strings to hold
5309 variables names etc. in its namespaces and we
5310 wouldn't want to break common idioms. */
5311 key = PyUnicode_EncodeUTF8(keystart,
5312 keylen,
5313 NULL);
5314 if (key == NULL)
5315 goto onError;
5316 if (args_owned) {
5317 Py_DECREF(args);
5318 args_owned = 0;
5320 args = PyObject_GetItem(dict, key);
5321 Py_DECREF(key);
5322 if (args == NULL) {
5323 goto onError;
5325 args_owned = 1;
5326 arglen = -1;
5327 argidx = -2;
5329 while (--fmtcnt >= 0) {
5330 switch (c = *fmt++) {
5331 case '-': flags |= F_LJUST; continue;
5332 case '+': flags |= F_SIGN; continue;
5333 case ' ': flags |= F_BLANK; continue;
5334 case '#': flags |= F_ALT; continue;
5335 case '0': flags |= F_ZERO; continue;
5337 break;
5339 if (c == '*') {
5340 v = getnextarg(args, arglen, &argidx);
5341 if (v == NULL)
5342 goto onError;
5343 if (!PyInt_Check(v)) {
5344 PyErr_SetString(PyExc_TypeError,
5345 "* wants int");
5346 goto onError;
5348 width = PyInt_AsLong(v);
5349 if (width < 0) {
5350 flags |= F_LJUST;
5351 width = -width;
5353 if (--fmtcnt >= 0)
5354 c = *fmt++;
5356 else if (c >= '0' && c <= '9') {
5357 width = c - '0';
5358 while (--fmtcnt >= 0) {
5359 c = *fmt++;
5360 if (c < '0' || c > '9')
5361 break;
5362 if ((width*10) / 10 != width) {
5363 PyErr_SetString(PyExc_ValueError,
5364 "width too big");
5365 goto onError;
5367 width = width*10 + (c - '0');
5370 if (c == '.') {
5371 prec = 0;
5372 if (--fmtcnt >= 0)
5373 c = *fmt++;
5374 if (c == '*') {
5375 v = getnextarg(args, arglen, &argidx);
5376 if (v == NULL)
5377 goto onError;
5378 if (!PyInt_Check(v)) {
5379 PyErr_SetString(PyExc_TypeError,
5380 "* wants int");
5381 goto onError;
5383 prec = PyInt_AsLong(v);
5384 if (prec < 0)
5385 prec = 0;
5386 if (--fmtcnt >= 0)
5387 c = *fmt++;
5389 else if (c >= '0' && c <= '9') {
5390 prec = c - '0';
5391 while (--fmtcnt >= 0) {
5392 c = Py_CHARMASK(*fmt++);
5393 if (c < '0' || c > '9')
5394 break;
5395 if ((prec*10) / 10 != prec) {
5396 PyErr_SetString(PyExc_ValueError,
5397 "prec too big");
5398 goto onError;
5400 prec = prec*10 + (c - '0');
5403 } /* prec */
5404 if (fmtcnt >= 0) {
5405 if (c == 'h' || c == 'l' || c == 'L') {
5406 if (--fmtcnt >= 0)
5407 c = *fmt++;
5410 if (fmtcnt < 0) {
5411 PyErr_SetString(PyExc_ValueError,
5412 "incomplete format");
5413 goto onError;
5415 if (c != '%') {
5416 v = getnextarg(args, arglen, &argidx);
5417 if (v == NULL)
5418 goto onError;
5420 sign = 0;
5421 fill = ' ';
5422 switch (c) {
5424 case '%':
5425 pbuf = formatbuf;
5426 /* presume that buffer length is at least 1 */
5427 pbuf[0] = '%';
5428 len = 1;
5429 break;
5431 case 's':
5432 case 'r':
5433 if (PyUnicode_Check(v) && c == 's') {
5434 temp = v;
5435 Py_INCREF(temp);
5437 else {
5438 PyObject *unicode;
5439 if (c == 's')
5440 temp = PyObject_Str(v);
5441 else
5442 temp = PyObject_Repr(v);
5443 if (temp == NULL)
5444 goto onError;
5445 if (!PyString_Check(temp)) {
5446 /* XXX Note: this should never happen, since
5447 PyObject_Repr() and PyObject_Str() assure
5448 this */
5449 Py_DECREF(temp);
5450 PyErr_SetString(PyExc_TypeError,
5451 "%s argument has non-string str()");
5452 goto onError;
5454 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5455 PyString_GET_SIZE(temp),
5456 NULL,
5457 "strict");
5458 Py_DECREF(temp);
5459 temp = unicode;
5460 if (temp == NULL)
5461 goto onError;
5463 pbuf = PyUnicode_AS_UNICODE(temp);
5464 len = PyUnicode_GET_SIZE(temp);
5465 if (prec >= 0 && len > prec)
5466 len = prec;
5467 break;
5469 case 'i':
5470 case 'd':
5471 case 'u':
5472 case 'o':
5473 case 'x':
5474 case 'X':
5475 if (c == 'i')
5476 c = 'd';
5477 if (PyLong_Check(v)) {
5478 temp = formatlong(v, flags, prec, c);
5479 if (!temp)
5480 goto onError;
5481 pbuf = PyUnicode_AS_UNICODE(temp);
5482 len = PyUnicode_GET_SIZE(temp);
5483 /* unbounded ints can always produce
5484 a sign character! */
5485 sign = 1;
5487 else {
5488 pbuf = formatbuf;
5489 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5490 flags, prec, c, v);
5491 if (len < 0)
5492 goto onError;
5493 /* only d conversion is signed */
5494 sign = c == 'd';
5496 if (flags & F_ZERO)
5497 fill = '0';
5498 break;
5500 case 'e':
5501 case 'E':
5502 case 'f':
5503 case 'g':
5504 case 'G':
5505 pbuf = formatbuf;
5506 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5507 flags, prec, c, v);
5508 if (len < 0)
5509 goto onError;
5510 sign = 1;
5511 if (flags & F_ZERO)
5512 fill = '0';
5513 break;
5515 case 'c':
5516 pbuf = formatbuf;
5517 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5518 if (len < 0)
5519 goto onError;
5520 break;
5522 default:
5523 PyErr_Format(PyExc_ValueError,
5524 "unsupported format character '%c' (0x%x) "
5525 "at index %i",
5526 (31<=c && c<=126) ? c : '?',
5527 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5528 goto onError;
5530 if (sign) {
5531 if (*pbuf == '-' || *pbuf == '+') {
5532 sign = *pbuf++;
5533 len--;
5535 else if (flags & F_SIGN)
5536 sign = '+';
5537 else if (flags & F_BLANK)
5538 sign = ' ';
5539 else
5540 sign = 0;
5542 if (width < len)
5543 width = len;
5544 if (rescnt < width + (sign != 0)) {
5545 reslen -= rescnt;
5546 rescnt = width + fmtcnt + 100;
5547 reslen += rescnt;
5548 if (_PyUnicode_Resize(&result, reslen) < 0)
5549 return NULL;
5550 res = PyUnicode_AS_UNICODE(result)
5551 + reslen - rescnt;
5553 if (sign) {
5554 if (fill != ' ')
5555 *res++ = sign;
5556 rescnt--;
5557 if (width > len)
5558 width--;
5560 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5561 assert(pbuf[0] == '0');
5562 assert(pbuf[1] == c);
5563 if (fill != ' ') {
5564 *res++ = *pbuf++;
5565 *res++ = *pbuf++;
5567 rescnt -= 2;
5568 width -= 2;
5569 if (width < 0)
5570 width = 0;
5571 len -= 2;
5573 if (width > len && !(flags & F_LJUST)) {
5574 do {
5575 --rescnt;
5576 *res++ = fill;
5577 } while (--width > len);
5579 if (fill == ' ') {
5580 if (sign)
5581 *res++ = sign;
5582 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5583 assert(pbuf[0] == '0');
5584 assert(pbuf[1] == c);
5585 *res++ = *pbuf++;
5586 *res++ = *pbuf++;
5589 Py_UNICODE_COPY(res, pbuf, len);
5590 res += len;
5591 rescnt -= len;
5592 while (--width >= len) {
5593 --rescnt;
5594 *res++ = ' ';
5596 if (dict && (argidx < arglen) && c != '%') {
5597 PyErr_SetString(PyExc_TypeError,
5598 "not all arguments converted");
5599 goto onError;
5601 Py_XDECREF(temp);
5602 } /* '%' */
5603 } /* until end */
5604 if (argidx < arglen && !dict) {
5605 PyErr_SetString(PyExc_TypeError,
5606 "not all arguments converted");
5607 goto onError;
5610 if (args_owned) {
5611 Py_DECREF(args);
5613 Py_DECREF(uformat);
5614 if (_PyUnicode_Resize(&result, reslen - rescnt))
5615 goto onError;
5616 return (PyObject *)result;
5618 onError:
5619 Py_XDECREF(result);
5620 Py_DECREF(uformat);
5621 if (args_owned) {
5622 Py_DECREF(args);
5624 return NULL;
/* Buffer-procedure table for unicode objects.  The four entries are, in
   order, the read-buffer, write-buffer, segment-count and char-buffer
   handlers.  The table is referenced from the tp_as_buffer slot of
   PyUnicode_Type. */
5627 static PyBufferProcs unicode_as_buffer = {
5628 (getreadbufferproc) unicode_buffer_getreadbuf,
5629 (getwritebufferproc) unicode_buffer_getwritebuf,
5630 (getsegcountproc) unicode_buffer_getsegcount,
5631 (getcharbufferproc) unicode_buffer_getcharbuf,
5634 staticforward PyObject *
5635 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5637 static PyObject *
5638 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5640 PyObject *x = NULL;
5641 static char *kwlist[] = {"string", "encoding", "errors", 0};
5642 char *encoding = NULL;
5643 char *errors = NULL;
5645 if (type != &PyUnicode_Type)
5646 return unicode_subtype_new(type, args, kwds);
5647 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5648 kwlist, &x, &encoding, &errors))
5649 return NULL;
5650 if (x == NULL)
5651 return (PyObject *)_PyUnicode_New(0);
5652 return PyUnicode_FromEncodedObject(x, encoding, errors);
5655 static PyObject *
5656 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5658 PyUnicodeObject *tmp, *pnew;
5659 int n;
5661 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5662 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5663 if (tmp == NULL)
5664 return NULL;
5665 assert(PyUnicode_Check(tmp));
5666 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5667 if (pnew == NULL)
5668 return NULL;
5669 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5670 if (pnew->str == NULL) {
5671 _Py_ForgetReference((PyObject *)pnew);
5672 PyObject_DEL(pnew);
5673 return NULL;
5675 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5676 pnew->length = n;
5677 pnew->hash = tmp->hash;
5678 Py_DECREF(tmp);
5679 return (PyObject *)pnew;
/* Documentation string for the unicode type; it is installed in the
   tp_doc slot of PyUnicode_Type.  The literal is runtime text and is
   built with backslash line continuations, so it must not be reflowed
   or have comments inserted between its lines. */
5682 static char unicode_doc[] =
5683 "unicode(string [, encoding[, errors]]) -> object\n\
5685 Create a new Unicode object from the given encoded string.\n\
5686 encoding defaults to the current default string encoding and \n\
5687 errors, defining the error handling, to 'strict'.";
/* Type object for unicode.

   The initializer is positional: each entry fills the slot named in the
   comment beside it, so entries must never be reordered or removed.
   Py_TPFLAGS_BASETYPE is set, and unicode_new (the tp_new slot) hands
   construction for any type other than PyUnicode_Type itself over to
   unicode_subtype_new.  Slots left at 0 are not provided here. */
5689 PyTypeObject PyUnicode_Type = {
5690 PyObject_HEAD_INIT(&PyType_Type)
5691 0, /* ob_size */
5692 "unicode", /* tp_name */
5693 sizeof(PyUnicodeObject), /* tp_basicsize */
5694 0, /* tp_itemsize */
5695 /* Slots */
5696 (destructor)_PyUnicode_Free, /* tp_dealloc */
5697 0, /* tp_print */
5698 0, /* tp_getattr */
5699 0, /* tp_setattr */
5700 (cmpfunc) unicode_compare, /* tp_compare */
5701 (reprfunc) unicode_repr, /* tp_repr */
5702 0, /* tp_as_number */
5703 &unicode_as_sequence, /* tp_as_sequence */
5704 0, /* tp_as_mapping */
5705 (hashfunc) unicode_hash, /* tp_hash */
5706 0, /* tp_call */
5707 (reprfunc) unicode_str, /* tp_str */
5708 PyObject_GenericGetAttr, /* tp_getattro */
5709 0, /* tp_setattro */
5710 &unicode_as_buffer, /* tp_as_buffer */
5711 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5712 unicode_doc, /* tp_doc */
5713 0, /* tp_traverse */
5714 0, /* tp_clear */
5715 0, /* tp_richcompare */
5716 0, /* tp_weaklistoffset */
5717 0, /* tp_iter */
5718 0, /* tp_iternext */
5719 unicode_methods, /* tp_methods */
5720 0, /* tp_members */
5721 0, /* tp_getset */
5722 0, /* tp_base */
5723 0, /* tp_dict */
5724 0, /* tp_descr_get */
5725 0, /* tp_descr_set */
5726 0, /* tp_dictoffset */
5727 0, /* tp_init */
5728 0, /* tp_alloc */
5729 unicode_new, /* tp_new */
5732 /* Initialize the Unicode implementation */
5734 void _PyUnicode_Init(void)
5736 int i;
5738 /* Init the implementation */
5739 unicode_freelist = NULL;
5740 unicode_freelist_size = 0;
5741 unicode_empty = _PyUnicode_New(0);
5742 strcpy(unicode_default_encoding, "ascii");
5743 for (i = 0; i < 256; i++)
5744 unicode_latin1[i] = NULL;
5747 /* Finalize the Unicode implementation */
5749 void
5750 _PyUnicode_Fini(void)
5752 PyUnicodeObject *u;
5753 int i;
5755 Py_XDECREF(unicode_empty);
5756 unicode_empty = NULL;
5758 for (i = 0; i < 256; i++) {
5759 if (unicode_latin1[i]) {
5760 Py_DECREF(unicode_latin1[i]);
5761 unicode_latin1[i] = NULL;
5765 for (u = unicode_freelist; u != NULL;) {
5766 PyUnicodeObject *v = u;
5767 u = *(PyUnicodeObject **)u;
5768 if (v->str)
5769 PyMem_DEL(v->str);
5770 Py_XDECREF(v->defenc);
5771 PyObject_DEL(v);
5773 unicode_freelist = NULL;
5774 unicode_freelist_size = 0;