This commit was manufactured by cvs2svn to create tag
[python/dscho.git] / Objects / unicodeobject.c
blob68afaa05c85fb25c5e3c1a7932b0878d36f2ee9f
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WIN32
45 #include <windows.h>
46 #endif
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
75 #else
76 # define BYTEORDER_IS_LITTLE_ENDIAN
77 #endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject *unicode_freelist;
88 static int unicode_freelist_size;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject *unicode_empty;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject *unicode_latin1[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
137 PyErr_SetString(PyExc_SystemError,
138 "can't resize shared unicode objects");
139 return -1;
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
151 unicode->str[length] = 0;
152 unicode->length = length;
154 reset:
155 /* Reset the object caches */
156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
160 unicode->hash = -1;
162 return 0;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
173 static
174 PyUnicodeObject *_PyUnicode_New(int length)
176 register PyUnicodeObject *unicode;
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
187 unicode_freelist = *(PyUnicodeObject **)unicode;
188 unicode_freelist_size--;
189 if (unicode->str) {
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
193 unicode_resize(unicode, length)) {
194 PyMem_DEL(unicode->str);
195 goto onError;
198 else {
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
201 PyObject_INIT(unicode, &PyUnicode_Type);
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
210 if (!unicode->str) {
211 PyErr_NoMemory();
212 goto onError;
214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
217 unicode->defenc = NULL;
218 return unicode;
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
222 PyObject_DEL(unicode);
223 return NULL;
226 static
227 void unicode_dealloc(register PyUnicodeObject *unicode)
229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233 PyMem_DEL(unicode->str);
234 unicode->str = NULL;
235 unicode->length = 0;
237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
241 /* Add to free list */
242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
246 else {
247 PyMem_DEL(unicode->str);
248 Py_XDECREF(unicode->defenc);
249 unicode->ob_type->tp_free((PyObject *)unicode);
253 int PyUnicode_Resize(PyObject **unicode,
254 int length)
256 register PyUnicodeObject *v;
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
288 /* Internal API for use in unicodeobject.c only ! */
289 #define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
292 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
295 PyUnicodeObject *unicode;
297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
313 if (!unicode)
314 return NULL;
315 unicode->str[0] = *u;
316 unicode_latin1[*u] = unicode;
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
329 Py_UNICODE_COPY(unicode->str, u, size);
331 return (PyObject *)unicode;
334 #ifdef HAVE_WCHAR_H
336 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
339 PyUnicodeObject *unicode;
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
350 /* Copy the wchar_t data into the new object */
351 #ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353 #else
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
361 #endif
363 return (PyObject *)unicode;
366 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376 #ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378 #else
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
386 #endif
388 return size;
391 #endif
393 PyObject *PyUnicode_FromObject(register PyObject *obj)
395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
410 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
414 const char *s = NULL;
415 int len;
416 int owned = 0;
417 PyObject *v;
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
424 #if 0
425 /* For b/w compatibility we also accept Unicode objects provided
426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
438 return NULL;
440 return PyObject_Unicode(obj);
442 #else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
448 #endif
450 /* Coerce object */
451 if (PyString_Check(obj)) {
452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
459 PyErr_Format(PyExc_TypeError,
460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
462 obj->ob_type->tp_name);
463 goto onError;
466 /* Convert to Unicode */
467 if (len == 0) {
468 Py_INCREF(unicode_empty);
469 v = (PyObject *)unicode_empty;
471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
474 if (owned) {
475 Py_DECREF(obj);
477 return v;
479 onError:
480 if (owned) {
481 Py_DECREF(obj);
483 return NULL;
486 PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
491 PyObject *buffer = NULL, *unicode;
493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
498 return PyUnicode_DecodeUTF8(s, size, errors);
499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
513 "decoder did not return an unicode object (type=%.400s)",
514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
518 Py_DECREF(buffer);
519 return unicode;
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
526 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
531 PyObject *v, *unicode;
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
541 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
545 PyObject *v;
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
558 return PyUnicode_AsUTF8String(unicode);
559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
572 "encoder did not return a string object (type=%.400s)",
573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
577 return v;
579 onError:
580 return NULL;
583 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
596 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
602 return PyUnicode_AS_UNICODE(unicode);
604 onError:
605 return NULL;
608 int PyUnicode_GetSize(PyObject *unicode)
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
614 return PyUnicode_GET_SIZE(unicode);
616 onError:
617 return -1;
620 const char *PyUnicode_GetDefaultEncoding(void)
622 return unicode_default_encoding;
625 int PyUnicode_SetDefaultEncoding(const char *encoding)
627 PyObject *v;
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
640 onError:
641 return -1;
644 /* --- UTF-7 Codec -------------------------------------------------------- */
646 /* see RFC2152 for details */
/* Classification of the 128 ASCII characters for the UTF-7 codec
   (see RFC2152).  The table is only ever read, hence const.

   Indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
/* Helper macros for the UTF-7 codec.  They are #undef'ed again after
   the codec functions.

   SPECIAL(c, encodeO, encodeWS)
       true when character c must not be written directly: anything
       above 127, anything classified 1 in utf7_special, plus
       whitespace (class 2) or Set O (class 3) when the matching flag
       is set.
   B64(n)       base64 digit for the low six bits of n.
   B64CHAR(c)   true when c is a base64 digit.  The value is range
                checked before it reaches isalnum(), whose behaviour
                is undefined for arguments that are neither EOF nor
                representable as unsigned char.
   UB64(c)      numeric value of base64 digit c.
   ENCODE       emit base64 digits while at least 6 bits are pending.
   DECODE       emit characters while at least 16 bits are pending;
                relies on `errmsg` and the `utf7Error` label of the
                function it is expanded in. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) ((unsigned long)(c) < 128 && \
                    (isalnum(c) || (c) == '+' || (c) == '/'))
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high */ \
            /* surrogate so let's not bother seeing if the low */ \
            /* surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't */ \
            /* represent it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
702 static
703 int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
722 return 0;
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
732 PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
751 p = unicode->str;
752 e = s + size;
754 while (s < e) {
755 Py_UNICODE ch = *s;
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
806 inShift = 1;
807 bitsleft = 0;
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
815 else {
816 *p++ = ch;
817 s++;
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
833 return (PyObject *)unicode;
835 onError:
836 Py_DECREF(unicode);
837 return NULL;
841 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
917 else {
918 *out++ = '-';
919 inShift = 0;
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
934 return v;
937 #undef SPECIAL
938 #undef B64
939 #undef B64CHAR
940 #undef UB64
941 #undef ENCODE
942 #undef DECODE
944 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
968 static
969 int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
977 "UTF-8 decoding error: %.400s",
978 details);
979 return -1;
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
991 else {
992 PyErr_Format(PyExc_ValueError,
993 "UTF-8 decoding error; unknown error handling code: %.400s",
994 errors);
995 return -1;
999 PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
1007 const char *errmsg = "";
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1021 while (s < e) {
1022 Py_UCS4 ch = (unsigned char)*s;
1024 if (ch < 0x80) {
1025 *p++ = (Py_UNICODE)ch;
1026 s++;
1027 continue;
1030 n = utf8_code_length[ch];
1032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1037 switch (n) {
1039 case 0:
1040 errmsg = "unexpected code byte";
1041 goto utf8Error;
1043 case 1:
1044 errmsg = "internal error";
1045 goto utf8Error;
1047 case 2:
1048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1057 else
1058 *p++ = (Py_UNICODE)ch;
1059 break;
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
1063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1068 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1069 errmsg = "illegal encoding";
1070 goto utf8Error;
1072 else
1073 *p++ = (Py_UNICODE)ch;
1074 break;
1076 case 4:
1077 if ((s[1] & 0xc0) != 0x80 ||
1078 (s[2] & 0xc0) != 0x80 ||
1079 (s[3] & 0xc0) != 0x80) {
1080 errmsg = "invalid data";
1081 goto utf8Error;
1083 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1084 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1085 /* validate and convert to UTF-16 */
1086 if ((ch < 0x10000) /* minimum value allowed for 4
1087 byte encoding */
1088 || (ch > 0x10ffff)) /* maximum value allowed for
1089 UTF-16 */
1091 errmsg = "illegal encoding";
1092 goto utf8Error;
1094 #ifdef Py_UNICODE_WIDE
1095 *p++ = (Py_UNICODE)ch;
1096 #else
1097 /* compute and append the two surrogates: */
1099 /* translate from 10000..10FFFF to 0..FFFF */
1100 ch -= 0x10000;
1102 /* high surrogate = top 10 bits added to D800 */
1103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1105 /* low surrogate = bottom 10 bits added to DC00 */
1106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1107 #endif
1108 break;
1110 default:
1111 /* Other sizes are only needed for UCS-4 */
1112 errmsg = "unsupported Unicode code range";
1113 goto utf8Error;
1115 s += n;
1116 continue;
1118 utf8Error:
1119 if (utf8_decoding_error(&s, &p, errors, errmsg))
1120 goto onError;
1123 /* Adjust length */
1124 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1125 goto onError;
1127 return (PyObject *)unicode;
1129 onError:
1130 Py_DECREF(unicode);
1131 return NULL;
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept compiled out for reference. */
#if 0
/* Apply the error handling policy named by `errors` to a UTF-8
   encoding problem: strict raises UnicodeError, ignore emits nothing,
   replace writes '?' at *dest, anything else raises ValueError. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1168 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169 int size,
1170 const char *errors)
1172 PyObject *v;
1173 char *p;
1174 char *q;
1175 Py_UCS4 ch2;
1176 unsigned int cbAllocated = 3 * size;
1177 unsigned int cbWritten = 0;
1178 int i = 0;
1180 v = PyString_FromStringAndSize(NULL, cbAllocated);
1181 if (v == NULL)
1182 return NULL;
1183 if (size == 0)
1184 return v;
1186 p = q = PyString_AS_STRING(v);
1187 while (i < size) {
1188 Py_UCS4 ch = s[i++];
1189 if (ch < 0x80) {
1190 *p++ = (char) ch;
1191 cbWritten++;
1193 else if (ch < 0x0800) {
1194 *p++ = 0xc0 | (ch >> 6);
1195 *p++ = 0x80 | (ch & 0x3f);
1196 cbWritten += 2;
1198 else if (ch < 0x10000) {
1199 /* Check for high surrogate */
1200 if (0xD800 <= ch && ch <= 0xDBFF) {
1201 if (i != size) {
1202 ch2 = s[i];
1203 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1205 if (cbWritten >= (cbAllocated - 4)) {
1206 /* Provide enough room for some more
1207 surrogates */
1208 cbAllocated += 4*10;
1209 if (_PyString_Resize(&v, cbAllocated))
1210 goto onError;
1213 /* combine the two values */
1214 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1216 *p++ = (char)((ch >> 18) | 0xf0);
1217 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1218 i++;
1219 cbWritten += 4;
1223 else {
1224 *p++ = (char)(0xe0 | (ch >> 12));
1225 cbWritten += 3;
1227 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1228 *p++ = (char)(0x80 | (ch & 0x3f));
1229 } else {
1230 *p++ = 0xf0 | (ch>>18);
1231 *p++ = 0x80 | ((ch>>12) & 0x3f);
1232 *p++ = 0x80 | ((ch>>6) & 0x3f);
1233 *p++ = 0x80 | (ch & 0x3f);
1234 cbWritten += 4;
1237 *p = '\0';
1238 if (_PyString_Resize(&v, p - q))
1239 goto onError;
1240 return v;
1242 onError:
1243 Py_DECREF(v);
1244 return NULL;
1247 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1249 if (!PyUnicode_Check(unicode)) {
1250 PyErr_BadArgument();
1251 return NULL;
1253 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1254 PyUnicode_GET_SIZE(unicode),
1255 NULL);
1258 /* --- UTF-16 Codec ------------------------------------------------------- */
1260 static
1261 int utf16_decoding_error(Py_UNICODE **dest,
1262 const char *errors,
1263 const char *details)
1265 if ((errors == NULL) ||
1266 (strcmp(errors,"strict") == 0)) {
1267 PyErr_Format(PyExc_UnicodeError,
1268 "UTF-16 decoding error: %.400s",
1269 details);
1270 return -1;
1272 else if (strcmp(errors,"ignore") == 0) {
1273 return 0;
1275 else if (strcmp(errors,"replace") == 0) {
1276 if (dest) {
1277 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1278 (*dest)++;
1280 return 0;
1282 else {
1283 PyErr_Format(PyExc_ValueError,
1284 "UTF-16 decoding error; "
1285 "unknown error handling code: %.400s",
1286 errors);
1287 return -1;
1291 PyObject *
1292 PyUnicode_DecodeUTF16(const char *s,
1293 int size,
1294 const char *errors,
1295 int *byteorder)
1297 PyUnicodeObject *unicode;
1298 Py_UNICODE *p;
1299 const unsigned char *q, *e;
1300 int bo = 0; /* assume native ordering by default */
1301 const char *errmsg = "";
1302 /* Offsets from q for retrieving byte pairs in the right order. */
1303 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1304 int ihi = 1, ilo = 0;
1305 #else
1306 int ihi = 0, ilo = 1;
1307 #endif
1309 /* size should be an even number */
1310 if (size & 1) {
1311 if (utf16_decoding_error(NULL, errors, "truncated data"))
1312 return NULL;
1313 --size; /* else ignore the oddball byte */
1316 /* Note: size will always be longer than the resulting Unicode
1317 character count */
1318 unicode = _PyUnicode_New(size);
1319 if (!unicode)
1320 return NULL;
1321 if (size == 0)
1322 return (PyObject *)unicode;
1324 /* Unpack UTF-16 encoded data */
1325 p = unicode->str;
1326 q = (unsigned char *)s;
1327 e = q + size;
1329 if (byteorder)
1330 bo = *byteorder;
1332 /* Check for BOM marks (U+FEFF) in the input and adjust current
1333 byte order setting accordingly. In native mode, the leading BOM
1334 mark is skipped, in all other modes, it is copied to the output
1335 stream as-is (giving a ZWNBSP character). */
1336 if (bo == 0) {
1337 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1338 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1339 if (bom == 0xFEFF) {
1340 q += 2;
1341 bo = -1;
1343 else if (bom == 0xFFFE) {
1344 q += 2;
1345 bo = 1;
1347 #else
1348 if (bom == 0xFEFF) {
1349 q += 2;
1350 bo = 1;
1352 else if (bom == 0xFFFE) {
1353 q += 2;
1354 bo = -1;
1356 #endif
1359 if (bo == -1) {
1360 /* force LE */
1361 ihi = 1;
1362 ilo = 0;
1364 else if (bo == 1) {
1365 /* force BE */
1366 ihi = 0;
1367 ilo = 1;
1370 while (q < e) {
1371 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1372 q += 2;
1374 if (ch < 0xD800 || ch > 0xDFFF) {
1375 *p++ = ch;
1376 continue;
1379 /* UTF-16 code pair: */
1380 if (q >= e) {
1381 errmsg = "unexpected end of data";
1382 goto utf16Error;
1384 if (0xD800 <= ch && ch <= 0xDBFF) {
1385 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1386 q += 2;
1387 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1388 #ifndef Py_UNICODE_WIDE
1389 *p++ = ch;
1390 *p++ = ch2;
1391 #else
1392 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1393 #endif
1394 continue;
1396 else {
1397 errmsg = "illegal UTF-16 surrogate";
1398 goto utf16Error;
1402 errmsg = "illegal encoding";
1403 /* Fall through to report the error */
1405 utf16Error:
1406 if (utf16_decoding_error(&p, errors, errmsg))
1407 goto onError;
1410 if (byteorder)
1411 *byteorder = bo;
1413 /* Adjust length */
1414 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1415 goto onError;
1417 return (PyObject *)unicode;
1419 onError:
1420 Py_DECREF(unicode);
1421 return NULL;
1424 PyObject *
1425 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1426 int size,
1427 const char *errors,
1428 int byteorder)
1430 PyObject *v;
1431 unsigned char *p;
1432 int i, pairs;
1433 /* Offsets from p for storing byte pairs in the right order. */
1434 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1435 int ihi = 1, ilo = 0;
1436 #else
1437 int ihi = 0, ilo = 1;
1438 #endif
1440 #define STORECHAR(CH) \
1441 do { \
1442 p[ihi] = ((CH) >> 8) & 0xff; \
1443 p[ilo] = (CH) & 0xff; \
1444 p += 2; \
1445 } while(0)
1447 for (i = pairs = 0; i < size; i++)
1448 if (s[i] >= 0x10000)
1449 pairs++;
1450 v = PyString_FromStringAndSize(NULL,
1451 2 * (size + pairs + (byteorder == 0)));
1452 if (v == NULL)
1453 return NULL;
1455 p = (unsigned char *)PyString_AS_STRING(v);
1456 if (byteorder == 0)
1457 STORECHAR(0xFEFF);
1458 if (size == 0)
1459 return v;
1461 if (byteorder == -1) {
1462 /* force LE */
1463 ihi = 1;
1464 ilo = 0;
1466 else if (byteorder == 1) {
1467 /* force BE */
1468 ihi = 0;
1469 ilo = 1;
1472 while (size-- > 0) {
1473 Py_UNICODE ch = *s++;
1474 Py_UNICODE ch2 = 0;
1475 if (ch >= 0x10000) {
1476 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1477 ch = 0xD800 | ((ch-0x10000) >> 10);
1479 STORECHAR(ch);
1480 if (ch2)
1481 STORECHAR(ch2);
1483 return v;
1484 #undef STORECHAR
1487 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1489 if (!PyUnicode_Check(unicode)) {
1490 PyErr_BadArgument();
1491 return NULL;
1493 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1494 PyUnicode_GET_SIZE(unicode),
1495 NULL,
1499 /* --- Unicode Escape Codec ----------------------------------------------- */
1501 static
1502 int unicodeescape_decoding_error(const char **source,
1503 Py_UNICODE *x,
1504 const char *errors,
1505 const char *details)
1507 if ((errors == NULL) ||
1508 (strcmp(errors,"strict") == 0)) {
1509 PyErr_Format(PyExc_UnicodeError,
1510 "Unicode-Escape decoding error: %.400s",
1511 details);
1512 return -1;
1514 else if (strcmp(errors,"ignore") == 0) {
1515 return 0;
1517 else if (strcmp(errors,"replace") == 0) {
1518 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1519 return 0;
1521 else {
1522 PyErr_Format(PyExc_ValueError,
1523 "Unicode-Escape decoding error; "
1524 "unknown error handling code: %.400s",
1525 errors);
1526 return -1;
1530 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1532 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1533 int size,
1534 const char *errors)
1536 PyUnicodeObject *v;
1537 Py_UNICODE *p, *buf;
1538 const char *end;
1539 char* message;
1540 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1542 /* Escaped strings will always be longer than the resulting
1543 Unicode string, so we start with size here and then reduce the
1544 length after conversion to the true value. */
1545 v = _PyUnicode_New(size);
1546 if (v == NULL)
1547 goto onError;
1548 if (size == 0)
1549 return (PyObject *)v;
1551 p = buf = PyUnicode_AS_UNICODE(v);
1552 end = s + size;
1554 while (s < end) {
1555 unsigned char c;
1556 Py_UNICODE x;
1557 int i, digits;
1559 /* Non-escape characters are interpreted as Unicode ordinals */
1560 if (*s != '\\') {
1561 *p++ = (unsigned char) *s++;
1562 continue;
1565 /* \ - Escapes */
1566 s++;
1567 switch (*s++) {
1569 /* \x escapes */
1570 case '\n': break;
1571 case '\\': *p++ = '\\'; break;
1572 case '\'': *p++ = '\''; break;
1573 case '\"': *p++ = '\"'; break;
1574 case 'b': *p++ = '\b'; break;
1575 case 'f': *p++ = '\014'; break; /* FF */
1576 case 't': *p++ = '\t'; break;
1577 case 'n': *p++ = '\n'; break;
1578 case 'r': *p++ = '\r'; break;
1579 case 'v': *p++ = '\013'; break; /* VT */
1580 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1582 /* \OOO (octal) escapes */
1583 case '0': case '1': case '2': case '3':
1584 case '4': case '5': case '6': case '7':
1585 x = s[-1] - '0';
1586 if ('0' <= *s && *s <= '7') {
1587 x = (x<<3) + *s++ - '0';
1588 if ('0' <= *s && *s <= '7')
1589 x = (x<<3) + *s++ - '0';
1591 *p++ = x;
1592 break;
1594 /* hex escapes */
1595 /* \xXX */
1596 case 'x':
1597 digits = 2;
1598 message = "truncated \\xXX escape";
1599 goto hexescape;
1601 /* \uXXXX */
1602 case 'u':
1603 digits = 4;
1604 message = "truncated \\uXXXX escape";
1605 goto hexescape;
1607 /* \UXXXXXXXX */
1608 case 'U':
1609 digits = 8;
1610 message = "truncated \\UXXXXXXXX escape";
1611 hexescape:
1612 chr = 0;
1613 for (i = 0; i < digits; i++) {
1614 c = (unsigned char) s[i];
1615 if (!isxdigit(c)) {
1616 if (unicodeescape_decoding_error(&s, &x, errors, message))
1617 goto onError;
1618 chr = x;
1619 i++;
1620 break;
1622 chr = (chr<<4) & ~0xF;
1623 if (c >= '0' && c <= '9')
1624 chr += c - '0';
1625 else if (c >= 'a' && c <= 'f')
1626 chr += 10 + c - 'a';
1627 else
1628 chr += 10 + c - 'A';
1630 s += i;
1631 store:
1632 /* when we get here, chr is a 32-bit unicode character */
1633 if (chr <= 0xffff)
1634 /* UCS-2 character */
1635 *p++ = (Py_UNICODE) chr;
1636 else if (chr <= 0x10ffff) {
1637 /* UCS-4 character. Either store directly, or as
1638 surrogate pair. */
1639 #ifdef Py_UNICODE_WIDE
1640 *p++ = chr;
1641 #else
1642 chr -= 0x10000L;
1643 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1644 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1645 #endif
1646 } else {
1647 if (unicodeescape_decoding_error(
1648 &s, &x, errors,
1649 "illegal Unicode character")
1651 goto onError;
1652 *p++ = x; /* store replacement character */
1654 break;
1656 /* \N{name} */
1657 case 'N':
1658 message = "malformed \\N character escape";
1659 if (ucnhash_CAPI == NULL) {
1660 /* load the unicode data module */
1661 PyObject *m, *v;
1662 m = PyImport_ImportModule("unicodedata");
1663 if (m == NULL)
1664 goto ucnhashError;
1665 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1666 Py_DECREF(m);
1667 if (v == NULL)
1668 goto ucnhashError;
1669 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1670 Py_DECREF(v);
1671 if (ucnhash_CAPI == NULL)
1672 goto ucnhashError;
1674 if (*s == '{') {
1675 const char *start = s+1;
1676 /* look for the closing brace */
1677 while (*s != '}' && s < end)
1678 s++;
1679 if (s > start && s < end && *s == '}') {
1680 /* found a name. look it up in the unicode database */
1681 message = "unknown Unicode character name";
1682 s++;
1683 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1684 goto store;
1687 if (unicodeescape_decoding_error(&s, &x, errors, message))
1688 goto onError;
1689 *p++ = x;
1690 break;
1692 default:
1693 *p++ = '\\';
1694 *p++ = (unsigned char)s[-1];
1695 break;
1698 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1699 goto onError;
1700 return (PyObject *)v;
1702 ucnhashError:
1703 PyErr_SetString(
1704 PyExc_UnicodeError,
1705 "\\N escapes not supported (can't load unicodedata module)"
1707 return NULL;
1709 onError:
1710 Py_XDECREF(v);
1711 return NULL;
1714 /* Return a Unicode-Escape string version of the Unicode object.
1716 If quotes is true, the string is enclosed in u"" or u'' quotes as
1717 appropriate.
1721 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1722 int size,
1723 Py_UNICODE ch);
1725 static
1726 PyObject *unicodeescape_string(const Py_UNICODE *s,
1727 int size,
1728 int quotes)
1730 PyObject *repr;
1731 char *p;
1733 static const char *hexdigit = "0123456789abcdef";
1735 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1736 if (repr == NULL)
1737 return NULL;
1739 p = PyString_AS_STRING(repr);
1741 if (quotes) {
1742 *p++ = 'u';
1743 *p++ = (findchar(s, size, '\'') &&
1744 !findchar(s, size, '"')) ? '"' : '\'';
1746 while (size-- > 0) {
1747 Py_UNICODE ch = *s++;
1749 /* Escape quotes */
1750 if (quotes &&
1751 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1752 *p++ = '\\';
1753 *p++ = (char) ch;
1754 continue;
1757 #ifdef Py_UNICODE_WIDE
1758 /* Map 21-bit characters to '\U00xxxxxx' */
1759 else if (ch >= 0x10000) {
1760 int offset = p - PyString_AS_STRING(repr);
1762 /* Resize the string if necessary */
1763 if (offset + 12 > PyString_GET_SIZE(repr)) {
1764 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1765 goto onError;
1766 p = PyString_AS_STRING(repr) + offset;
1769 *p++ = '\\';
1770 *p++ = 'U';
1771 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1772 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1773 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1774 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1777 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1778 *p++ = hexdigit[ch & 0x0000000F];
1779 continue;
1781 #endif
1782 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1783 else if (ch >= 0xD800 && ch < 0xDC00) {
1784 Py_UNICODE ch2;
1785 Py_UCS4 ucs;
1787 ch2 = *s++;
1788 size--;
1789 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1790 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1791 *p++ = '\\';
1792 *p++ = 'U';
1793 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1794 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1795 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1796 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1799 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1800 *p++ = hexdigit[ucs & 0x0000000F];
1801 continue;
1803 /* Fall through: isolated surrogates are copied as-is */
1804 s--;
1805 size++;
1808 /* Map 16-bit characters to '\uxxxx' */
1809 if (ch >= 256) {
1810 *p++ = '\\';
1811 *p++ = 'u';
1812 *p++ = hexdigit[(ch >> 12) & 0x000F];
1813 *p++ = hexdigit[(ch >> 8) & 0x000F];
1814 *p++ = hexdigit[(ch >> 4) & 0x000F];
1815 *p++ = hexdigit[ch & 0x000F];
1818 /* Map special whitespace to '\t', \n', '\r' */
1819 else if (ch == '\t') {
1820 *p++ = '\\';
1821 *p++ = 't';
1823 else if (ch == '\n') {
1824 *p++ = '\\';
1825 *p++ = 'n';
1827 else if (ch == '\r') {
1828 *p++ = '\\';
1829 *p++ = 'r';
1832 /* Map non-printable US ASCII to '\xhh' */
1833 else if (ch < ' ' || ch >= 0x7F) {
1834 *p++ = '\\';
1835 *p++ = 'x';
1836 *p++ = hexdigit[(ch >> 4) & 0x000F];
1837 *p++ = hexdigit[ch & 0x000F];
1840 /* Copy everything else as-is */
1841 else
1842 *p++ = (char) ch;
1844 if (quotes)
1845 *p++ = PyString_AS_STRING(repr)[1];
1847 *p = '\0';
1848 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1849 goto onError;
1851 return repr;
1853 onError:
1854 Py_DECREF(repr);
1855 return NULL;
1858 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1859 int size)
1861 return unicodeescape_string(s, size, 0);
1864 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1866 if (!PyUnicode_Check(unicode)) {
1867 PyErr_BadArgument();
1868 return NULL;
1870 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1871 PyUnicode_GET_SIZE(unicode));
1874 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1876 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1877 int size,
1878 const char *errors)
1880 PyUnicodeObject *v;
1881 Py_UNICODE *p, *buf;
1882 const char *end;
1883 const char *bs;
1885 /* Escaped strings will always be longer than the resulting
1886 Unicode string, so we start with size here and then reduce the
1887 length after conversion to the true value. */
1888 v = _PyUnicode_New(size);
1889 if (v == NULL)
1890 goto onError;
1891 if (size == 0)
1892 return (PyObject *)v;
1893 p = buf = PyUnicode_AS_UNICODE(v);
1894 end = s + size;
1895 while (s < end) {
1896 unsigned char c;
1897 Py_UNICODE x;
1898 int i;
1900 /* Non-escape characters are interpreted as Unicode ordinals */
1901 if (*s != '\\') {
1902 *p++ = (unsigned char)*s++;
1903 continue;
1906 /* \u-escapes are only interpreted iff the number of leading
1907 backslashes if odd */
1908 bs = s;
1909 for (;s < end;) {
1910 if (*s != '\\')
1911 break;
1912 *p++ = (unsigned char)*s++;
1914 if (((s - bs) & 1) == 0 ||
1915 s >= end ||
1916 *s != 'u') {
1917 continue;
1919 p--;
1920 s++;
1922 /* \uXXXX with 4 hex digits */
1923 for (x = 0, i = 0; i < 4; i++) {
1924 c = (unsigned char)s[i];
1925 if (!isxdigit(c)) {
1926 if (unicodeescape_decoding_error(&s, &x, errors,
1927 "truncated \\uXXXX"))
1928 goto onError;
1929 i++;
1930 break;
1932 x = (x<<4) & ~0xF;
1933 if (c >= '0' && c <= '9')
1934 x += c - '0';
1935 else if (c >= 'a' && c <= 'f')
1936 x += 10 + c - 'a';
1937 else
1938 x += 10 + c - 'A';
1940 s += i;
1941 *p++ = x;
1943 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1944 goto onError;
1945 return (PyObject *)v;
1947 onError:
1948 Py_XDECREF(v);
1949 return NULL;
1952 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1953 int size)
1955 PyObject *repr;
1956 char *p;
1957 char *q;
1959 static const char *hexdigit = "0123456789abcdef";
1961 repr = PyString_FromStringAndSize(NULL, 6 * size);
1962 if (repr == NULL)
1963 return NULL;
1964 if (size == 0)
1965 return repr;
1967 p = q = PyString_AS_STRING(repr);
1968 while (size-- > 0) {
1969 Py_UNICODE ch = *s++;
1970 /* Map 16-bit characters to '\uxxxx' */
1971 if (ch >= 256) {
1972 *p++ = '\\';
1973 *p++ = 'u';
1974 *p++ = hexdigit[(ch >> 12) & 0xf];
1975 *p++ = hexdigit[(ch >> 8) & 0xf];
1976 *p++ = hexdigit[(ch >> 4) & 0xf];
1977 *p++ = hexdigit[ch & 15];
1979 /* Copy everything else as-is */
1980 else
1981 *p++ = (char) ch;
1983 *p = '\0';
1984 if (_PyString_Resize(&repr, p - q))
1985 goto onError;
1987 return repr;
1989 onError:
1990 Py_DECREF(repr);
1991 return NULL;
1994 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1996 if (!PyUnicode_Check(unicode)) {
1997 PyErr_BadArgument();
1998 return NULL;
2000 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2001 PyUnicode_GET_SIZE(unicode));
2004 /* --- Latin-1 Codec ------------------------------------------------------ */
2006 PyObject *PyUnicode_DecodeLatin1(const char *s,
2007 int size,
2008 const char *errors)
2010 PyUnicodeObject *v;
2011 Py_UNICODE *p;
2013 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2014 if (size == 1 && *(unsigned char*)s < 256) {
2015 Py_UNICODE r = *(unsigned char*)s;
2016 return PyUnicode_FromUnicode(&r, 1);
2019 v = _PyUnicode_New(size);
2020 if (v == NULL)
2021 goto onError;
2022 if (size == 0)
2023 return (PyObject *)v;
2024 p = PyUnicode_AS_UNICODE(v);
2025 while (size-- > 0)
2026 *p++ = (unsigned char)*s++;
2027 return (PyObject *)v;
2029 onError:
2030 Py_XDECREF(v);
2031 return NULL;
2034 static
2035 int latin1_encoding_error(const Py_UNICODE **source,
2036 char **dest,
2037 const char *errors,
2038 const char *details)
2040 if ((errors == NULL) ||
2041 (strcmp(errors,"strict") == 0)) {
2042 PyErr_Format(PyExc_UnicodeError,
2043 "Latin-1 encoding error: %.400s",
2044 details);
2045 return -1;
2047 else if (strcmp(errors,"ignore") == 0) {
2048 return 0;
2050 else if (strcmp(errors,"replace") == 0) {
2051 **dest = '?';
2052 (*dest)++;
2053 return 0;
2055 else {
2056 PyErr_Format(PyExc_ValueError,
2057 "Latin-1 encoding error; "
2058 "unknown error handling code: %.400s",
2059 errors);
2060 return -1;
2064 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2065 int size,
2066 const char *errors)
2068 PyObject *repr;
2069 char *s, *start;
2071 repr = PyString_FromStringAndSize(NULL, size);
2072 if (repr == NULL)
2073 return NULL;
2074 if (size == 0)
2075 return repr;
2077 s = PyString_AS_STRING(repr);
2078 start = s;
2079 while (size-- > 0) {
2080 Py_UNICODE ch = *p++;
2081 if (ch >= 256) {
2082 if (latin1_encoding_error(&p, &s, errors,
2083 "ordinal not in range(256)"))
2084 goto onError;
2086 else
2087 *s++ = (char)ch;
2089 /* Resize if error handling skipped some characters */
2090 if (s - start < PyString_GET_SIZE(repr))
2091 if (_PyString_Resize(&repr, s - start))
2092 goto onError;
2093 return repr;
2095 onError:
2096 Py_DECREF(repr);
2097 return NULL;
2100 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2102 if (!PyUnicode_Check(unicode)) {
2103 PyErr_BadArgument();
2104 return NULL;
2106 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2107 PyUnicode_GET_SIZE(unicode),
2108 NULL);
2111 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2113 static
2114 int ascii_decoding_error(const char **source,
2115 Py_UNICODE **dest,
2116 const char *errors,
2117 const char *details)
2119 if ((errors == NULL) ||
2120 (strcmp(errors,"strict") == 0)) {
2121 PyErr_Format(PyExc_UnicodeError,
2122 "ASCII decoding error: %.400s",
2123 details);
2124 return -1;
2126 else if (strcmp(errors,"ignore") == 0) {
2127 return 0;
2129 else if (strcmp(errors,"replace") == 0) {
2130 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2131 (*dest)++;
2132 return 0;
2134 else {
2135 PyErr_Format(PyExc_ValueError,
2136 "ASCII decoding error; "
2137 "unknown error handling code: %.400s",
2138 errors);
2139 return -1;
2143 PyObject *PyUnicode_DecodeASCII(const char *s,
2144 int size,
2145 const char *errors)
2147 PyUnicodeObject *v;
2148 Py_UNICODE *p;
2150 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2151 if (size == 1 && *(unsigned char*)s < 128) {
2152 Py_UNICODE r = *(unsigned char*)s;
2153 return PyUnicode_FromUnicode(&r, 1);
2156 v = _PyUnicode_New(size);
2157 if (v == NULL)
2158 goto onError;
2159 if (size == 0)
2160 return (PyObject *)v;
2161 p = PyUnicode_AS_UNICODE(v);
2162 while (size-- > 0) {
2163 register unsigned char c;
2165 c = (unsigned char)*s++;
2166 if (c < 128)
2167 *p++ = c;
2168 else if (ascii_decoding_error(&s, &p, errors,
2169 "ordinal not in range(128)"))
2170 goto onError;
2172 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2173 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2174 goto onError;
2175 return (PyObject *)v;
2177 onError:
2178 Py_XDECREF(v);
2179 return NULL;
2182 static
2183 int ascii_encoding_error(const Py_UNICODE **source,
2184 char **dest,
2185 const char *errors,
2186 const char *details)
2188 if ((errors == NULL) ||
2189 (strcmp(errors,"strict") == 0)) {
2190 PyErr_Format(PyExc_UnicodeError,
2191 "ASCII encoding error: %.400s",
2192 details);
2193 return -1;
2195 else if (strcmp(errors,"ignore") == 0) {
2196 return 0;
2198 else if (strcmp(errors,"replace") == 0) {
2199 **dest = '?';
2200 (*dest)++;
2201 return 0;
2203 else {
2204 PyErr_Format(PyExc_ValueError,
2205 "ASCII encoding error; "
2206 "unknown error handling code: %.400s",
2207 errors);
2208 return -1;
2212 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2213 int size,
2214 const char *errors)
2216 PyObject *repr;
2217 char *s, *start;
2219 repr = PyString_FromStringAndSize(NULL, size);
2220 if (repr == NULL)
2221 return NULL;
2222 if (size == 0)
2223 return repr;
2225 s = PyString_AS_STRING(repr);
2226 start = s;
2227 while (size-- > 0) {
2228 Py_UNICODE ch = *p++;
2229 if (ch >= 128) {
2230 if (ascii_encoding_error(&p, &s, errors,
2231 "ordinal not in range(128)"))
2232 goto onError;
2234 else
2235 *s++ = (char)ch;
2237 /* Resize if error handling skipped some characters */
2238 if (s - start < PyString_GET_SIZE(repr))
2239 if (_PyString_Resize(&repr, s - start))
2240 goto onError;
2241 return repr;
2243 onError:
2244 Py_DECREF(repr);
2245 return NULL;
2248 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2250 if (!PyUnicode_Check(unicode)) {
2251 PyErr_BadArgument();
2252 return NULL;
2254 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2255 PyUnicode_GET_SIZE(unicode),
2256 NULL);
2259 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2261 /* --- MBCS codecs for Windows -------------------------------------------- */
2263 PyObject *PyUnicode_DecodeMBCS(const char *s,
2264 int size,
2265 const char *errors)
2267 PyUnicodeObject *v;
2268 Py_UNICODE *p;
2270 /* First get the size of the result */
2271 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2272 if (size > 0 && usize==0)
2273 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2275 v = _PyUnicode_New(usize);
2276 if (v == NULL)
2277 return NULL;
2278 if (usize == 0)
2279 return (PyObject *)v;
2280 p = PyUnicode_AS_UNICODE(v);
2281 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2282 Py_DECREF(v);
2283 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2286 return (PyObject *)v;
2289 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2290 int size,
2291 const char *errors)
2293 PyObject *repr;
2294 char *s;
2295 DWORD mbcssize;
2297 /* If there are no characters, bail now! */
2298 if (size==0)
2299 return PyString_FromString("");
2301 /* First get the size of the result */
2302 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2303 if (mbcssize==0)
2304 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2306 repr = PyString_FromStringAndSize(NULL, mbcssize);
2307 if (repr == NULL)
2308 return NULL;
2309 if (mbcssize == 0)
2310 return repr;
2312 /* Do the conversion */
2313 s = PyString_AS_STRING(repr);
2314 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2315 Py_DECREF(repr);
2316 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2318 return repr;
2321 #endif /* MS_WIN32 */
2323 /* --- Character Mapping Codec -------------------------------------------- */
2325 static
2326 int charmap_decoding_error(const char **source,
2327 Py_UNICODE **dest,
2328 const char *errors,
2329 const char *details)
2331 if ((errors == NULL) ||
2332 (strcmp(errors,"strict") == 0)) {
2333 PyErr_Format(PyExc_UnicodeError,
2334 "charmap decoding error: %.400s",
2335 details);
2336 return -1;
2338 else if (strcmp(errors,"ignore") == 0) {
2339 return 0;
2341 else if (strcmp(errors,"replace") == 0) {
2342 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2343 (*dest)++;
2344 return 0;
2346 else {
2347 PyErr_Format(PyExc_ValueError,
2348 "charmap decoding error; "
2349 "unknown error handling code: %.400s",
2350 errors);
2351 return -1;
2355 PyObject *PyUnicode_DecodeCharmap(const char *s,
2356 int size,
2357 PyObject *mapping,
2358 const char *errors)
2360 PyUnicodeObject *v;
2361 Py_UNICODE *p;
2362 int extrachars = 0;
2364 /* Default to Latin-1 */
2365 if (mapping == NULL)
2366 return PyUnicode_DecodeLatin1(s, size, errors);
2368 v = _PyUnicode_New(size);
2369 if (v == NULL)
2370 goto onError;
2371 if (size == 0)
2372 return (PyObject *)v;
2373 p = PyUnicode_AS_UNICODE(v);
2374 while (size-- > 0) {
2375 unsigned char ch = *s++;
2376 PyObject *w, *x;
2378 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2379 w = PyInt_FromLong((long)ch);
2380 if (w == NULL)
2381 goto onError;
2382 x = PyObject_GetItem(mapping, w);
2383 Py_DECREF(w);
2384 if (x == NULL) {
2385 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2386 /* No mapping found means: mapping is undefined. */
2387 PyErr_Clear();
2388 x = Py_None;
2389 Py_INCREF(x);
2390 } else
2391 goto onError;
2394 /* Apply mapping */
2395 if (PyInt_Check(x)) {
2396 long value = PyInt_AS_LONG(x);
2397 if (value < 0 || value > 65535) {
2398 PyErr_SetString(PyExc_TypeError,
2399 "character mapping must be in range(65536)");
2400 Py_DECREF(x);
2401 goto onError;
2403 *p++ = (Py_UNICODE)value;
2405 else if (x == Py_None) {
2406 /* undefined mapping */
2407 if (charmap_decoding_error(&s, &p, errors,
2408 "character maps to <undefined>")) {
2409 Py_DECREF(x);
2410 goto onError;
2413 else if (PyUnicode_Check(x)) {
2414 int targetsize = PyUnicode_GET_SIZE(x);
2416 if (targetsize == 1)
2417 /* 1-1 mapping */
2418 *p++ = *PyUnicode_AS_UNICODE(x);
2420 else if (targetsize > 1) {
2421 /* 1-n mapping */
2422 if (targetsize > extrachars) {
2423 /* resize first */
2424 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2425 int needed = (targetsize - extrachars) + \
2426 (targetsize << 2);
2427 extrachars += needed;
2428 if (_PyUnicode_Resize(&v,
2429 PyUnicode_GET_SIZE(v) + needed)) {
2430 Py_DECREF(x);
2431 goto onError;
2433 p = PyUnicode_AS_UNICODE(v) + oldpos;
2435 Py_UNICODE_COPY(p,
2436 PyUnicode_AS_UNICODE(x),
2437 targetsize);
2438 p += targetsize;
2439 extrachars -= targetsize;
2441 /* 1-0 mapping: skip the character */
2443 else {
2444 /* wrong return value */
2445 PyErr_SetString(PyExc_TypeError,
2446 "character mapping must return integer, None or unicode");
2447 Py_DECREF(x);
2448 goto onError;
2450 Py_DECREF(x);
2452 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2453 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2454 goto onError;
2455 return (PyObject *)v;
2457 onError:
2458 Py_XDECREF(v);
2459 return NULL;
2462 static
2463 int charmap_encoding_error(const Py_UNICODE **source,
2464 char **dest,
2465 const char *errors,
2466 const char *details)
2468 if ((errors == NULL) ||
2469 (strcmp(errors,"strict") == 0)) {
2470 PyErr_Format(PyExc_UnicodeError,
2471 "charmap encoding error: %.400s",
2472 details);
2473 return -1;
2475 else if (strcmp(errors,"ignore") == 0) {
2476 return 0;
2478 else if (strcmp(errors,"replace") == 0) {
2479 **dest = '?';
2480 (*dest)++;
2481 return 0;
2483 else {
2484 PyErr_Format(PyExc_ValueError,
2485 "charmap encoding error; "
2486 "unknown error handling code: %.400s",
2487 errors);
2488 return -1;
2492 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2493 int size,
2494 PyObject *mapping,
2495 const char *errors)
2497 PyObject *v;
2498 char *s;
2499 int extrachars = 0;
2501 /* Default to Latin-1 */
2502 if (mapping == NULL)
2503 return PyUnicode_EncodeLatin1(p, size, errors);
2505 v = PyString_FromStringAndSize(NULL, size);
2506 if (v == NULL)
2507 return NULL;
2508 if (size == 0)
2509 return v;
2510 s = PyString_AS_STRING(v);
2511 while (size-- > 0) {
2512 Py_UNICODE ch = *p++;
2513 PyObject *w, *x;
2515 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2516 w = PyInt_FromLong((long)ch);
2517 if (w == NULL)
2518 goto onError;
2519 x = PyObject_GetItem(mapping, w);
2520 Py_DECREF(w);
2521 if (x == NULL) {
2522 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2523 /* No mapping found means: mapping is undefined. */
2524 PyErr_Clear();
2525 x = Py_None;
2526 Py_INCREF(x);
2527 } else
2528 goto onError;
2531 /* Apply mapping */
2532 if (PyInt_Check(x)) {
2533 long value = PyInt_AS_LONG(x);
2534 if (value < 0 || value > 255) {
2535 PyErr_SetString(PyExc_TypeError,
2536 "character mapping must be in range(256)");
2537 Py_DECREF(x);
2538 goto onError;
2540 *s++ = (char)value;
2542 else if (x == Py_None) {
2543 /* undefined mapping */
2544 if (charmap_encoding_error(&p, &s, errors,
2545 "character maps to <undefined>")) {
2546 Py_DECREF(x);
2547 goto onError;
2550 else if (PyString_Check(x)) {
2551 int targetsize = PyString_GET_SIZE(x);
2553 if (targetsize == 1)
2554 /* 1-1 mapping */
2555 *s++ = *PyString_AS_STRING(x);
2557 else if (targetsize > 1) {
2558 /* 1-n mapping */
2559 if (targetsize > extrachars) {
2560 /* resize first */
2561 int oldpos = (int)(s - PyString_AS_STRING(v));
2562 int needed = (targetsize - extrachars) + \
2563 (targetsize << 2);
2564 extrachars += needed;
2565 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2566 Py_DECREF(x);
2567 goto onError;
2569 s = PyString_AS_STRING(v) + oldpos;
2571 memcpy(s, PyString_AS_STRING(x), targetsize);
2572 s += targetsize;
2573 extrachars -= targetsize;
2575 /* 1-0 mapping: skip the character */
2577 else {
2578 /* wrong return value */
2579 PyErr_SetString(PyExc_TypeError,
2580 "character mapping must return integer, None or unicode");
2581 Py_DECREF(x);
2582 goto onError;
2584 Py_DECREF(x);
2586 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2587 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2588 goto onError;
2589 return v;
2591 onError:
2592 Py_DECREF(v);
2593 return NULL;
2596 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2597 PyObject *mapping)
2599 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2600 PyErr_BadArgument();
2601 return NULL;
2603 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2604 PyUnicode_GET_SIZE(unicode),
2605 mapping,
2606 NULL);
2609 static
2610 int translate_error(const Py_UNICODE **source,
2611 Py_UNICODE **dest,
2612 const char *errors,
2613 const char *details)
2615 if ((errors == NULL) ||
2616 (strcmp(errors,"strict") == 0)) {
2617 PyErr_Format(PyExc_UnicodeError,
2618 "translate error: %.400s",
2619 details);
2620 return -1;
2622 else if (strcmp(errors,"ignore") == 0) {
2623 return 0;
2625 else if (strcmp(errors,"replace") == 0) {
2626 **dest = '?';
2627 (*dest)++;
2628 return 0;
2630 else {
2631 PyErr_Format(PyExc_ValueError,
2632 "translate error; "
2633 "unknown error handling code: %.400s",
2634 errors);
2635 return -1;
2639 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2640 int size,
2641 PyObject *mapping,
2642 const char *errors)
2644 PyUnicodeObject *v;
2645 Py_UNICODE *p;
2647 if (mapping == NULL) {
2648 PyErr_BadArgument();
2649 return NULL;
2652 /* Output will never be longer than input */
2653 v = _PyUnicode_New(size);
2654 if (v == NULL)
2655 goto onError;
2656 if (size == 0)
2657 goto done;
2658 p = PyUnicode_AS_UNICODE(v);
2659 while (size-- > 0) {
2660 Py_UNICODE ch = *s++;
2661 PyObject *w, *x;
2663 /* Get mapping */
2664 w = PyInt_FromLong(ch);
2665 if (w == NULL)
2666 goto onError;
2667 x = PyObject_GetItem(mapping, w);
2668 Py_DECREF(w);
2669 if (x == NULL) {
2670 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2671 /* No mapping found: default to 1-1 mapping */
2672 PyErr_Clear();
2673 *p++ = ch;
2674 continue;
2676 goto onError;
2679 /* Apply mapping */
2680 if (PyInt_Check(x))
2681 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2682 else if (x == Py_None) {
2683 /* undefined mapping */
2684 if (translate_error(&s, &p, errors,
2685 "character maps to <undefined>")) {
2686 Py_DECREF(x);
2687 goto onError;
2690 else if (PyUnicode_Check(x)) {
2691 if (PyUnicode_GET_SIZE(x) != 1) {
2692 /* 1-n mapping */
2693 PyErr_SetString(PyExc_NotImplementedError,
2694 "1-n mappings are currently not implemented");
2695 Py_DECREF(x);
2696 goto onError;
2698 *p++ = *PyUnicode_AS_UNICODE(x);
2700 else {
2701 /* wrong return value */
2702 PyErr_SetString(PyExc_TypeError,
2703 "translate mapping must return integer, None or unicode");
2704 Py_DECREF(x);
2705 goto onError;
2707 Py_DECREF(x);
2709 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2710 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2711 goto onError;
2713 done:
2714 return (PyObject *)v;
2716 onError:
2717 Py_XDECREF(v);
2718 return NULL;
2721 PyObject *PyUnicode_Translate(PyObject *str,
2722 PyObject *mapping,
2723 const char *errors)
2725 PyObject *result;
2727 str = PyUnicode_FromObject(str);
2728 if (str == NULL)
2729 goto onError;
2730 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2731 PyUnicode_GET_SIZE(str),
2732 mapping,
2733 errors);
2734 Py_DECREF(str);
2735 return result;
2737 onError:
2738 Py_XDECREF(str);
2739 return NULL;
2742 /* --- Decimal Encoder ---------------------------------------------------- */
2744 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2745 int length,
2746 char *output,
2747 const char *errors)
2749 Py_UNICODE *p, *end;
2751 if (output == NULL) {
2752 PyErr_BadArgument();
2753 return -1;
2756 p = s;
2757 end = s + length;
2758 while (p < end) {
2759 register Py_UNICODE ch = *p++;
2760 int decimal;
2762 if (Py_UNICODE_ISSPACE(ch)) {
2763 *output++ = ' ';
2764 continue;
2766 decimal = Py_UNICODE_TODECIMAL(ch);
2767 if (decimal >= 0) {
2768 *output++ = '0' + decimal;
2769 continue;
2771 if (0 < ch && ch < 256) {
2772 *output++ = (char)ch;
2773 continue;
2775 /* All other characters are considered invalid */
2776 if (errors == NULL || strcmp(errors, "strict") == 0) {
2777 PyErr_SetString(PyExc_ValueError,
2778 "invalid decimal Unicode string");
2779 goto onError;
2781 else if (strcmp(errors, "ignore") == 0)
2782 continue;
2783 else if (strcmp(errors, "replace") == 0) {
2784 *output++ = '?';
2785 continue;
2788 /* 0-terminate the output string */
2789 *output++ = '\0';
2790 return 0;
2792 onError:
2793 return -1;
2796 /* --- Helpers ------------------------------------------------------------ */
2798 static
2799 int count(PyUnicodeObject *self,
2800 int start,
2801 int end,
2802 PyUnicodeObject *substring)
2804 int count = 0;
2806 if (start < 0)
2807 start += self->length;
2808 if (start < 0)
2809 start = 0;
2810 if (end > self->length)
2811 end = self->length;
2812 if (end < 0)
2813 end += self->length;
2814 if (end < 0)
2815 end = 0;
2817 if (substring->length == 0)
2818 return (end - start + 1);
2820 end -= substring->length;
2822 while (start <= end)
2823 if (Py_UNICODE_MATCH(self, start, substring)) {
2824 count++;
2825 start += substring->length;
2826 } else
2827 start++;
2829 return count;
2832 int PyUnicode_Count(PyObject *str,
2833 PyObject *substr,
2834 int start,
2835 int end)
2837 int result;
2839 str = PyUnicode_FromObject(str);
2840 if (str == NULL)
2841 return -1;
2842 substr = PyUnicode_FromObject(substr);
2843 if (substr == NULL) {
2844 Py_DECREF(str);
2845 return -1;
2848 result = count((PyUnicodeObject *)str,
2849 start, end,
2850 (PyUnicodeObject *)substr);
2852 Py_DECREF(str);
2853 Py_DECREF(substr);
2854 return result;
2857 static
2858 int findstring(PyUnicodeObject *self,
2859 PyUnicodeObject *substring,
2860 int start,
2861 int end,
2862 int direction)
2864 if (start < 0)
2865 start += self->length;
2866 if (start < 0)
2867 start = 0;
2869 if (substring->length == 0)
2870 return start;
2872 if (end > self->length)
2873 end = self->length;
2874 if (end < 0)
2875 end += self->length;
2876 if (end < 0)
2877 end = 0;
2879 end -= substring->length;
2881 if (direction < 0) {
2882 for (; end >= start; end--)
2883 if (Py_UNICODE_MATCH(self, end, substring))
2884 return end;
2885 } else {
2886 for (; start <= end; start++)
2887 if (Py_UNICODE_MATCH(self, start, substring))
2888 return start;
2891 return -1;
2894 int PyUnicode_Find(PyObject *str,
2895 PyObject *substr,
2896 int start,
2897 int end,
2898 int direction)
2900 int result;
2902 str = PyUnicode_FromObject(str);
2903 if (str == NULL)
2904 return -1;
2905 substr = PyUnicode_FromObject(substr);
2906 if (substr == NULL) {
2907 Py_DECREF(substr);
2908 return -1;
2911 result = findstring((PyUnicodeObject *)str,
2912 (PyUnicodeObject *)substr,
2913 start, end, direction);
2914 Py_DECREF(str);
2915 Py_DECREF(substr);
2916 return result;
2919 static
2920 int tailmatch(PyUnicodeObject *self,
2921 PyUnicodeObject *substring,
2922 int start,
2923 int end,
2924 int direction)
2926 if (start < 0)
2927 start += self->length;
2928 if (start < 0)
2929 start = 0;
2931 if (substring->length == 0)
2932 return 1;
2934 if (end > self->length)
2935 end = self->length;
2936 if (end < 0)
2937 end += self->length;
2938 if (end < 0)
2939 end = 0;
2941 end -= substring->length;
2942 if (end < start)
2943 return 0;
2945 if (direction > 0) {
2946 if (Py_UNICODE_MATCH(self, end, substring))
2947 return 1;
2948 } else {
2949 if (Py_UNICODE_MATCH(self, start, substring))
2950 return 1;
2953 return 0;
2956 int PyUnicode_Tailmatch(PyObject *str,
2957 PyObject *substr,
2958 int start,
2959 int end,
2960 int direction)
2962 int result;
2964 str = PyUnicode_FromObject(str);
2965 if (str == NULL)
2966 return -1;
2967 substr = PyUnicode_FromObject(substr);
2968 if (substr == NULL) {
2969 Py_DECREF(substr);
2970 return -1;
2973 result = tailmatch((PyUnicodeObject *)str,
2974 (PyUnicodeObject *)substr,
2975 start, end, direction);
2976 Py_DECREF(str);
2977 Py_DECREF(substr);
2978 return result;
2981 static
2982 const Py_UNICODE *findchar(const Py_UNICODE *s,
2983 int size,
2984 Py_UNICODE ch)
2986 /* like wcschr, but doesn't stop at NULL characters */
2988 while (size-- > 0) {
2989 if (*s == ch)
2990 return s;
2991 s++;
2994 return NULL;
2997 /* Apply fixfct filter to the Unicode object self and return a
2998 reference to the modified object */
3000 static
3001 PyObject *fixup(PyUnicodeObject *self,
3002 int (*fixfct)(PyUnicodeObject *s))
3005 PyUnicodeObject *u;
3007 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3008 if (u == NULL)
3009 return NULL;
3011 Py_UNICODE_COPY(u->str, self->str, self->length);
3013 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3014 /* fixfct should return TRUE if it modified the buffer. If
3015 FALSE, return a reference to the original buffer instead
3016 (to save space, not time) */
3017 Py_INCREF(self);
3018 Py_DECREF(u);
3019 return (PyObject*) self;
3021 return (PyObject*) u;
3024 static
3025 int fixupper(PyUnicodeObject *self)
3027 int len = self->length;
3028 Py_UNICODE *s = self->str;
3029 int status = 0;
3031 while (len-- > 0) {
3032 register Py_UNICODE ch;
3034 ch = Py_UNICODE_TOUPPER(*s);
3035 if (ch != *s) {
3036 status = 1;
3037 *s = ch;
3039 s++;
3042 return status;
3045 static
3046 int fixlower(PyUnicodeObject *self)
3048 int len = self->length;
3049 Py_UNICODE *s = self->str;
3050 int status = 0;
3052 while (len-- > 0) {
3053 register Py_UNICODE ch;
3055 ch = Py_UNICODE_TOLOWER(*s);
3056 if (ch != *s) {
3057 status = 1;
3058 *s = ch;
3060 s++;
3063 return status;
3066 static
3067 int fixswapcase(PyUnicodeObject *self)
3069 int len = self->length;
3070 Py_UNICODE *s = self->str;
3071 int status = 0;
3073 while (len-- > 0) {
3074 if (Py_UNICODE_ISUPPER(*s)) {
3075 *s = Py_UNICODE_TOLOWER(*s);
3076 status = 1;
3077 } else if (Py_UNICODE_ISLOWER(*s)) {
3078 *s = Py_UNICODE_TOUPPER(*s);
3079 status = 1;
3081 s++;
3084 return status;
3087 static
3088 int fixcapitalize(PyUnicodeObject *self)
3090 int len = self->length;
3091 Py_UNICODE *s = self->str;
3092 int status = 0;
3094 if (len == 0)
3095 return 0;
3096 if (Py_UNICODE_ISLOWER(*s)) {
3097 *s = Py_UNICODE_TOUPPER(*s);
3098 status = 1;
3100 s++;
3101 while (--len > 0) {
3102 if (Py_UNICODE_ISUPPER(*s)) {
3103 *s = Py_UNICODE_TOLOWER(*s);
3104 status = 1;
3106 s++;
3108 return status;
3111 static
3112 int fixtitle(PyUnicodeObject *self)
3114 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3115 register Py_UNICODE *e;
3116 int previous_is_cased;
3118 /* Shortcut for single character strings */
3119 if (PyUnicode_GET_SIZE(self) == 1) {
3120 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3121 if (*p != ch) {
3122 *p = ch;
3123 return 1;
3125 else
3126 return 0;
3129 e = p + PyUnicode_GET_SIZE(self);
3130 previous_is_cased = 0;
3131 for (; p < e; p++) {
3132 register const Py_UNICODE ch = *p;
3134 if (previous_is_cased)
3135 *p = Py_UNICODE_TOLOWER(ch);
3136 else
3137 *p = Py_UNICODE_TOTITLE(ch);
3139 if (Py_UNICODE_ISLOWER(ch) ||
3140 Py_UNICODE_ISUPPER(ch) ||
3141 Py_UNICODE_ISTITLE(ch))
3142 previous_is_cased = 1;
3143 else
3144 previous_is_cased = 0;
3146 return 1;
3149 PyObject *PyUnicode_Join(PyObject *separator,
3150 PyObject *seq)
3152 Py_UNICODE *sep;
3153 int seplen;
3154 PyUnicodeObject *res = NULL;
3155 int reslen = 0;
3156 Py_UNICODE *p;
3157 int sz = 100;
3158 int i;
3159 PyObject *it;
3161 it = PyObject_GetIter(seq);
3162 if (it == NULL)
3163 return NULL;
3165 if (separator == NULL) {
3166 Py_UNICODE blank = ' ';
3167 sep = &blank;
3168 seplen = 1;
3170 else {
3171 separator = PyUnicode_FromObject(separator);
3172 if (separator == NULL)
3173 goto onError;
3174 sep = PyUnicode_AS_UNICODE(separator);
3175 seplen = PyUnicode_GET_SIZE(separator);
3178 res = _PyUnicode_New(sz);
3179 if (res == NULL)
3180 goto onError;
3181 p = PyUnicode_AS_UNICODE(res);
3182 reslen = 0;
3184 for (i = 0; ; ++i) {
3185 int itemlen;
3186 PyObject *item = PyIter_Next(it);
3187 if (item == NULL) {
3188 if (PyErr_Occurred())
3189 goto onError;
3190 break;
3192 if (!PyUnicode_Check(item)) {
3193 PyObject *v;
3194 if (!PyString_Check(item)) {
3195 PyErr_Format(PyExc_TypeError,
3196 "sequence item %i: expected string or Unicode,"
3197 " %.80s found",
3198 i, item->ob_type->tp_name);
3199 Py_DECREF(item);
3200 goto onError;
3202 v = PyUnicode_FromObject(item);
3203 Py_DECREF(item);
3204 item = v;
3205 if (item == NULL)
3206 goto onError;
3208 itemlen = PyUnicode_GET_SIZE(item);
3209 while (reslen + itemlen + seplen >= sz) {
3210 if (_PyUnicode_Resize(&res, sz*2)) {
3211 Py_DECREF(item);
3212 goto onError;
3214 sz *= 2;
3215 p = PyUnicode_AS_UNICODE(res) + reslen;
3217 if (i > 0) {
3218 Py_UNICODE_COPY(p, sep, seplen);
3219 p += seplen;
3220 reslen += seplen;
3222 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3223 p += itemlen;
3224 reslen += itemlen;
3225 Py_DECREF(item);
3227 if (_PyUnicode_Resize(&res, reslen))
3228 goto onError;
3230 Py_XDECREF(separator);
3231 Py_DECREF(it);
3232 return (PyObject *)res;
3234 onError:
3235 Py_XDECREF(separator);
3236 Py_XDECREF(res);
3237 Py_DECREF(it);
3238 return NULL;
3241 static
3242 PyUnicodeObject *pad(PyUnicodeObject *self,
3243 int left,
3244 int right,
3245 Py_UNICODE fill)
3247 PyUnicodeObject *u;
3249 if (left < 0)
3250 left = 0;
3251 if (right < 0)
3252 right = 0;
3254 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3255 Py_INCREF(self);
3256 return self;
3259 u = _PyUnicode_New(left + self->length + right);
3260 if (u) {
3261 if (left)
3262 Py_UNICODE_FILL(u->str, fill, left);
3263 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3264 if (right)
3265 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3268 return u;
/* Append the slice data[left:right] as a new Unicode object to `list`.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, the target `list` and an `onError` label, which is jumped
   to when either the slice cannot be created or the append fails.

   The body is wrapped in do { } while (0) so that the macro behaves
   like a single statement, and the arguments are parenthesised so that
   expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3282 static
3283 PyObject *split_whitespace(PyUnicodeObject *self,
3284 PyObject *list,
3285 int maxcount)
3287 register int i;
3288 register int j;
3289 int len = self->length;
3290 PyObject *str;
3292 for (i = j = 0; i < len; ) {
3293 /* find a token */
3294 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3295 i++;
3296 j = i;
3297 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3298 i++;
3299 if (j < i) {
3300 if (maxcount-- <= 0)
3301 break;
3302 SPLIT_APPEND(self->str, j, i);
3303 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3304 i++;
3305 j = i;
3308 if (j < len) {
3309 SPLIT_APPEND(self->str, j, len);
3311 return list;
3313 onError:
3314 Py_DECREF(list);
3315 return NULL;
3318 PyObject *PyUnicode_Splitlines(PyObject *string,
3319 int keepends)
3321 register int i;
3322 register int j;
3323 int len;
3324 PyObject *list;
3325 PyObject *str;
3326 Py_UNICODE *data;
3328 string = PyUnicode_FromObject(string);
3329 if (string == NULL)
3330 return NULL;
3331 data = PyUnicode_AS_UNICODE(string);
3332 len = PyUnicode_GET_SIZE(string);
3334 list = PyList_New(0);
3335 if (!list)
3336 goto onError;
3338 for (i = j = 0; i < len; ) {
3339 int eol;
3341 /* Find a line and append it */
3342 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3343 i++;
3345 /* Skip the line break reading CRLF as one line break */
3346 eol = i;
3347 if (i < len) {
3348 if (data[i] == '\r' && i + 1 < len &&
3349 data[i+1] == '\n')
3350 i += 2;
3351 else
3352 i++;
3353 if (keepends)
3354 eol = i;
3356 SPLIT_APPEND(data, j, eol);
3357 j = i;
3359 if (j < len) {
3360 SPLIT_APPEND(data, j, len);
3363 Py_DECREF(string);
3364 return list;
3366 onError:
3367 Py_DECREF(list);
3368 Py_DECREF(string);
3369 return NULL;
3372 static
3373 PyObject *split_char(PyUnicodeObject *self,
3374 PyObject *list,
3375 Py_UNICODE ch,
3376 int maxcount)
3378 register int i;
3379 register int j;
3380 int len = self->length;
3381 PyObject *str;
3383 for (i = j = 0; i < len; ) {
3384 if (self->str[i] == ch) {
3385 if (maxcount-- <= 0)
3386 break;
3387 SPLIT_APPEND(self->str, j, i);
3388 i = j = i + 1;
3389 } else
3390 i++;
3392 if (j <= len) {
3393 SPLIT_APPEND(self->str, j, len);
3395 return list;
3397 onError:
3398 Py_DECREF(list);
3399 return NULL;
3402 static
3403 PyObject *split_substring(PyUnicodeObject *self,
3404 PyObject *list,
3405 PyUnicodeObject *substring,
3406 int maxcount)
3408 register int i;
3409 register int j;
3410 int len = self->length;
3411 int sublen = substring->length;
3412 PyObject *str;
3414 for (i = j = 0; i <= len - sublen; ) {
3415 if (Py_UNICODE_MATCH(self, i, substring)) {
3416 if (maxcount-- <= 0)
3417 break;
3418 SPLIT_APPEND(self->str, j, i);
3419 i = j = i + sublen;
3420 } else
3421 i++;
3423 if (j <= len) {
3424 SPLIT_APPEND(self->str, j, len);
3426 return list;
3428 onError:
3429 Py_DECREF(list);
3430 return NULL;
3433 #undef SPLIT_APPEND
3435 static
3436 PyObject *split(PyUnicodeObject *self,
3437 PyUnicodeObject *substring,
3438 int maxcount)
3440 PyObject *list;
3442 if (maxcount < 0)
3443 maxcount = INT_MAX;
3445 list = PyList_New(0);
3446 if (!list)
3447 return NULL;
3449 if (substring == NULL)
3450 return split_whitespace(self,list,maxcount);
3452 else if (substring->length == 1)
3453 return split_char(self,list,substring->str[0],maxcount);
3455 else if (substring->length == 0) {
3456 Py_DECREF(list);
3457 PyErr_SetString(PyExc_ValueError, "empty separator");
3458 return NULL;
3460 else
3461 return split_substring(self,list,substring,maxcount);
3464 static
3465 PyObject *strip(PyUnicodeObject *self,
3466 int left,
3467 int right)
3469 Py_UNICODE *p = self->str;
3470 int start = 0;
3471 int end = self->length;
3473 if (left)
3474 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3475 start++;
3477 if (right)
3478 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3479 end--;
3481 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3482 /* couldn't strip anything off, return original string */
3483 Py_INCREF(self);
3484 return (PyObject*) self;
3487 return (PyObject*) PyUnicode_FromUnicode(
3488 self->str + start,
3489 end - start
3493 static
3494 PyObject *replace(PyUnicodeObject *self,
3495 PyUnicodeObject *str1,
3496 PyUnicodeObject *str2,
3497 int maxcount)
3499 PyUnicodeObject *u;
3501 if (maxcount < 0)
3502 maxcount = INT_MAX;
3504 if (str1->length == 1 && str2->length == 1) {
3505 int i;
3507 /* replace characters */
3508 if (!findchar(self->str, self->length, str1->str[0]) &&
3509 PyUnicode_CheckExact(self)) {
3510 /* nothing to replace, return original string */
3511 Py_INCREF(self);
3512 u = self;
3513 } else {
3514 Py_UNICODE u1 = str1->str[0];
3515 Py_UNICODE u2 = str2->str[0];
3517 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3518 NULL,
3519 self->length
3521 if (u != NULL) {
3522 Py_UNICODE_COPY(u->str, self->str,
3523 self->length);
3524 for (i = 0; i < u->length; i++)
3525 if (u->str[i] == u1) {
3526 if (--maxcount < 0)
3527 break;
3528 u->str[i] = u2;
3533 } else {
3534 int n, i;
3535 Py_UNICODE *p;
3537 /* replace strings */
3538 n = count(self, 0, self->length, str1);
3539 if (n > maxcount)
3540 n = maxcount;
3541 if (n == 0 && PyUnicode_CheckExact(self)) {
3542 /* nothing to replace, return original string */
3543 Py_INCREF(self);
3544 u = self;
3545 } else {
3546 u = _PyUnicode_New(
3547 self->length + n * (str2->length - str1->length));
3548 if (u) {
3549 i = 0;
3550 p = u->str;
3551 while (i <= self->length - str1->length)
3552 if (Py_UNICODE_MATCH(self, i, str1)) {
3553 /* replace string segment */
3554 Py_UNICODE_COPY(p, str2->str, str2->length);
3555 p += str2->length;
3556 i += str1->length;
3557 if (--n <= 0) {
3558 /* copy remaining part */
3559 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3560 break;
3562 } else
3563 *p++ = self->str[i++];
3568 return (PyObject *) u;
3571 /* --- Unicode Object Methods --------------------------------------------- */
3573 static char title__doc__[] =
3574 "S.title() -> unicode\n\
3576 Return a titlecased version of S, i.e. words start with title case\n\
3577 characters, all remaining cased characters have lower case.";
3579 static PyObject*
3580 unicode_title(PyUnicodeObject *self)
3582 return fixup(self, fixtitle);
3585 static char capitalize__doc__[] =
3586 "S.capitalize() -> unicode\n\
3588 Return a capitalized version of S, i.e. make the first character\n\
3589 have upper case.";
3591 static PyObject*
3592 unicode_capitalize(PyUnicodeObject *self)
3594 return fixup(self, fixcapitalize);
3597 #if 0
3598 static char capwords__doc__[] =
3599 "S.capwords() -> unicode\n\
3601 Apply .capitalize() to all words in S and return the result with\n\
3602 normalized whitespace (all whitespace strings are replaced by ' ').";
3604 static PyObject*
3605 unicode_capwords(PyUnicodeObject *self)
3607 PyObject *list;
3608 PyObject *item;
3609 int i;
3611 /* Split into words */
3612 list = split(self, NULL, -1);
3613 if (!list)
3614 return NULL;
3616 /* Capitalize each word */
3617 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3618 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3619 fixcapitalize);
3620 if (item == NULL)
3621 goto onError;
3622 Py_DECREF(PyList_GET_ITEM(list, i));
3623 PyList_SET_ITEM(list, i, item);
3626 /* Join the words to form a new string */
3627 item = PyUnicode_Join(NULL, list);
3629 onError:
3630 Py_DECREF(list);
3631 return (PyObject *)item;
3633 #endif
3635 static char center__doc__[] =
3636 "S.center(width) -> unicode\n\
3638 Return S centered in a Unicode string of length width. Padding is done\n\
3639 using spaces.";
3641 static PyObject *
3642 unicode_center(PyUnicodeObject *self, PyObject *args)
3644 int marg, left;
3645 int width;
3647 if (!PyArg_ParseTuple(args, "i:center", &width))
3648 return NULL;
3650 if (self->length >= width && PyUnicode_CheckExact(self)) {
3651 Py_INCREF(self);
3652 return (PyObject*) self;
3655 marg = width - self->length;
3656 left = marg / 2 + (marg & width & 1);
3658 return (PyObject*) pad(self, left, marg - left, ' ');
3661 #if 0
3663 /* This code should go into some future Unicode collation support
3664 module. The basic comparison should compare ordinals on a naive
3665 basis (this is what Java does and thus JPython too). */
3667 /* speedy UTF-16 code point order comparison */
3668 /* gleaned from: */
3669 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3671 static short utf16Fixup[32] =
3673 0, 0, 0, 0, 0, 0, 0, 0,
3674 0, 0, 0, 0, 0, 0, 0, 0,
3675 0, 0, 0, 0, 0, 0, 0, 0,
3676 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3679 static int
3680 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3682 int len1, len2;
3684 Py_UNICODE *s1 = str1->str;
3685 Py_UNICODE *s2 = str2->str;
3687 len1 = str1->length;
3688 len2 = str2->length;
3690 while (len1 > 0 && len2 > 0) {
3691 Py_UNICODE c1, c2;
3693 c1 = *s1++;
3694 c2 = *s2++;
3696 if (c1 > (1<<11) * 26)
3697 c1 += utf16Fixup[c1>>11];
3698 if (c2 > (1<<11) * 26)
3699 c2 += utf16Fixup[c2>>11];
3700 /* now c1 and c2 are in UTF-32-compatible order */
3702 if (c1 != c2)
3703 return (c1 < c2) ? -1 : 1;
3705 len1--; len2--;
3708 return (len1 < len2) ? -1 : (len1 != len2);
3711 #else
3713 static int
3714 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3716 register int len1, len2;
3718 Py_UNICODE *s1 = str1->str;
3719 Py_UNICODE *s2 = str2->str;
3721 len1 = str1->length;
3722 len2 = str2->length;
3724 while (len1 > 0 && len2 > 0) {
3725 Py_UNICODE c1, c2;
3727 c1 = *s1++;
3728 c2 = *s2++;
3730 if (c1 != c2)
3731 return (c1 < c2) ? -1 : 1;
3733 len1--; len2--;
3736 return (len1 < len2) ? -1 : (len1 != len2);
3739 #endif
3741 int PyUnicode_Compare(PyObject *left,
3742 PyObject *right)
3744 PyUnicodeObject *u = NULL, *v = NULL;
3745 int result;
3747 /* Coerce the two arguments */
3748 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3749 if (u == NULL)
3750 goto onError;
3751 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3752 if (v == NULL)
3753 goto onError;
3755 /* Shortcut for empty or interned objects */
3756 if (v == u) {
3757 Py_DECREF(u);
3758 Py_DECREF(v);
3759 return 0;
3762 result = unicode_compare(u, v);
3764 Py_DECREF(u);
3765 Py_DECREF(v);
3766 return result;
3768 onError:
3769 Py_XDECREF(u);
3770 Py_XDECREF(v);
3771 return -1;
3774 int PyUnicode_Contains(PyObject *container,
3775 PyObject *element)
3777 PyUnicodeObject *u = NULL, *v = NULL;
3778 int result;
3779 register const Py_UNICODE *p, *e;
3780 register Py_UNICODE ch;
3782 /* Coerce the two arguments */
3783 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3784 if (v == NULL) {
3785 PyErr_SetString(PyExc_TypeError,
3786 "'in <string>' requires character as left operand");
3787 goto onError;
3789 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3790 if (u == NULL) {
3791 Py_DECREF(v);
3792 goto onError;
3795 /* Check v in u */
3796 if (PyUnicode_GET_SIZE(v) != 1) {
3797 PyErr_SetString(PyExc_TypeError,
3798 "'in <string>' requires character as left operand");
3799 goto onError;
3801 ch = *PyUnicode_AS_UNICODE(v);
3802 p = PyUnicode_AS_UNICODE(u);
3803 e = p + PyUnicode_GET_SIZE(u);
3804 result = 0;
3805 while (p < e) {
3806 if (*p++ == ch) {
3807 result = 1;
3808 break;
3812 Py_DECREF(u);
3813 Py_DECREF(v);
3814 return result;
3816 onError:
3817 Py_XDECREF(u);
3818 Py_XDECREF(v);
3819 return -1;
3822 /* Concat to string or Unicode object giving a new Unicode object. */
3824 PyObject *PyUnicode_Concat(PyObject *left,
3825 PyObject *right)
3827 PyUnicodeObject *u = NULL, *v = NULL, *w;
3829 /* Coerce the two arguments */
3830 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3831 if (u == NULL)
3832 goto onError;
3833 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3834 if (v == NULL)
3835 goto onError;
3837 /* Shortcuts */
3838 if (v == unicode_empty) {
3839 Py_DECREF(v);
3840 return (PyObject *)u;
3842 if (u == unicode_empty) {
3843 Py_DECREF(u);
3844 return (PyObject *)v;
3847 /* Concat the two Unicode strings */
3848 w = _PyUnicode_New(u->length + v->length);
3849 if (w == NULL)
3850 goto onError;
3851 Py_UNICODE_COPY(w->str, u->str, u->length);
3852 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3854 Py_DECREF(u);
3855 Py_DECREF(v);
3856 return (PyObject *)w;
3858 onError:
3859 Py_XDECREF(u);
3860 Py_XDECREF(v);
3861 return NULL;
3864 static char count__doc__[] =
3865 "S.count(sub[, start[, end]]) -> int\n\
3867 Return the number of occurrences of substring sub in Unicode string\n\
3868 S[start:end]. Optional arguments start and end are\n\
3869 interpreted as in slice notation.";
3871 static PyObject *
3872 unicode_count(PyUnicodeObject *self, PyObject *args)
3874 PyUnicodeObject *substring;
3875 int start = 0;
3876 int end = INT_MAX;
3877 PyObject *result;
3879 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3880 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3881 return NULL;
3883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3884 (PyObject *)substring);
3885 if (substring == NULL)
3886 return NULL;
3888 if (start < 0)
3889 start += self->length;
3890 if (start < 0)
3891 start = 0;
3892 if (end > self->length)
3893 end = self->length;
3894 if (end < 0)
3895 end += self->length;
3896 if (end < 0)
3897 end = 0;
3899 result = PyInt_FromLong((long) count(self, start, end, substring));
3901 Py_DECREF(substring);
3902 return result;
3905 static char encode__doc__[] =
3906 "S.encode([encoding[,errors]]) -> string\n\
3908 Return an encoded string version of S. Default encoding is the current\n\
3909 default string encoding. errors may be given to set a different error\n\
3910 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3911 a ValueError. Other possible values are 'ignore' and 'replace'.";
3913 static PyObject *
3914 unicode_encode(PyUnicodeObject *self, PyObject *args)
3916 char *encoding = NULL;
3917 char *errors = NULL;
3918 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3919 return NULL;
3920 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3923 static char expandtabs__doc__[] =
3924 "S.expandtabs([tabsize]) -> unicode\n\
3926 Return a copy of S where all tab characters are expanded using spaces.\n\
3927 If tabsize is not given, a tab size of 8 characters is assumed.";
3929 static PyObject*
3930 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3932 Py_UNICODE *e;
3933 Py_UNICODE *p;
3934 Py_UNICODE *q;
3935 int i, j;
3936 PyUnicodeObject *u;
3937 int tabsize = 8;
3939 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3940 return NULL;
3942 /* First pass: determine size of output string */
3943 i = j = 0;
3944 e = self->str + self->length;
3945 for (p = self->str; p < e; p++)
3946 if (*p == '\t') {
3947 if (tabsize > 0)
3948 j += tabsize - (j % tabsize);
3950 else {
3951 j++;
3952 if (*p == '\n' || *p == '\r') {
3953 i += j;
3954 j = 0;
3958 /* Second pass: create output string and fill it */
3959 u = _PyUnicode_New(i + j);
3960 if (!u)
3961 return NULL;
3963 j = 0;
3964 q = u->str;
3966 for (p = self->str; p < e; p++)
3967 if (*p == '\t') {
3968 if (tabsize > 0) {
3969 i = tabsize - (j % tabsize);
3970 j += i;
3971 while (i--)
3972 *q++ = ' ';
3975 else {
3976 j++;
3977 *q++ = *p;
3978 if (*p == '\n' || *p == '\r')
3979 j = 0;
3982 return (PyObject*) u;
3985 static char find__doc__[] =
3986 "S.find(sub [,start [,end]]) -> int\n\
3988 Return the lowest index in S where substring sub is found,\n\
3989 such that sub is contained within s[start,end]. Optional\n\
3990 arguments start and end are interpreted as in slice notation.\n\
3992 Return -1 on failure.";
3994 static PyObject *
3995 unicode_find(PyUnicodeObject *self, PyObject *args)
3997 PyUnicodeObject *substring;
3998 int start = 0;
3999 int end = INT_MAX;
4000 PyObject *result;
4002 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4003 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4004 return NULL;
4005 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4006 (PyObject *)substring);
4007 if (substring == NULL)
4008 return NULL;
4010 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4012 Py_DECREF(substring);
4013 return result;
4016 static PyObject *
4017 unicode_getitem(PyUnicodeObject *self, int index)
4019 if (index < 0 || index >= self->length) {
4020 PyErr_SetString(PyExc_IndexError, "string index out of range");
4021 return NULL;
4024 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4027 static long
4028 unicode_hash(PyUnicodeObject *self)
4030 /* Since Unicode objects compare equal to their ASCII string
4031 counterparts, they should use the individual character values
4032 as basis for their hash value. This is needed to assure that
4033 strings and Unicode objects behave in the same way as
4034 dictionary keys. */
4036 register int len;
4037 register Py_UNICODE *p;
4038 register long x;
4040 if (self->hash != -1)
4041 return self->hash;
4042 len = PyUnicode_GET_SIZE(self);
4043 p = PyUnicode_AS_UNICODE(self);
4044 x = *p << 7;
4045 while (--len >= 0)
4046 x = (1000003*x) ^ *p++;
4047 x ^= PyUnicode_GET_SIZE(self);
4048 if (x == -1)
4049 x = -2;
4050 self->hash = x;
4051 return x;
4054 static char index__doc__[] =
4055 "S.index(sub [,start [,end]]) -> int\n\
4057 Like S.find() but raise ValueError when the substring is not found.";
4059 static PyObject *
4060 unicode_index(PyUnicodeObject *self, PyObject *args)
4062 int result;
4063 PyUnicodeObject *substring;
4064 int start = 0;
4065 int end = INT_MAX;
4067 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4068 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4069 return NULL;
4071 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4072 (PyObject *)substring);
4073 if (substring == NULL)
4074 return NULL;
4076 result = findstring(self, substring, start, end, 1);
4078 Py_DECREF(substring);
4079 if (result < 0) {
4080 PyErr_SetString(PyExc_ValueError, "substring not found");
4081 return NULL;
4083 return PyInt_FromLong(result);
4086 static char islower__doc__[] =
4087 "S.islower() -> int\n\
4089 Return 1 if all cased characters in S are lowercase and there is\n\
4090 at least one cased character in S, 0 otherwise.";
4092 static PyObject*
4093 unicode_islower(PyUnicodeObject *self)
4095 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4096 register const Py_UNICODE *e;
4097 int cased;
4099 /* Shortcut for single character strings */
4100 if (PyUnicode_GET_SIZE(self) == 1)
4101 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4103 /* Special case for empty strings */
4104 if (PyString_GET_SIZE(self) == 0)
4105 return PyInt_FromLong(0);
4107 e = p + PyUnicode_GET_SIZE(self);
4108 cased = 0;
4109 for (; p < e; p++) {
4110 register const Py_UNICODE ch = *p;
4112 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4113 return PyInt_FromLong(0);
4114 else if (!cased && Py_UNICODE_ISLOWER(ch))
4115 cased = 1;
4117 return PyInt_FromLong(cased);
4120 static char isupper__doc__[] =
4121 "S.isupper() -> int\n\
4123 Return 1 if all cased characters in S are uppercase and there is\n\
4124 at least one cased character in S, 0 otherwise.";
4126 static PyObject*
4127 unicode_isupper(PyUnicodeObject *self)
4129 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4130 register const Py_UNICODE *e;
4131 int cased;
4133 /* Shortcut for single character strings */
4134 if (PyUnicode_GET_SIZE(self) == 1)
4135 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4137 /* Special case for empty strings */
4138 if (PyString_GET_SIZE(self) == 0)
4139 return PyInt_FromLong(0);
4141 e = p + PyUnicode_GET_SIZE(self);
4142 cased = 0;
4143 for (; p < e; p++) {
4144 register const Py_UNICODE ch = *p;
4146 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4147 return PyInt_FromLong(0);
4148 else if (!cased && Py_UNICODE_ISUPPER(ch))
4149 cased = 1;
4151 return PyInt_FromLong(cased);
4154 static char istitle__doc__[] =
4155 "S.istitle() -> int\n\
4157 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4158 may only follow uncased characters and lowercase characters only cased\n\
4159 ones. Return 0 otherwise.";
4161 static PyObject*
4162 unicode_istitle(PyUnicodeObject *self)
4164 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4165 register const Py_UNICODE *e;
4166 int cased, previous_is_cased;
4168 /* Shortcut for single character strings */
4169 if (PyUnicode_GET_SIZE(self) == 1)
4170 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4171 (Py_UNICODE_ISUPPER(*p) != 0));
4173 /* Special case for empty strings */
4174 if (PyString_GET_SIZE(self) == 0)
4175 return PyInt_FromLong(0);
4177 e = p + PyUnicode_GET_SIZE(self);
4178 cased = 0;
4179 previous_is_cased = 0;
4180 for (; p < e; p++) {
4181 register const Py_UNICODE ch = *p;
4183 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4184 if (previous_is_cased)
4185 return PyInt_FromLong(0);
4186 previous_is_cased = 1;
4187 cased = 1;
4189 else if (Py_UNICODE_ISLOWER(ch)) {
4190 if (!previous_is_cased)
4191 return PyInt_FromLong(0);
4192 previous_is_cased = 1;
4193 cased = 1;
4195 else
4196 previous_is_cased = 0;
4198 return PyInt_FromLong(cased);
4201 static char isspace__doc__[] =
4202 "S.isspace() -> int\n\
4204 Return 1 if there are only whitespace characters in S,\n\
4205 0 otherwise.";
4207 static PyObject*
4208 unicode_isspace(PyUnicodeObject *self)
4210 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4211 register const Py_UNICODE *e;
4213 /* Shortcut for single character strings */
4214 if (PyUnicode_GET_SIZE(self) == 1 &&
4215 Py_UNICODE_ISSPACE(*p))
4216 return PyInt_FromLong(1);
4218 /* Special case for empty strings */
4219 if (PyString_GET_SIZE(self) == 0)
4220 return PyInt_FromLong(0);
4222 e = p + PyUnicode_GET_SIZE(self);
4223 for (; p < e; p++) {
4224 if (!Py_UNICODE_ISSPACE(*p))
4225 return PyInt_FromLong(0);
4227 return PyInt_FromLong(1);
4230 static char isalpha__doc__[] =
4231 "S.isalpha() -> int\n\
4233 Return 1 if all characters in S are alphabetic\n\
4234 and there is at least one character in S, 0 otherwise.";
4236 static PyObject*
4237 unicode_isalpha(PyUnicodeObject *self)
4239 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4240 register const Py_UNICODE *e;
4242 /* Shortcut for single character strings */
4243 if (PyUnicode_GET_SIZE(self) == 1 &&
4244 Py_UNICODE_ISALPHA(*p))
4245 return PyInt_FromLong(1);
4247 /* Special case for empty strings */
4248 if (PyString_GET_SIZE(self) == 0)
4249 return PyInt_FromLong(0);
4251 e = p + PyUnicode_GET_SIZE(self);
4252 for (; p < e; p++) {
4253 if (!Py_UNICODE_ISALPHA(*p))
4254 return PyInt_FromLong(0);
4256 return PyInt_FromLong(1);
4259 static char isalnum__doc__[] =
4260 "S.isalnum() -> int\n\
4262 Return 1 if all characters in S are alphanumeric\n\
4263 and there is at least one character in S, 0 otherwise.";
4265 static PyObject*
4266 unicode_isalnum(PyUnicodeObject *self)
4268 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4269 register const Py_UNICODE *e;
4271 /* Shortcut for single character strings */
4272 if (PyUnicode_GET_SIZE(self) == 1 &&
4273 Py_UNICODE_ISALNUM(*p))
4274 return PyInt_FromLong(1);
4276 /* Special case for empty strings */
4277 if (PyString_GET_SIZE(self) == 0)
4278 return PyInt_FromLong(0);
4280 e = p + PyUnicode_GET_SIZE(self);
4281 for (; p < e; p++) {
4282 if (!Py_UNICODE_ISALNUM(*p))
4283 return PyInt_FromLong(0);
4285 return PyInt_FromLong(1);
4288 static char isdecimal__doc__[] =
4289 "S.isdecimal() -> int\n\
4291 Return 1 if there are only decimal characters in S,\n\
4292 0 otherwise.";
4294 static PyObject*
4295 unicode_isdecimal(PyUnicodeObject *self)
4297 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4298 register const Py_UNICODE *e;
4300 /* Shortcut for single character strings */
4301 if (PyUnicode_GET_SIZE(self) == 1 &&
4302 Py_UNICODE_ISDECIMAL(*p))
4303 return PyInt_FromLong(1);
4305 /* Special case for empty strings */
4306 if (PyString_GET_SIZE(self) == 0)
4307 return PyInt_FromLong(0);
4309 e = p + PyUnicode_GET_SIZE(self);
4310 for (; p < e; p++) {
4311 if (!Py_UNICODE_ISDECIMAL(*p))
4312 return PyInt_FromLong(0);
4314 return PyInt_FromLong(1);
4317 static char isdigit__doc__[] =
4318 "S.isdigit() -> int\n\
4320 Return 1 if there are only digit characters in S,\n\
4321 0 otherwise.";
4323 static PyObject*
4324 unicode_isdigit(PyUnicodeObject *self)
4326 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4327 register const Py_UNICODE *e;
4329 /* Shortcut for single character strings */
4330 if (PyUnicode_GET_SIZE(self) == 1 &&
4331 Py_UNICODE_ISDIGIT(*p))
4332 return PyInt_FromLong(1);
4334 /* Special case for empty strings */
4335 if (PyString_GET_SIZE(self) == 0)
4336 return PyInt_FromLong(0);
4338 e = p + PyUnicode_GET_SIZE(self);
4339 for (; p < e; p++) {
4340 if (!Py_UNICODE_ISDIGIT(*p))
4341 return PyInt_FromLong(0);
4343 return PyInt_FromLong(1);
4346 static char isnumeric__doc__[] =
4347 "S.isnumeric() -> int\n\
4349 Return 1 if there are only numeric characters in S,\n\
4350 0 otherwise.";
4352 static PyObject*
4353 unicode_isnumeric(PyUnicodeObject *self)
4355 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4356 register const Py_UNICODE *e;
4358 /* Shortcut for single character strings */
4359 if (PyUnicode_GET_SIZE(self) == 1 &&
4360 Py_UNICODE_ISNUMERIC(*p))
4361 return PyInt_FromLong(1);
4363 /* Special case for empty strings */
4364 if (PyString_GET_SIZE(self) == 0)
4365 return PyInt_FromLong(0);
4367 e = p + PyUnicode_GET_SIZE(self);
4368 for (; p < e; p++) {
4369 if (!Py_UNICODE_ISNUMERIC(*p))
4370 return PyInt_FromLong(0);
4372 return PyInt_FromLong(1);
4375 static char join__doc__[] =
4376 "S.join(sequence) -> unicode\n\
4378 Return a string which is the concatenation of the strings in the\n\
4379 sequence. The separator between elements is S.";
4381 static PyObject*
4382 unicode_join(PyObject *self, PyObject *data)
4384 return PyUnicode_Join(self, data);
4387 static int
4388 unicode_length(PyUnicodeObject *self)
4390 return self->length;
4393 static char ljust__doc__[] =
4394 "S.ljust(width) -> unicode\n\
4396 Return S left justified in a Unicode string of length width. Padding is\n\
4397 done using spaces.";
4399 static PyObject *
4400 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4402 int width;
4403 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4404 return NULL;
4406 if (self->length >= width && PyUnicode_CheckExact(self)) {
4407 Py_INCREF(self);
4408 return (PyObject*) self;
4411 return (PyObject*) pad(self, 0, width - self->length, ' ');
4414 static char lower__doc__[] =
4415 "S.lower() -> unicode\n\
4417 Return a copy of the string S converted to lowercase.";
4419 static PyObject*
4420 unicode_lower(PyUnicodeObject *self)
4422 return fixup(self, fixlower);
4425 static char lstrip__doc__[] =
4426 "S.lstrip() -> unicode\n\
4428 Return a copy of the string S with leading whitespace removed.";
4430 static PyObject *
4431 unicode_lstrip(PyUnicodeObject *self)
4433 return strip(self, 1, 0);
4436 static PyObject*
4437 unicode_repeat(PyUnicodeObject *str, int len)
4439 PyUnicodeObject *u;
4440 Py_UNICODE *p;
4441 int nchars;
4442 size_t nbytes;
4444 if (len < 0)
4445 len = 0;
4447 if (len == 1 && PyUnicode_CheckExact(str)) {
4448 /* no repeat, return original string */
4449 Py_INCREF(str);
4450 return (PyObject*) str;
4453 /* ensure # of chars needed doesn't overflow int and # of bytes
4454 * needed doesn't overflow size_t
4456 nchars = len * str->length;
4457 if (len && nchars / len != str->length) {
4458 PyErr_SetString(PyExc_OverflowError,
4459 "repeated string is too long");
4460 return NULL;
4462 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4463 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4464 PyErr_SetString(PyExc_OverflowError,
4465 "repeated string is too long");
4466 return NULL;
4468 u = _PyUnicode_New(nchars);
4469 if (!u)
4470 return NULL;
4472 p = u->str;
4474 while (len-- > 0) {
4475 Py_UNICODE_COPY(p, str->str, str->length);
4476 p += str->length;
4479 return (PyObject*) u;
4482 PyObject *PyUnicode_Replace(PyObject *obj,
4483 PyObject *subobj,
4484 PyObject *replobj,
4485 int maxcount)
4487 PyObject *self;
4488 PyObject *str1;
4489 PyObject *str2;
4490 PyObject *result;
4492 self = PyUnicode_FromObject(obj);
4493 if (self == NULL)
4494 return NULL;
4495 str1 = PyUnicode_FromObject(subobj);
4496 if (str1 == NULL) {
4497 Py_DECREF(self);
4498 return NULL;
4500 str2 = PyUnicode_FromObject(replobj);
4501 if (str2 == NULL) {
4502 Py_DECREF(self);
4503 Py_DECREF(str1);
4504 return NULL;
4506 result = replace((PyUnicodeObject *)self,
4507 (PyUnicodeObject *)str1,
4508 (PyUnicodeObject *)str2,
4509 maxcount);
4510 Py_DECREF(self);
4511 Py_DECREF(str1);
4512 Py_DECREF(str2);
4513 return result;
4516 static char replace__doc__[] =
4517 "S.replace (old, new[, maxsplit]) -> unicode\n\
4519 Return a copy of S with all occurrences of substring\n\
4520 old replaced by new. If the optional argument maxsplit is\n\
4521 given, only the first maxsplit occurrences are replaced.";
4523 static PyObject*
4524 unicode_replace(PyUnicodeObject *self, PyObject *args)
4526 PyUnicodeObject *str1;
4527 PyUnicodeObject *str2;
4528 int maxcount = -1;
4529 PyObject *result;
4531 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4532 return NULL;
4533 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4534 if (str1 == NULL)
4535 return NULL;
4536 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4537 if (str2 == NULL)
4538 return NULL;
4540 result = replace(self, str1, str2, maxcount);
4542 Py_DECREF(str1);
4543 Py_DECREF(str2);
4544 return result;
/* Going by its name, the repr implementation for unicode objects: passes
   the object's Py_UNICODE buffer and its size to unicodeescape_string()
   and returns that call's result.
   NOTE(review): the remaining argument(s) and closing parenthesis of the
   call are not visible in this copy of the file -- confirm against the
   upstream source before editing. */
4547 static
4548 PyObject *unicode_repr(PyObject *unicode)
4550 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4551 PyUnicode_GET_SIZE(unicode),
4555 static char rfind__doc__[] =
4556 "S.rfind(sub [,start [,end]]) -> int\n\
4558 Return the highest index in S where substring sub is found,\n\
4559 such that sub is contained within s[start,end]. Optional\n\
4560 arguments start and end are interpreted as in slice notation.\n\
4562 Return -1 on failure.";
4564 static PyObject *
4565 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4567 PyUnicodeObject *substring;
4568 int start = 0;
4569 int end = INT_MAX;
4570 PyObject *result;
4572 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4574 return NULL;
4575 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4576 (PyObject *)substring);
4577 if (substring == NULL)
4578 return NULL;
4580 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4582 Py_DECREF(substring);
4583 return result;
4586 static char rindex__doc__[] =
4587 "S.rindex(sub [,start [,end]]) -> int\n\
4589 Like S.rfind() but raise ValueError when the substring is not found.";
4591 static PyObject *
4592 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4594 int result;
4595 PyUnicodeObject *substring;
4596 int start = 0;
4597 int end = INT_MAX;
4599 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4600 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4601 return NULL;
4602 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4603 (PyObject *)substring);
4604 if (substring == NULL)
4605 return NULL;
4607 result = findstring(self, substring, start, end, -1);
4609 Py_DECREF(substring);
4610 if (result < 0) {
4611 PyErr_SetString(PyExc_ValueError, "substring not found");
4612 return NULL;
4614 return PyInt_FromLong(result);
4617 static char rjust__doc__[] =
4618 "S.rjust(width) -> unicode\n\
4620 Return S right justified in a Unicode string of length width. Padding is\n\
4621 done using spaces.";
4623 static PyObject *
4624 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4626 int width;
4627 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4628 return NULL;
4630 if (self->length >= width && PyUnicode_CheckExact(self)) {
4631 Py_INCREF(self);
4632 return (PyObject*) self;
4635 return (PyObject*) pad(self, width - self->length, 0, ' ');
4638 static char rstrip__doc__[] =
4639 "S.rstrip() -> unicode\n\
4641 Return a copy of the string S with trailing whitespace removed.";
4643 static PyObject *
4644 unicode_rstrip(PyUnicodeObject *self)
4646 return strip(self, 0, 1);
4649 static PyObject*
4650 unicode_slice(PyUnicodeObject *self, int start, int end)
4652 /* standard clamping */
4653 if (start < 0)
4654 start = 0;
4655 if (end < 0)
4656 end = 0;
4657 if (end > self->length)
4658 end = self->length;
4659 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4660 /* full slice, return original string */
4661 Py_INCREF(self);
4662 return (PyObject*) self;
4664 if (start > end)
4665 start = end;
4666 /* copy slice */
4667 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4668 end - start);
4671 PyObject *PyUnicode_Split(PyObject *s,
4672 PyObject *sep,
4673 int maxsplit)
4675 PyObject *result;
4677 s = PyUnicode_FromObject(s);
4678 if (s == NULL)
4679 return NULL;
4680 if (sep != NULL) {
4681 sep = PyUnicode_FromObject(sep);
4682 if (sep == NULL) {
4683 Py_DECREF(s);
4684 return NULL;
4688 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4690 Py_DECREF(s);
4691 Py_XDECREF(sep);
4692 return result;
4695 static char split__doc__[] =
4696 "S.split([sep [,maxsplit]]) -> list of strings\n\
4698 Return a list of the words in S, using sep as the\n\
4699 delimiter string. If maxsplit is given, at most maxsplit\n\
4700 splits are done. If sep is not specified, any whitespace string\n\
4701 is a separator.";
4703 static PyObject*
4704 unicode_split(PyUnicodeObject *self, PyObject *args)
4706 PyObject *substring = Py_None;
4707 int maxcount = -1;
4709 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4710 return NULL;
4712 if (substring == Py_None)
4713 return split(self, NULL, maxcount);
4714 else if (PyUnicode_Check(substring))
4715 return split(self, (PyUnicodeObject *)substring, maxcount);
4716 else
4717 return PyUnicode_Split((PyObject *)self, substring, maxcount);
/* Docstring for unicode.splitlines().  The signature line shows a single
   optional argument, so its brackets must balance. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
4727 static PyObject*
4728 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4730 int keepends = 0;
4732 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4733 return NULL;
4735 return PyUnicode_Splitlines((PyObject *)self, keepends);
4738 static
4739 PyObject *unicode_str(PyUnicodeObject *self)
4741 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4744 static char strip__doc__[] =
4745 "S.strip() -> unicode\n\
4747 Return a copy of S with leading and trailing whitespace removed.";
4749 static PyObject *
4750 unicode_strip(PyUnicodeObject *self)
4752 return strip(self, 1, 1);
4755 static char swapcase__doc__[] =
4756 "S.swapcase() -> unicode\n\
4758 Return a copy of S with uppercase characters converted to lowercase\n\
4759 and vice versa.";
4761 static PyObject*
4762 unicode_swapcase(PyUnicodeObject *self)
4764 return fixup(self, fixswapcase);
4767 static char translate__doc__[] =
4768 "S.translate(table) -> unicode\n\
4770 Return a copy of the string S, where all characters have been mapped\n\
4771 through the given translation table, which must be a mapping of\n\
4772 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4773 are left untouched. Characters mapped to None are deleted.";
4775 static PyObject*
4776 unicode_translate(PyUnicodeObject *self, PyObject *table)
4778 return PyUnicode_TranslateCharmap(self->str,
4779 self->length,
4780 table,
4781 "ignore");
4784 static char upper__doc__[] =
4785 "S.upper() -> unicode\n\
4787 Return a copy of S converted to uppercase.";
4789 static PyObject*
4790 unicode_upper(PyUnicodeObject *self)
4792 return fixup(self, fixupper);
4795 #if 0
4796 static char zfill__doc__[] =
4797 "S.zfill(width) -> unicode\n\
4799 Pad a numeric string x with zeros on the left, to fill a field\n\
4800 of the specified width. The string x is never truncated.";
4802 static PyObject *
4803 unicode_zfill(PyUnicodeObject *self, PyObject *args)
4805 int fill;
4806 PyUnicodeObject *u;
4808 int width;
4809 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4810 return NULL;
4812 if (self->length >= width) {
4813 Py_INCREF(self);
4814 return (PyObject*) self;
4817 fill = width - self->length;
4819 u = pad(self, fill, 0, '0');
4821 if (u->str[fill] == '+' || u->str[fill] == '-') {
4822 /* move sign to beginning of string */
4823 u->str[0] = u->str[fill];
4824 u->str[fill] = '0';
4827 return (PyObject*) u;
4829 #endif
4831 #if 0
4832 static PyObject*
4833 unicode_freelistsize(PyUnicodeObject *self)
4835 return PyInt_FromLong(unicode_freelist_size);
4837 #endif
4839 static char startswith__doc__[] =
4840 "S.startswith(prefix[, start[, end]]) -> int\n\
4842 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4843 optional start, test S beginning at that position. With optional end, stop\n\
4844 comparing S at that position.";
4846 static PyObject *
4847 unicode_startswith(PyUnicodeObject *self,
4848 PyObject *args)
4850 PyUnicodeObject *substring;
4851 int start = 0;
4852 int end = INT_MAX;
4853 PyObject *result;
4855 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4856 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4857 return NULL;
4858 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4859 (PyObject *)substring);
4860 if (substring == NULL)
4861 return NULL;
4863 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4865 Py_DECREF(substring);
4866 return result;
4870 static char endswith__doc__[] =
4871 "S.endswith(suffix[, start[, end]]) -> int\n\
4873 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4874 optional start, test S beginning at that position. With optional end, stop\n\
4875 comparing S at that position.";
4877 static PyObject *
4878 unicode_endswith(PyUnicodeObject *self,
4879 PyObject *args)
4881 PyUnicodeObject *substring;
4882 int start = 0;
4883 int end = INT_MAX;
4884 PyObject *result;
4886 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4887 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4888 return NULL;
4889 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4890 (PyObject *)substring);
4891 if (substring == NULL)
4892 return NULL;
4894 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4896 Py_DECREF(substring);
4897 return result;
4901 static PyMethodDef unicode_methods[] = {
4903 /* Order is according to common usage: often used methods should
4904 appear first, since lookup is done sequentially. */
4906 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4907 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4908 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4909 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4910 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4911 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4912 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4913 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4914 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4915 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4916 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4917 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4918 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4919 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4920 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4921 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4922 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4923 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4924 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4925 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4926 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4927 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4928 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4929 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4930 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4931 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4932 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4933 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4934 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4935 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4936 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4937 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4938 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4939 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4940 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
4941 #if 0
4942 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4943 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
4944 #endif
4946 #if 0
4947 /* This one is just used for debugging the implementation. */
4948 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
4949 #endif
4951 {NULL, NULL}
4954 static PySequenceMethods unicode_as_sequence = {
4955 (inquiry) unicode_length, /* sq_length */
4956 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4957 (intargfunc) unicode_repeat, /* sq_repeat */
4958 (intargfunc) unicode_getitem, /* sq_item */
4959 (intintargfunc) unicode_slice, /* sq_slice */
4960 0, /* sq_ass_item */
4961 0, /* sq_ass_slice */
4962 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4965 static int
4966 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4967 int index,
4968 const void **ptr)
4970 if (index != 0) {
4971 PyErr_SetString(PyExc_SystemError,
4972 "accessing non-existent unicode segment");
4973 return -1;
4975 *ptr = (void *) self->str;
4976 return PyUnicode_GET_DATA_SIZE(self);
4979 static int
4980 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4981 const void **ptr)
4983 PyErr_SetString(PyExc_TypeError,
4984 "cannot use unicode as modifyable buffer");
4985 return -1;
4988 static int
4989 unicode_buffer_getsegcount(PyUnicodeObject *self,
4990 int *lenp)
4992 if (lenp)
4993 *lenp = PyUnicode_GET_DATA_SIZE(self);
4994 return 1;
4997 static int
4998 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4999 int index,
5000 const void **ptr)
5002 PyObject *str;
5004 if (index != 0) {
5005 PyErr_SetString(PyExc_SystemError,
5006 "accessing non-existent unicode segment");
5007 return -1;
5009 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5010 if (str == NULL)
5011 return -1;
5012 *ptr = (void *) PyString_AS_STRING(str);
5013 return PyString_GET_SIZE(str);
5016 /* Helpers for PyUnicode_Format() */
5018 static PyObject *
5019 getnextarg(PyObject *args, int arglen, int *p_argidx)
5021 int argidx = *p_argidx;
5022 if (argidx < arglen) {
5023 (*p_argidx)++;
5024 if (arglen < 0)
5025 return args;
5026 else
5027 return PyTuple_GetItem(args, argidx);
5029 PyErr_SetString(PyExc_TypeError,
5030 "not enough arguments for format string");
5031 return NULL;
5034 #define F_LJUST (1<<0)
5035 #define F_SIGN (1<<1)
5036 #define F_BLANK (1<<2)
5037 #define F_ALT (1<<3)
5038 #define F_ZERO (1<<4)
5040 static
5041 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5043 register int i;
5044 int len;
5045 va_list va;
5046 char *charbuffer;
5047 va_start(va, format);
5049 /* First, format the string as char array, then expand to Py_UNICODE
5050 array. */
5051 charbuffer = (char *)buffer;
5052 len = vsprintf(charbuffer, format, va);
5053 for (i = len - 1; i >= 0; i--)
5054 buffer[i] = (Py_UNICODE) charbuffer[i];
5056 va_end(va);
5057 return len;
5060 static int
5061 formatfloat(Py_UNICODE *buf,
5062 size_t buflen,
5063 int flags,
5064 int prec,
5065 int type,
5066 PyObject *v)
5068 /* fmt = '%#.' + `prec` + `type`
5069 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5070 char fmt[20];
5071 double x;
5073 x = PyFloat_AsDouble(v);
5074 if (x == -1.0 && PyErr_Occurred())
5075 return -1;
5076 if (prec < 0)
5077 prec = 6;
5078 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5079 type = 'g';
5080 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5081 (flags & F_ALT) ? "#" : "", prec, type);
5082 /* worst case length calc to ensure no buffer overrun:
5083 fmt = %#.<prec>g
5084 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5085 for any double rep.)
5086 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5087 If prec=0 the effective precision is 1 (the leading digit is
5088 always given), therefore increase by one to 10+prec. */
5089 if (buflen <= (size_t)10 + (size_t)prec) {
5090 PyErr_SetString(PyExc_OverflowError,
5091 "formatted float is too long (precision too long?)");
5092 return -1;
5094 return usprintf(buf, fmt, x);
5097 static PyObject*
5098 formatlong(PyObject *val, int flags, int prec, int type)
5100 char *buf;
5101 int i, len;
5102 PyObject *str; /* temporary string object. */
5103 PyUnicodeObject *result;
5105 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5106 if (!str)
5107 return NULL;
5108 result = _PyUnicode_New(len);
5109 for (i = 0; i < len; i++)
5110 result->str[i] = buf[i];
5111 result->str[len] = 0;
5112 Py_DECREF(str);
5113 return (PyObject*)result;
5116 static int
5117 formatint(Py_UNICODE *buf,
5118 size_t buflen,
5119 int flags,
5120 int prec,
5121 int type,
5122 PyObject *v)
5124 /* fmt = '%#.' + `prec` + 'l' + `type`
5125 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5126 + 1 + 1 = 24*/
5127 char fmt[64]; /* plenty big enough! */
5128 long x;
5129 int use_native_c_format = 1;
5131 x = PyInt_AsLong(v);
5132 if (x == -1 && PyErr_Occurred())
5133 return -1;
5134 if (prec < 0)
5135 prec = 1;
5136 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5137 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5138 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5139 PyErr_SetString(PyExc_OverflowError,
5140 "formatted integer is too long (precision too long?)");
5141 return -1;
5143 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5144 * but we want it (for consistency with other %#x conversions, and
5145 * for consistency with Python's hex() function).
5146 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5147 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5148 * So add it only if the platform doesn't already.
5150 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5151 /* Only way to know what the platform does is to try it. */
5152 PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
5153 if (fmt[1] != (char)type) {
5154 /* Supply our own leading 0x/0X -- needed under std C */
5155 use_native_c_format = 0;
5156 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
5159 if (use_native_c_format)
5160 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5161 (flags & F_ALT) ? "#" : "", prec, type);
5162 return usprintf(buf, fmt, x);
5165 static int
5166 formatchar(Py_UNICODE *buf,
5167 size_t buflen,
5168 PyObject *v)
5170 /* presume that the buffer is at least 2 characters long */
5171 if (PyUnicode_Check(v)) {
5172 if (PyUnicode_GET_SIZE(v) != 1)
5173 goto onError;
5174 buf[0] = PyUnicode_AS_UNICODE(v)[0];
5177 else if (PyString_Check(v)) {
5178 if (PyString_GET_SIZE(v) != 1)
5179 goto onError;
5180 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5183 else {
5184 /* Integer input truncated to a character */
5185 long x;
5186 x = PyInt_AsLong(v);
5187 if (x == -1 && PyErr_Occurred())
5188 goto onError;
5189 buf[0] = (char) x;
5191 buf[1] = '\0';
5192 return 1;
5194 onError:
5195 PyErr_SetString(PyExc_TypeError,
5196 "%c requires int or char");
5197 return -1;
5200 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5202 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5203 chars are formatted. XXX This is a magic number. Each formatting
5204 routine does bounds checking to ensure no overflow, but a better
5205 solution may be to malloc a buffer of appropriate size for each
5206 format. For now, the current solution is sufficient.
5208 #define FORMATBUFLEN (size_t)120
5210 PyObject *PyUnicode_Format(PyObject *format,
5211 PyObject *args)
5213 Py_UNICODE *fmt, *res;
5214 int fmtcnt, rescnt, reslen, arglen, argidx;
5215 int args_owned = 0;
5216 PyUnicodeObject *result = NULL;
5217 PyObject *dict = NULL;
5218 PyObject *uformat;
5220 if (format == NULL || args == NULL) {
5221 PyErr_BadInternalCall();
5222 return NULL;
5224 uformat = PyUnicode_FromObject(format);
5225 if (uformat == NULL)
5226 return NULL;
5227 fmt = PyUnicode_AS_UNICODE(uformat);
5228 fmtcnt = PyUnicode_GET_SIZE(uformat);
5230 reslen = rescnt = fmtcnt + 100;
5231 result = _PyUnicode_New(reslen);
5232 if (result == NULL)
5233 goto onError;
5234 res = PyUnicode_AS_UNICODE(result);
5236 if (PyTuple_Check(args)) {
5237 arglen = PyTuple_Size(args);
5238 argidx = 0;
5240 else {
5241 arglen = -1;
5242 argidx = -2;
5244 if (args->ob_type->tp_as_mapping)
5245 dict = args;
5247 while (--fmtcnt >= 0) {
5248 if (*fmt != '%') {
5249 if (--rescnt < 0) {
5250 rescnt = fmtcnt + 100;
5251 reslen += rescnt;
5252 if (_PyUnicode_Resize(&result, reslen) < 0)
5253 return NULL;
5254 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5255 --rescnt;
5257 *res++ = *fmt++;
5259 else {
5260 /* Got a format specifier */
5261 int flags = 0;
5262 int width = -1;
5263 int prec = -1;
5264 Py_UNICODE c = '\0';
5265 Py_UNICODE fill;
5266 PyObject *v = NULL;
5267 PyObject *temp = NULL;
5268 Py_UNICODE *pbuf;
5269 Py_UNICODE sign;
5270 int len;
5271 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5273 fmt++;
5274 if (*fmt == '(') {
5275 Py_UNICODE *keystart;
5276 int keylen;
5277 PyObject *key;
5278 int pcount = 1;
5280 if (dict == NULL) {
5281 PyErr_SetString(PyExc_TypeError,
5282 "format requires a mapping");
5283 goto onError;
5285 ++fmt;
5286 --fmtcnt;
5287 keystart = fmt;
5288 /* Skip over balanced parentheses */
5289 while (pcount > 0 && --fmtcnt >= 0) {
5290 if (*fmt == ')')
5291 --pcount;
5292 else if (*fmt == '(')
5293 ++pcount;
5294 fmt++;
5296 keylen = fmt - keystart - 1;
5297 if (fmtcnt < 0 || pcount > 0) {
5298 PyErr_SetString(PyExc_ValueError,
5299 "incomplete format key");
5300 goto onError;
5302 #if 0
5303 /* keys are converted to strings using UTF-8 and
5304 then looked up since Python uses strings to hold
5305 variables names etc. in its namespaces and we
5306 wouldn't want to break common idioms. */
5307 key = PyUnicode_EncodeUTF8(keystart,
5308 keylen,
5309 NULL);
5310 #else
5311 key = PyUnicode_FromUnicode(keystart, keylen);
5312 #endif
5313 if (key == NULL)
5314 goto onError;
5315 if (args_owned) {
5316 Py_DECREF(args);
5317 args_owned = 0;
5319 args = PyObject_GetItem(dict, key);
5320 Py_DECREF(key);
5321 if (args == NULL) {
5322 goto onError;
5324 args_owned = 1;
5325 arglen = -1;
5326 argidx = -2;
5328 while (--fmtcnt >= 0) {
5329 switch (c = *fmt++) {
5330 case '-': flags |= F_LJUST; continue;
5331 case '+': flags |= F_SIGN; continue;
5332 case ' ': flags |= F_BLANK; continue;
5333 case '#': flags |= F_ALT; continue;
5334 case '0': flags |= F_ZERO; continue;
5336 break;
5338 if (c == '*') {
5339 v = getnextarg(args, arglen, &argidx);
5340 if (v == NULL)
5341 goto onError;
5342 if (!PyInt_Check(v)) {
5343 PyErr_SetString(PyExc_TypeError,
5344 "* wants int");
5345 goto onError;
5347 width = PyInt_AsLong(v);
5348 if (width < 0) {
5349 flags |= F_LJUST;
5350 width = -width;
5352 if (--fmtcnt >= 0)
5353 c = *fmt++;
5355 else if (c >= '0' && c <= '9') {
5356 width = c - '0';
5357 while (--fmtcnt >= 0) {
5358 c = *fmt++;
5359 if (c < '0' || c > '9')
5360 break;
5361 if ((width*10) / 10 != width) {
5362 PyErr_SetString(PyExc_ValueError,
5363 "width too big");
5364 goto onError;
5366 width = width*10 + (c - '0');
5369 if (c == '.') {
5370 prec = 0;
5371 if (--fmtcnt >= 0)
5372 c = *fmt++;
5373 if (c == '*') {
5374 v = getnextarg(args, arglen, &argidx);
5375 if (v == NULL)
5376 goto onError;
5377 if (!PyInt_Check(v)) {
5378 PyErr_SetString(PyExc_TypeError,
5379 "* wants int");
5380 goto onError;
5382 prec = PyInt_AsLong(v);
5383 if (prec < 0)
5384 prec = 0;
5385 if (--fmtcnt >= 0)
5386 c = *fmt++;
5388 else if (c >= '0' && c <= '9') {
5389 prec = c - '0';
5390 while (--fmtcnt >= 0) {
5391 c = Py_CHARMASK(*fmt++);
5392 if (c < '0' || c > '9')
5393 break;
5394 if ((prec*10) / 10 != prec) {
5395 PyErr_SetString(PyExc_ValueError,
5396 "prec too big");
5397 goto onError;
5399 prec = prec*10 + (c - '0');
5402 } /* prec */
5403 if (fmtcnt >= 0) {
5404 if (c == 'h' || c == 'l' || c == 'L') {
5405 if (--fmtcnt >= 0)
5406 c = *fmt++;
5409 if (fmtcnt < 0) {
5410 PyErr_SetString(PyExc_ValueError,
5411 "incomplete format");
5412 goto onError;
5414 if (c != '%') {
5415 v = getnextarg(args, arglen, &argidx);
5416 if (v == NULL)
5417 goto onError;
5419 sign = 0;
5420 fill = ' ';
5421 switch (c) {
5423 case '%':
5424 pbuf = formatbuf;
5425 /* presume that buffer length is at least 1 */
5426 pbuf[0] = '%';
5427 len = 1;
5428 break;
5430 case 's':
5431 case 'r':
5432 if (PyUnicode_Check(v) && c == 's') {
5433 temp = v;
5434 Py_INCREF(temp);
5436 else {
5437 PyObject *unicode;
5438 if (c == 's')
5439 temp = PyObject_Str(v);
5440 else
5441 temp = PyObject_Repr(v);
5442 if (temp == NULL)
5443 goto onError;
5444 if (!PyString_Check(temp)) {
5445 /* XXX Note: this should never happen, since
5446 PyObject_Repr() and PyObject_Str() assure
5447 this */
5448 Py_DECREF(temp);
5449 PyErr_SetString(PyExc_TypeError,
5450 "%s argument has non-string str()");
5451 goto onError;
5453 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5454 PyString_GET_SIZE(temp),
5455 NULL,
5456 "strict");
5457 Py_DECREF(temp);
5458 temp = unicode;
5459 if (temp == NULL)
5460 goto onError;
5462 pbuf = PyUnicode_AS_UNICODE(temp);
5463 len = PyUnicode_GET_SIZE(temp);
5464 if (prec >= 0 && len > prec)
5465 len = prec;
5466 break;
5468 case 'i':
5469 case 'd':
5470 case 'u':
5471 case 'o':
5472 case 'x':
5473 case 'X':
5474 if (c == 'i')
5475 c = 'd';
5476 if (PyLong_Check(v)) {
5477 temp = formatlong(v, flags, prec, c);
5478 if (!temp)
5479 goto onError;
5480 pbuf = PyUnicode_AS_UNICODE(temp);
5481 len = PyUnicode_GET_SIZE(temp);
5482 /* unbounded ints can always produce
5483 a sign character! */
5484 sign = 1;
5486 else {
5487 pbuf = formatbuf;
5488 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5489 flags, prec, c, v);
5490 if (len < 0)
5491 goto onError;
5492 /* only d conversion is signed */
5493 sign = c == 'd';
5495 if (flags & F_ZERO)
5496 fill = '0';
5497 break;
5499 case 'e':
5500 case 'E':
5501 case 'f':
5502 case 'g':
5503 case 'G':
5504 pbuf = formatbuf;
5505 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5506 flags, prec, c, v);
5507 if (len < 0)
5508 goto onError;
5509 sign = 1;
5510 if (flags & F_ZERO)
5511 fill = '0';
5512 break;
5514 case 'c':
5515 pbuf = formatbuf;
5516 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5517 if (len < 0)
5518 goto onError;
5519 break;
5521 default:
5522 PyErr_Format(PyExc_ValueError,
5523 "unsupported format character '%c' (0x%x) "
5524 "at index %i",
5525 (31<=c && c<=126) ? c : '?',
5526 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5527 goto onError;
5529 if (sign) {
5530 if (*pbuf == '-' || *pbuf == '+') {
5531 sign = *pbuf++;
5532 len--;
5534 else if (flags & F_SIGN)
5535 sign = '+';
5536 else if (flags & F_BLANK)
5537 sign = ' ';
5538 else
5539 sign = 0;
5541 if (width < len)
5542 width = len;
5543 if (rescnt < width + (sign != 0)) {
5544 reslen -= rescnt;
5545 rescnt = width + fmtcnt + 100;
5546 reslen += rescnt;
5547 if (_PyUnicode_Resize(&result, reslen) < 0)
5548 return NULL;
5549 res = PyUnicode_AS_UNICODE(result)
5550 + reslen - rescnt;
5552 if (sign) {
5553 if (fill != ' ')
5554 *res++ = sign;
5555 rescnt--;
5556 if (width > len)
5557 width--;
5559 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5560 assert(pbuf[0] == '0');
5561 assert(pbuf[1] == c);
5562 if (fill != ' ') {
5563 *res++ = *pbuf++;
5564 *res++ = *pbuf++;
5566 rescnt -= 2;
5567 width -= 2;
5568 if (width < 0)
5569 width = 0;
5570 len -= 2;
5572 if (width > len && !(flags & F_LJUST)) {
5573 do {
5574 --rescnt;
5575 *res++ = fill;
5576 } while (--width > len);
5578 if (fill == ' ') {
5579 if (sign)
5580 *res++ = sign;
5581 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5582 assert(pbuf[0] == '0');
5583 assert(pbuf[1] == c);
5584 *res++ = *pbuf++;
5585 *res++ = *pbuf++;
5588 Py_UNICODE_COPY(res, pbuf, len);
5589 res += len;
5590 rescnt -= len;
5591 while (--width >= len) {
5592 --rescnt;
5593 *res++ = ' ';
5595 if (dict && (argidx < arglen) && c != '%') {
5596 PyErr_SetString(PyExc_TypeError,
5597 "not all arguments converted");
5598 goto onError;
5600 Py_XDECREF(temp);
5601 } /* '%' */
5602 } /* until end */
5603 if (argidx < arglen && !dict) {
5604 PyErr_SetString(PyExc_TypeError,
5605 "not all arguments converted");
5606 goto onError;
5609 if (args_owned) {
5610 Py_DECREF(args);
5612 Py_DECREF(uformat);
5613 if (_PyUnicode_Resize(&result, reslen - rescnt))
5614 goto onError;
5615 return (PyObject *)result;
5617 onError:
5618 Py_XDECREF(result);
5619 Py_DECREF(uformat);
5620 if (args_owned) {
5621 Py_DECREF(args);
5623 return NULL;
/* Buffer-interface slot table for unicode objects.  It is hooked into
   the type through the tp_as_buffer slot of PyUnicode_Type.  The four
   entries are, in order: read buffer, write buffer, segment count and
   character buffer; each unicode_buffer_* function is cast to the
   corresponding slot function type. */
5626 static PyBufferProcs unicode_as_buffer = {
5627 (getreadbufferproc) unicode_buffer_getreadbuf,
5628 (getwritebufferproc) unicode_buffer_getwritebuf,
5629 (getsegcountproc) unicode_buffer_getsegcount,
5630 (getcharbufferproc) unicode_buffer_getcharbuf,
/* Forward declaration: unicode_new() dispatches to unicode_subtype_new()
   for subtypes, and the latter is defined after it. */
5633 staticforward PyObject *
5634 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5636 static PyObject *
5637 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5639 PyObject *x = NULL;
5640 static char *kwlist[] = {"string", "encoding", "errors", 0};
5641 char *encoding = NULL;
5642 char *errors = NULL;
5644 if (type != &PyUnicode_Type)
5645 return unicode_subtype_new(type, args, kwds);
5646 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5647 kwlist, &x, &encoding, &errors))
5648 return NULL;
5649 if (x == NULL)
5650 return (PyObject *)_PyUnicode_New(0);
5651 if (encoding == NULL && errors == NULL)
5652 return PyObject_Unicode(x);
5653 else
5654 return PyUnicode_FromEncodedObject(x, encoding, errors);
5657 static PyObject *
5658 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5660 PyUnicodeObject *tmp, *pnew;
5661 int n;
5663 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5664 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5665 if (tmp == NULL)
5666 return NULL;
5667 assert(PyUnicode_Check(tmp));
5668 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5669 if (pnew == NULL)
5670 return NULL;
5671 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5672 if (pnew->str == NULL) {
5673 _Py_ForgetReference((PyObject *)pnew);
5674 PyObject_DEL(pnew);
5675 return NULL;
5677 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5678 pnew->length = n;
5679 pnew->hash = tmp->hash;
5680 Py_DECREF(tmp);
5681 return (PyObject *)pnew;
/* Docstring of the unicode type; PyUnicode_Type refers to it in its
   tp_doc slot.  The text is a single string literal spread over
   several source lines with backslash continuations. */
5684 static char unicode_doc[] =
5685 "unicode(string [, encoding[, errors]]) -> object\n\
5687 Create a new Unicode object from the given encoded string.\n\
5688 encoding defaults to the current default string encoding and \n\
5689 errors, defining the error handling, to 'strict'.";
/* Type object of the built-in "unicode" type.

   The initializer is positional, so the order of the entries must match
   the field order of PyTypeObject; each entry is labelled with the slot
   it fills.  Slots set to 0 are left unfilled here.  The type is
   declared subclassable (Py_TPFLAGS_BASETYPE); instances are created
   through unicode_new (tp_new) and their memory is released with
   _PyObject_Del (tp_free). */
5691 PyTypeObject PyUnicode_Type = {
5692 PyObject_HEAD_INIT(&PyType_Type)
5693 0, /* ob_size */
5694 "unicode", /* tp_name */
5695 sizeof(PyUnicodeObject), /* tp_basicsize */
5696 0, /* tp_itemsize */
5697 /* Slots */
5698 (destructor)unicode_dealloc, /* tp_dealloc */
5699 0, /* tp_print */
5700 0, /* tp_getattr */
5701 0, /* tp_setattr */
5702 (cmpfunc) unicode_compare, /* tp_compare */
5703 (reprfunc) unicode_repr, /* tp_repr */
5704 0, /* tp_as_number */
5705 &unicode_as_sequence, /* tp_as_sequence */
5706 0, /* tp_as_mapping */
5707 (hashfunc) unicode_hash, /* tp_hash*/
5708 0, /* tp_call*/
5709 (reprfunc) unicode_str, /* tp_str */
5710 PyObject_GenericGetAttr, /* tp_getattro */
5711 0, /* tp_setattro */
5712 &unicode_as_buffer, /* tp_as_buffer */
5713 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5714 unicode_doc, /* tp_doc */
5715 0, /* tp_traverse */
5716 0, /* tp_clear */
5717 0, /* tp_richcompare */
5718 0, /* tp_weaklistoffset */
5719 0, /* tp_iter */
5720 0, /* tp_iternext */
5721 unicode_methods, /* tp_methods */
5722 0, /* tp_members */
5723 0, /* tp_getset */
5724 0, /* tp_base */
5725 0, /* tp_dict */
5726 0, /* tp_descr_get */
5727 0, /* tp_descr_set */
5728 0, /* tp_dictoffset */
5729 0, /* tp_init */
5730 0, /* tp_alloc */
5731 unicode_new, /* tp_new */
5732 _PyObject_Del, /* tp_free */
5735 /* Initialize the Unicode implementation */
5737 void _PyUnicode_Init(void)
5739 int i;
5741 /* Init the implementation */
5742 unicode_freelist = NULL;
5743 unicode_freelist_size = 0;
5744 unicode_empty = _PyUnicode_New(0);
5745 strcpy(unicode_default_encoding, "ascii");
5746 for (i = 0; i < 256; i++)
5747 unicode_latin1[i] = NULL;
5750 /* Finalize the Unicode implementation */
5752 void
5753 _PyUnicode_Fini(void)
5755 PyUnicodeObject *u;
5756 int i;
5758 Py_XDECREF(unicode_empty);
5759 unicode_empty = NULL;
5761 for (i = 0; i < 256; i++) {
5762 if (unicode_latin1[i]) {
5763 Py_DECREF(unicode_latin1[i]);
5764 unicode_latin1[i] = NULL;
5768 for (u = unicode_freelist; u != NULL;) {
5769 PyUnicodeObject *v = u;
5770 u = *(PyUnicodeObject **)u;
5771 if (v->str)
5772 PyMem_DEL(v->str);
5773 Py_XDECREF(v->defenc);
5774 PyObject_DEL(v);
5776 unicode_freelist = NULL;
5777 unicode_freelist_size = 0;