This commit was manufactured by cvs2svn to create tag 'r221'.
[python/dscho.git] / Objects / unicodeobject.c
blob5a8777b39643892d40428e84f6a3ad41d282a480
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WIN32
45 #include <windows.h>
46 #endif
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size threshold for the free list "keep alive" optimization.

   Objects placed on the free list keep their character buffer when
   their length is below this limit, which saves a malloc() for small
   Unicode objects.

   Worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory around.

   A limit of 0 effectively disables the feature.

   Note: This feature is experimental.  If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
75 #else
76 # define BYTEORDER_IS_LITTLE_ENDIAN
77 #endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject *unicode_freelist;
88 static int unicode_freelist_size;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject *unicode_empty;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject *unicode_latin1[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
137 PyErr_SetString(PyExc_SystemError,
138 "can't resize shared unicode objects");
139 return -1;
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
151 unicode->str[length] = 0;
152 unicode->length = length;
154 reset:
155 /* Reset the object caches */
156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
160 unicode->hash = -1;
162 return 0;
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be enhanced further by making sure that the
   free list never shrinks below a size of 1.

*/
173 static
174 PyUnicodeObject *_PyUnicode_New(int length)
176 register PyUnicodeObject *unicode;
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
187 unicode_freelist = *(PyUnicodeObject **)unicode;
188 unicode_freelist_size--;
189 if (unicode->str) {
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
193 unicode_resize(unicode, length)) {
194 PyMem_DEL(unicode->str);
195 goto onError;
198 else {
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
201 PyObject_INIT(unicode, &PyUnicode_Type);
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
210 if (!unicode->str) {
211 PyErr_NoMemory();
212 goto onError;
214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
217 unicode->defenc = NULL;
218 return unicode;
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
222 PyObject_DEL(unicode);
223 return NULL;
226 static
227 void unicode_dealloc(register PyUnicodeObject *unicode)
229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
233 PyMem_DEL(unicode->str);
234 unicode->str = NULL;
235 unicode->length = 0;
237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
241 /* Add to free list */
242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
246 else {
247 PyMem_DEL(unicode->str);
248 Py_XDECREF(unicode->defenc);
249 unicode->ob_type->tp_free((PyObject *)unicode);
253 int PyUnicode_Resize(PyObject **unicode,
254 int length)
256 register PyUnicodeObject *v;
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
/* Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable; for use in unicodeobject.c only ! */
#define _PyUnicode_Resize(objaddr, newlength) \
        PyUnicode_Resize(((PyObject **)(objaddr)), newlength)
292 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
295 PyUnicodeObject *unicode;
297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
313 if (!unicode)
314 return NULL;
315 unicode->str[0] = *u;
316 unicode_latin1[*u] = unicode;
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
329 Py_UNICODE_COPY(unicode->str, u, size);
331 return (PyObject *)unicode;
334 #ifdef HAVE_WCHAR_H
336 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
339 PyUnicodeObject *unicode;
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
350 /* Copy the wchar_t data into the new object */
351 #ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353 #else
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
361 #endif
363 return (PyObject *)unicode;
366 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376 #ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378 #else
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
386 #endif
388 return size;
391 #endif
393 PyObject *PyUnicode_FromObject(register PyObject *obj)
395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
410 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
414 const char *s = NULL;
415 int len;
416 int owned = 0;
417 PyObject *v;
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
424 #if 0
425 /* For b/w compatibility we also accept Unicode objects provided
426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
438 return NULL;
440 return PyObject_Unicode(obj);
442 #else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
448 #endif
450 /* Coerce object */
451 if (PyString_Check(obj)) {
452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
459 PyErr_Format(PyExc_TypeError,
460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
462 obj->ob_type->tp_name);
463 goto onError;
466 /* Convert to Unicode */
467 if (len == 0) {
468 Py_INCREF(unicode_empty);
469 v = (PyObject *)unicode_empty;
471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
474 if (owned) {
475 Py_DECREF(obj);
477 return v;
479 onError:
480 if (owned) {
481 Py_DECREF(obj);
483 return NULL;
486 PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
491 PyObject *buffer = NULL, *unicode;
493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
498 return PyUnicode_DecodeUTF8(s, size, errors);
499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
513 "decoder did not return an unicode object (type=%.400s)",
514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
518 Py_DECREF(buffer);
519 return unicode;
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
526 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
531 PyObject *v, *unicode;
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
541 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
545 PyObject *v;
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
558 return PyUnicode_AsUTF8String(unicode);
559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
572 "encoder did not return a string object (type=%.400s)",
573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
577 return v;
579 onError:
580 return NULL;
583 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
596 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
602 return PyUnicode_AS_UNICODE(unicode);
604 onError:
605 return NULL;
608 int PyUnicode_GetSize(PyObject *unicode)
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
614 return PyUnicode_GET_SIZE(unicode);
616 onError:
617 return -1;
620 const char *PyUnicode_GetDefaultEncoding(void)
622 return unicode_default_encoding;
625 int PyUnicode_SetDefaultEncoding(const char *encoding)
627 PyObject *v;
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
640 onError:
641 return -1;
644 /* --- UTF-7 Codec -------------------------------------------------------- */
646 /* see RFC2152 for details */
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
/* True if character c cannot be written directly in UTF-7 given the
   two option flags (see the utf7_special table for the classes). */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  The range test
   comes first because isalnum() is only defined for values that fit
   an unsigned char (or EOF), while c may be any Py_UNICODE value. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Map a base64 character back to its 6 bit value. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for as long as ch holds at least 6
   pending bits; bits is decremented accordingly. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16 bit characters to out for as long as ch holds at least 16
   pending bits.  Expects `errmsg` and the label `utf7Error` to exist
   in the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
702 static
703 int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
722 return 0;
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
732 PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
751 p = unicode->str;
752 e = s + size;
754 while (s < e) {
755 Py_UNICODE ch = *s;
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
806 inShift = 1;
807 bitsleft = 0;
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
815 else {
816 *p++ = ch;
817 s++;
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
833 return (PyObject *)unicode;
835 onError:
836 Py_DECREF(unicode);
837 return NULL;
841 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
917 else {
918 *out++ = '-';
919 inShift = 0;
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
934 return v;
937 #undef SPECIAL
938 #undef B64
939 #undef B64CHAR
940 #undef UB64
941 #undef ENCODE
942 #undef DECODE
944 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Sequence length lookup for UTF-8 lead bytes.  The table is only ever
   read, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
968 static
969 int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
977 "UTF-8 decoding error: %.400s",
978 details);
979 return -1;
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
991 else {
992 PyErr_Format(PyExc_ValueError,
993 "UTF-8 decoding error; unknown error handling code: %.400s",
994 errors);
995 return -1;
999 PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
1007 const char *errmsg = "";
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1021 while (s < e) {
1022 Py_UCS4 ch = (unsigned char)*s;
1024 if (ch < 0x80) {
1025 *p++ = (Py_UNICODE)ch;
1026 s++;
1027 continue;
1030 n = utf8_code_length[ch];
1032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1037 switch (n) {
1039 case 0:
1040 errmsg = "unexpected code byte";
1041 goto utf8Error;
1043 case 1:
1044 errmsg = "internal error";
1045 goto utf8Error;
1047 case 2:
1048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1057 else
1058 *p++ = (Py_UNICODE)ch;
1059 break;
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
1063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1068 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1069 errmsg = "illegal encoding";
1070 goto utf8Error;
1072 else
1073 *p++ = (Py_UNICODE)ch;
1074 break;
1076 case 4:
1077 if ((s[1] & 0xc0) != 0x80 ||
1078 (s[2] & 0xc0) != 0x80 ||
1079 (s[3] & 0xc0) != 0x80) {
1080 errmsg = "invalid data";
1081 goto utf8Error;
1083 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1084 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1085 /* validate and convert to UTF-16 */
1086 if ((ch < 0x10000) /* minimum value allowed for 4
1087 byte encoding */
1088 || (ch > 0x10ffff)) /* maximum value allowed for
1089 UTF-16 */
1091 errmsg = "illegal encoding";
1092 goto utf8Error;
1094 #ifdef Py_UNICODE_WIDE
1095 *p++ = (Py_UNICODE)ch;
1096 #else
1097 /* compute and append the two surrogates: */
1099 /* translate from 10000..10FFFF to 0..FFFF */
1100 ch -= 0x10000;
1102 /* high surrogate = top 10 bits added to D800 */
1103 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1105 /* low surrogate = bottom 10 bits added to DC00 */
1106 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1107 #endif
1108 break;
1110 default:
1111 /* Other sizes are only needed for UCS-4 */
1112 errmsg = "unsupported Unicode code range";
1113 goto utf8Error;
1115 s += n;
1116 continue;
1118 utf8Error:
1119 if (utf8_decoding_error(&s, &p, errors, errmsg))
1120 goto onError;
1123 /* Adjust length */
1124 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1125 goto onError;
1127 return (PyObject *)unicode;
1129 onError:
1130 Py_DECREF(unicode);
1131 return NULL;
/* No longer needed since the encoder handles UTF-16 surrogates;
   kept disabled for reference. */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        **dest = '?';
        (*dest)++;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1168 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169 int size,
1170 const char *errors)
1172 PyObject *v;
1173 char *p;
1174 char *q;
1175 Py_UCS4 ch2;
1176 unsigned int cbAllocated = 3 * size;
1177 int i = 0;
1179 v = PyString_FromStringAndSize(NULL, cbAllocated);
1180 if (v == NULL)
1181 return NULL;
1182 if (size == 0)
1183 return v;
1185 p = q = PyString_AS_STRING(v);
1186 while (i < size) {
1187 Py_UCS4 ch = s[i++];
1188 if (ch < 0x80)
1189 *p++ = (char) ch;
1191 else if (ch < 0x0800) {
1192 *p++ = 0xc0 | (ch >> 6);
1193 *p++ = 0x80 | (ch & 0x3f);
1196 else if (ch < 0x10000) {
1197 /* Check for high surrogate */
1198 if (0xD800 <= ch && ch <= 0xDBFF) {
1199 if (i != size) {
1200 ch2 = s[i];
1201 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1203 if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) {
1204 /* Provide enough room for some more
1205 surrogates */
1206 cbAllocated += 4*10;
1207 if (_PyString_Resize(&v, cbAllocated))
1208 goto onError;
1209 p = PyString_AS_STRING(v) + (p - q);
1210 q = PyString_AS_STRING(v);
1213 /* combine the two values */
1214 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1216 *p++ = (char)((ch >> 18) | 0xf0);
1217 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1218 i++;
1222 else
1223 *p++ = (char)(0xe0 | (ch >> 12));
1225 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1226 *p++ = (char)(0x80 | (ch & 0x3f));
1228 } else {
1229 if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) {
1230 /* Provide enough room for some more
1231 surrogates */
1232 cbAllocated += 4*10;
1233 if (_PyString_Resize(&v, cbAllocated))
1234 goto onError;
1235 p = PyString_AS_STRING(v) + (p - q);
1236 q = PyString_AS_STRING(v);
1239 *p++ = 0xf0 | (ch>>18);
1240 *p++ = 0x80 | ((ch>>12) & 0x3f);
1241 *p++ = 0x80 | ((ch>>6) & 0x3f);
1242 *p++ = 0x80 | (ch & 0x3f);
1245 *p = '\0';
1246 if (_PyString_Resize(&v, p - q))
1247 goto onError;
1248 return v;
1250 onError:
1251 Py_XDECREF(v);
1252 return NULL;
1255 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1257 if (!PyUnicode_Check(unicode)) {
1258 PyErr_BadArgument();
1259 return NULL;
1261 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1262 PyUnicode_GET_SIZE(unicode),
1263 NULL);
1266 /* --- UTF-16 Codec ------------------------------------------------------- */
1268 static
1269 int utf16_decoding_error(Py_UNICODE **dest,
1270 const char *errors,
1271 const char *details)
1273 if ((errors == NULL) ||
1274 (strcmp(errors,"strict") == 0)) {
1275 PyErr_Format(PyExc_UnicodeError,
1276 "UTF-16 decoding error: %.400s",
1277 details);
1278 return -1;
1280 else if (strcmp(errors,"ignore") == 0) {
1281 return 0;
1283 else if (strcmp(errors,"replace") == 0) {
1284 if (dest) {
1285 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1286 (*dest)++;
1288 return 0;
1290 else {
1291 PyErr_Format(PyExc_ValueError,
1292 "UTF-16 decoding error; "
1293 "unknown error handling code: %.400s",
1294 errors);
1295 return -1;
1299 PyObject *
1300 PyUnicode_DecodeUTF16(const char *s,
1301 int size,
1302 const char *errors,
1303 int *byteorder)
1305 PyUnicodeObject *unicode;
1306 Py_UNICODE *p;
1307 const unsigned char *q, *e;
1308 int bo = 0; /* assume native ordering by default */
1309 const char *errmsg = "";
1310 /* Offsets from q for retrieving byte pairs in the right order. */
1311 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1312 int ihi = 1, ilo = 0;
1313 #else
1314 int ihi = 0, ilo = 1;
1315 #endif
1317 /* size should be an even number */
1318 if (size & 1) {
1319 if (utf16_decoding_error(NULL, errors, "truncated data"))
1320 return NULL;
1321 --size; /* else ignore the oddball byte */
1324 /* Note: size will always be longer than the resulting Unicode
1325 character count */
1326 unicode = _PyUnicode_New(size);
1327 if (!unicode)
1328 return NULL;
1329 if (size == 0)
1330 return (PyObject *)unicode;
1332 /* Unpack UTF-16 encoded data */
1333 p = unicode->str;
1334 q = (unsigned char *)s;
1335 e = q + size;
1337 if (byteorder)
1338 bo = *byteorder;
1340 /* Check for BOM marks (U+FEFF) in the input and adjust current
1341 byte order setting accordingly. In native mode, the leading BOM
1342 mark is skipped, in all other modes, it is copied to the output
1343 stream as-is (giving a ZWNBSP character). */
1344 if (bo == 0) {
1345 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1346 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1347 if (bom == 0xFEFF) {
1348 q += 2;
1349 bo = -1;
1351 else if (bom == 0xFFFE) {
1352 q += 2;
1353 bo = 1;
1355 #else
1356 if (bom == 0xFEFF) {
1357 q += 2;
1358 bo = 1;
1360 else if (bom == 0xFFFE) {
1361 q += 2;
1362 bo = -1;
1364 #endif
1367 if (bo == -1) {
1368 /* force LE */
1369 ihi = 1;
1370 ilo = 0;
1372 else if (bo == 1) {
1373 /* force BE */
1374 ihi = 0;
1375 ilo = 1;
1378 while (q < e) {
1379 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1380 q += 2;
1382 if (ch < 0xD800 || ch > 0xDFFF) {
1383 *p++ = ch;
1384 continue;
1387 /* UTF-16 code pair: */
1388 if (q >= e) {
1389 errmsg = "unexpected end of data";
1390 goto utf16Error;
1392 if (0xD800 <= ch && ch <= 0xDBFF) {
1393 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1394 q += 2;
1395 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1396 #ifndef Py_UNICODE_WIDE
1397 *p++ = ch;
1398 *p++ = ch2;
1399 #else
1400 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1401 #endif
1402 continue;
1404 else {
1405 errmsg = "illegal UTF-16 surrogate";
1406 goto utf16Error;
1410 errmsg = "illegal encoding";
1411 /* Fall through to report the error */
1413 utf16Error:
1414 if (utf16_decoding_error(&p, errors, errmsg))
1415 goto onError;
1418 if (byteorder)
1419 *byteorder = bo;
1421 /* Adjust length */
1422 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1423 goto onError;
1425 return (PyObject *)unicode;
1427 onError:
1428 Py_DECREF(unicode);
1429 return NULL;
1432 PyObject *
1433 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1434 int size,
1435 const char *errors,
1436 int byteorder)
1438 PyObject *v;
1439 unsigned char *p;
1440 int i, pairs;
1441 /* Offsets from p for storing byte pairs in the right order. */
1442 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1443 int ihi = 1, ilo = 0;
1444 #else
1445 int ihi = 0, ilo = 1;
1446 #endif
1448 #define STORECHAR(CH) \
1449 do { \
1450 p[ihi] = ((CH) >> 8) & 0xff; \
1451 p[ilo] = (CH) & 0xff; \
1452 p += 2; \
1453 } while(0)
1455 for (i = pairs = 0; i < size; i++)
1456 if (s[i] >= 0x10000)
1457 pairs++;
1458 v = PyString_FromStringAndSize(NULL,
1459 2 * (size + pairs + (byteorder == 0)));
1460 if (v == NULL)
1461 return NULL;
1463 p = (unsigned char *)PyString_AS_STRING(v);
1464 if (byteorder == 0)
1465 STORECHAR(0xFEFF);
1466 if (size == 0)
1467 return v;
1469 if (byteorder == -1) {
1470 /* force LE */
1471 ihi = 1;
1472 ilo = 0;
1474 else if (byteorder == 1) {
1475 /* force BE */
1476 ihi = 0;
1477 ilo = 1;
1480 while (size-- > 0) {
1481 Py_UNICODE ch = *s++;
1482 Py_UNICODE ch2 = 0;
1483 if (ch >= 0x10000) {
1484 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1485 ch = 0xD800 | ((ch-0x10000) >> 10);
1487 STORECHAR(ch);
1488 if (ch2)
1489 STORECHAR(ch2);
1491 return v;
1492 #undef STORECHAR
1495 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1497 if (!PyUnicode_Check(unicode)) {
1498 PyErr_BadArgument();
1499 return NULL;
1501 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1502 PyUnicode_GET_SIZE(unicode),
1503 NULL,
1507 /* --- Unicode Escape Codec ----------------------------------------------- */
1509 static
1510 int unicodeescape_decoding_error(Py_UNICODE **x,
1511 const char *errors,
1512 const char *details)
1514 if ((errors == NULL) ||
1515 (strcmp(errors,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError,
1517 "Unicode-Escape decoding error: %.400s",
1518 details);
1519 return -1;
1521 else if (strcmp(errors,"ignore") == 0) {
1522 return 0;
1524 else if (strcmp(errors,"replace") == 0) {
1525 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1526 (*x)++;
1527 return 0;
1529 else {
1530 PyErr_Format(PyExc_ValueError,
1531 "Unicode-Escape decoding error; "
1532 "unknown error handling code: %.400s",
1533 errors);
1534 return -1;
1538 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1540 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1541 int size,
1542 const char *errors)
1544 PyUnicodeObject *v;
1545 Py_UNICODE *p, *buf;
1546 const char *end;
1547 char* message;
1548 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1550 /* Escaped strings will always be longer than the resulting
1551 Unicode string, so we start with size here and then reduce the
1552 length after conversion to the true value. */
1553 v = _PyUnicode_New(size);
1554 if (v == NULL)
1555 goto onError;
1556 if (size == 0)
1557 return (PyObject *)v;
1559 p = buf = PyUnicode_AS_UNICODE(v);
1560 end = s + size;
1562 while (s < end) {
1563 unsigned char c;
1564 Py_UNICODE x;
1565 int i, digits;
1567 /* Non-escape characters are interpreted as Unicode ordinals */
1568 if (*s != '\\') {
1569 *p++ = (unsigned char) *s++;
1570 continue;
1573 /* \ - Escapes */
1574 s++;
1575 switch (*s++) {
1577 /* \x escapes */
1578 case '\n': break;
1579 case '\\': *p++ = '\\'; break;
1580 case '\'': *p++ = '\''; break;
1581 case '\"': *p++ = '\"'; break;
1582 case 'b': *p++ = '\b'; break;
1583 case 'f': *p++ = '\014'; break; /* FF */
1584 case 't': *p++ = '\t'; break;
1585 case 'n': *p++ = '\n'; break;
1586 case 'r': *p++ = '\r'; break;
1587 case 'v': *p++ = '\013'; break; /* VT */
1588 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1590 /* \OOO (octal) escapes */
1591 case '0': case '1': case '2': case '3':
1592 case '4': case '5': case '6': case '7':
1593 x = s[-1] - '0';
1594 if ('0' <= *s && *s <= '7') {
1595 x = (x<<3) + *s++ - '0';
1596 if ('0' <= *s && *s <= '7')
1597 x = (x<<3) + *s++ - '0';
1599 *p++ = x;
1600 break;
1602 /* hex escapes */
1603 /* \xXX */
1604 case 'x':
1605 digits = 2;
1606 message = "truncated \\xXX escape";
1607 goto hexescape;
1609 /* \uXXXX */
1610 case 'u':
1611 digits = 4;
1612 message = "truncated \\uXXXX escape";
1613 goto hexescape;
1615 /* \UXXXXXXXX */
1616 case 'U':
1617 digits = 8;
1618 message = "truncated \\UXXXXXXXX escape";
1619 hexescape:
1620 chr = 0;
1621 for (i = 0; i < digits; i++) {
1622 c = (unsigned char) s[i];
1623 if (!isxdigit(c)) {
1624 if (unicodeescape_decoding_error(&p, errors, message))
1625 goto onError;
1626 chr = 0xffffffff;
1627 i++;
1628 break;
1630 chr = (chr<<4) & ~0xF;
1631 if (c >= '0' && c <= '9')
1632 chr += c - '0';
1633 else if (c >= 'a' && c <= 'f')
1634 chr += 10 + c - 'a';
1635 else
1636 chr += 10 + c - 'A';
1638 s += i;
1639 if (chr == 0xffffffff)
1640 /* _decoding_error will have already written into the
1641 target buffer. */
1642 break;
1643 store:
1644 /* when we get here, chr is a 32-bit unicode character */
1645 if (chr <= 0xffff)
1646 /* UCS-2 character */
1647 *p++ = (Py_UNICODE) chr;
1648 else if (chr <= 0x10ffff) {
1649 /* UCS-4 character. Either store directly, or as
1650 surrogate pair. */
1651 #ifdef Py_UNICODE_WIDE
1652 *p++ = chr;
1653 #else
1654 chr -= 0x10000L;
1655 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1656 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1657 #endif
1658 } else {
1659 if (unicodeescape_decoding_error(
1660 &p, errors,
1661 "illegal Unicode character")
1663 goto onError;
1665 break;
1667 /* \N{name} */
1668 case 'N':
1669 message = "malformed \\N character escape";
1670 if (ucnhash_CAPI == NULL) {
1671 /* load the unicode data module */
1672 PyObject *m, *v;
1673 m = PyImport_ImportModule("unicodedata");
1674 if (m == NULL)
1675 goto ucnhashError;
1676 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1677 Py_DECREF(m);
1678 if (v == NULL)
1679 goto ucnhashError;
1680 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1681 Py_DECREF(v);
1682 if (ucnhash_CAPI == NULL)
1683 goto ucnhashError;
1685 if (*s == '{') {
1686 const char *start = s+1;
1687 /* look for the closing brace */
1688 while (*s != '}' && s < end)
1689 s++;
1690 if (s > start && s < end && *s == '}') {
1691 /* found a name. look it up in the unicode database */
1692 message = "unknown Unicode character name";
1693 s++;
1694 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1695 goto store;
1698 if (unicodeescape_decoding_error(&p, errors, message))
1699 goto onError;
1700 break;
1702 default:
1703 if (s > end) {
1704 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1705 goto onError;
1707 else {
1708 *p++ = '\\';
1709 *p++ = (unsigned char)s[-1];
1711 break;
1714 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1715 goto onError;
1716 return (PyObject *)v;
1718 ucnhashError:
1719 PyErr_SetString(
1720 PyExc_UnicodeError,
1721 "\\N escapes not supported (can't load unicodedata module)"
1723 return NULL;
1725 onError:
1726 Py_XDECREF(v);
1727 return NULL;
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
1737 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1738 int size,
1739 Py_UNICODE ch);
1741 static
1742 PyObject *unicodeescape_string(const Py_UNICODE *s,
1743 int size,
1744 int quotes)
1746 PyObject *repr;
1747 char *p;
1749 static const char *hexdigit = "0123456789abcdef";
1751 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1752 if (repr == NULL)
1753 return NULL;
1755 p = PyString_AS_STRING(repr);
1757 if (quotes) {
1758 *p++ = 'u';
1759 *p++ = (findchar(s, size, '\'') &&
1760 !findchar(s, size, '"')) ? '"' : '\'';
1762 while (size-- > 0) {
1763 Py_UNICODE ch = *s++;
1765 /* Escape quotes */
1766 if (quotes &&
1767 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1768 *p++ = '\\';
1769 *p++ = (char) ch;
1770 continue;
1773 #ifdef Py_UNICODE_WIDE
1774 /* Map 21-bit characters to '\U00xxxxxx' */
1775 else if (ch >= 0x10000) {
1776 int offset = p - PyString_AS_STRING(repr);
1778 /* Resize the string if necessary */
1779 if (offset + 12 > PyString_GET_SIZE(repr)) {
1780 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1781 goto onError;
1782 p = PyString_AS_STRING(repr) + offset;
1785 *p++ = '\\';
1786 *p++ = 'U';
1787 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1788 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1789 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1790 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1791 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1792 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1793 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1794 *p++ = hexdigit[ch & 0x0000000F];
1795 continue;
1797 #endif
1798 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1799 else if (ch >= 0xD800 && ch < 0xDC00) {
1800 Py_UNICODE ch2;
1801 Py_UCS4 ucs;
1803 ch2 = *s++;
1804 size--;
1805 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1806 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1807 *p++ = '\\';
1808 *p++ = 'U';
1809 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1810 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1811 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1812 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1813 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1814 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1815 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1816 *p++ = hexdigit[ucs & 0x0000000F];
1817 continue;
1819 /* Fall through: isolated surrogates are copied as-is */
1820 s--;
1821 size++;
1824 /* Map 16-bit characters to '\uxxxx' */
1825 if (ch >= 256) {
1826 *p++ = '\\';
1827 *p++ = 'u';
1828 *p++ = hexdigit[(ch >> 12) & 0x000F];
1829 *p++ = hexdigit[(ch >> 8) & 0x000F];
1830 *p++ = hexdigit[(ch >> 4) & 0x000F];
1831 *p++ = hexdigit[ch & 0x000F];
1834 /* Map special whitespace to '\t', \n', '\r' */
1835 else if (ch == '\t') {
1836 *p++ = '\\';
1837 *p++ = 't';
1839 else if (ch == '\n') {
1840 *p++ = '\\';
1841 *p++ = 'n';
1843 else if (ch == '\r') {
1844 *p++ = '\\';
1845 *p++ = 'r';
1848 /* Map non-printable US ASCII to '\xhh' */
1849 else if (ch < ' ' || ch >= 0x7F) {
1850 *p++ = '\\';
1851 *p++ = 'x';
1852 *p++ = hexdigit[(ch >> 4) & 0x000F];
1853 *p++ = hexdigit[ch & 0x000F];
1856 /* Copy everything else as-is */
1857 else
1858 *p++ = (char) ch;
1860 if (quotes)
1861 *p++ = PyString_AS_STRING(repr)[1];
1863 *p = '\0';
1864 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1865 goto onError;
1867 return repr;
1869 onError:
1870 Py_DECREF(repr);
1871 return NULL;
1874 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1875 int size)
1877 return unicodeescape_string(s, size, 0);
1880 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1882 if (!PyUnicode_Check(unicode)) {
1883 PyErr_BadArgument();
1884 return NULL;
1886 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1887 PyUnicode_GET_SIZE(unicode));
1890 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1892 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1893 int size,
1894 const char *errors)
1896 PyUnicodeObject *v;
1897 Py_UNICODE *p, *buf;
1898 const char *end;
1899 const char *bs;
1901 /* Escaped strings will always be longer than the resulting
1902 Unicode string, so we start with size here and then reduce the
1903 length after conversion to the true value. */
1904 v = _PyUnicode_New(size);
1905 if (v == NULL)
1906 goto onError;
1907 if (size == 0)
1908 return (PyObject *)v;
1909 p = buf = PyUnicode_AS_UNICODE(v);
1910 end = s + size;
1911 while (s < end) {
1912 unsigned char c;
1913 Py_UCS4 x;
1914 int i;
1916 /* Non-escape characters are interpreted as Unicode ordinals */
1917 if (*s != '\\') {
1918 *p++ = (unsigned char)*s++;
1919 continue;
1922 /* \u-escapes are only interpreted iff the number of leading
1923 backslashes if odd */
1924 bs = s;
1925 for (;s < end;) {
1926 if (*s != '\\')
1927 break;
1928 *p++ = (unsigned char)*s++;
1930 if (((s - bs) & 1) == 0 ||
1931 s >= end ||
1932 *s != 'u') {
1933 continue;
1935 p--;
1936 s++;
1938 /* \uXXXX with 4 hex digits */
1939 for (x = 0, i = 0; i < 4; i++) {
1940 c = (unsigned char)s[i];
1941 if (!isxdigit(c)) {
1942 if (unicodeescape_decoding_error(&p, errors,
1943 "truncated \\uXXXX"))
1944 goto onError;
1945 x = 0xffffffff;
1946 i++;
1947 break;
1949 x = (x<<4) & ~0xF;
1950 if (c >= '0' && c <= '9')
1951 x += c - '0';
1952 else if (c >= 'a' && c <= 'f')
1953 x += 10 + c - 'a';
1954 else
1955 x += 10 + c - 'A';
1957 s += i;
1958 if (x != 0xffffffff)
1959 *p++ = x;
1961 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1962 goto onError;
1963 return (PyObject *)v;
1965 onError:
1966 Py_XDECREF(v);
1967 return NULL;
1970 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1971 int size)
1973 PyObject *repr;
1974 char *p;
1975 char *q;
1977 static const char *hexdigit = "0123456789abcdef";
1979 repr = PyString_FromStringAndSize(NULL, 6 * size);
1980 if (repr == NULL)
1981 return NULL;
1982 if (size == 0)
1983 return repr;
1985 p = q = PyString_AS_STRING(repr);
1986 while (size-- > 0) {
1987 Py_UNICODE ch = *s++;
1988 /* Map 16-bit characters to '\uxxxx' */
1989 if (ch >= 256) {
1990 *p++ = '\\';
1991 *p++ = 'u';
1992 *p++ = hexdigit[(ch >> 12) & 0xf];
1993 *p++ = hexdigit[(ch >> 8) & 0xf];
1994 *p++ = hexdigit[(ch >> 4) & 0xf];
1995 *p++ = hexdigit[ch & 15];
1997 /* Copy everything else as-is */
1998 else
1999 *p++ = (char) ch;
2001 *p = '\0';
2002 if (_PyString_Resize(&repr, p - q))
2003 goto onError;
2005 return repr;
2007 onError:
2008 Py_DECREF(repr);
2009 return NULL;
2012 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2014 if (!PyUnicode_Check(unicode)) {
2015 PyErr_BadArgument();
2016 return NULL;
2018 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2019 PyUnicode_GET_SIZE(unicode));
2022 /* --- Latin-1 Codec ------------------------------------------------------ */
2024 PyObject *PyUnicode_DecodeLatin1(const char *s,
2025 int size,
2026 const char *errors)
2028 PyUnicodeObject *v;
2029 Py_UNICODE *p;
2031 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2032 if (size == 1 && *(unsigned char*)s < 256) {
2033 Py_UNICODE r = *(unsigned char*)s;
2034 return PyUnicode_FromUnicode(&r, 1);
2037 v = _PyUnicode_New(size);
2038 if (v == NULL)
2039 goto onError;
2040 if (size == 0)
2041 return (PyObject *)v;
2042 p = PyUnicode_AS_UNICODE(v);
2043 while (size-- > 0)
2044 *p++ = (unsigned char)*s++;
2045 return (PyObject *)v;
2047 onError:
2048 Py_XDECREF(v);
2049 return NULL;
2052 static
2053 int latin1_encoding_error(const Py_UNICODE **source,
2054 char **dest,
2055 const char *errors,
2056 const char *details)
2058 if ((errors == NULL) ||
2059 (strcmp(errors,"strict") == 0)) {
2060 PyErr_Format(PyExc_UnicodeError,
2061 "Latin-1 encoding error: %.400s",
2062 details);
2063 return -1;
2065 else if (strcmp(errors,"ignore") == 0) {
2066 return 0;
2068 else if (strcmp(errors,"replace") == 0) {
2069 **dest = '?';
2070 (*dest)++;
2071 return 0;
2073 else {
2074 PyErr_Format(PyExc_ValueError,
2075 "Latin-1 encoding error; "
2076 "unknown error handling code: %.400s",
2077 errors);
2078 return -1;
2082 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2083 int size,
2084 const char *errors)
2086 PyObject *repr;
2087 char *s, *start;
2089 repr = PyString_FromStringAndSize(NULL, size);
2090 if (repr == NULL)
2091 return NULL;
2092 if (size == 0)
2093 return repr;
2095 s = PyString_AS_STRING(repr);
2096 start = s;
2097 while (size-- > 0) {
2098 Py_UNICODE ch = *p++;
2099 if (ch >= 256) {
2100 if (latin1_encoding_error(&p, &s, errors,
2101 "ordinal not in range(256)"))
2102 goto onError;
2104 else
2105 *s++ = (char)ch;
2107 /* Resize if error handling skipped some characters */
2108 if (s - start < PyString_GET_SIZE(repr))
2109 if (_PyString_Resize(&repr, s - start))
2110 goto onError;
2111 return repr;
2113 onError:
2114 Py_DECREF(repr);
2115 return NULL;
2118 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2120 if (!PyUnicode_Check(unicode)) {
2121 PyErr_BadArgument();
2122 return NULL;
2124 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2125 PyUnicode_GET_SIZE(unicode),
2126 NULL);
2129 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2131 static
2132 int ascii_decoding_error(const char **source,
2133 Py_UNICODE **dest,
2134 const char *errors,
2135 const char *details)
2137 if ((errors == NULL) ||
2138 (strcmp(errors,"strict") == 0)) {
2139 PyErr_Format(PyExc_UnicodeError,
2140 "ASCII decoding error: %.400s",
2141 details);
2142 return -1;
2144 else if (strcmp(errors,"ignore") == 0) {
2145 return 0;
2147 else if (strcmp(errors,"replace") == 0) {
2148 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2149 (*dest)++;
2150 return 0;
2152 else {
2153 PyErr_Format(PyExc_ValueError,
2154 "ASCII decoding error; "
2155 "unknown error handling code: %.400s",
2156 errors);
2157 return -1;
2161 PyObject *PyUnicode_DecodeASCII(const char *s,
2162 int size,
2163 const char *errors)
2165 PyUnicodeObject *v;
2166 Py_UNICODE *p;
2168 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2169 if (size == 1 && *(unsigned char*)s < 128) {
2170 Py_UNICODE r = *(unsigned char*)s;
2171 return PyUnicode_FromUnicode(&r, 1);
2174 v = _PyUnicode_New(size);
2175 if (v == NULL)
2176 goto onError;
2177 if (size == 0)
2178 return (PyObject *)v;
2179 p = PyUnicode_AS_UNICODE(v);
2180 while (size-- > 0) {
2181 register unsigned char c;
2183 c = (unsigned char)*s++;
2184 if (c < 128)
2185 *p++ = c;
2186 else if (ascii_decoding_error(&s, &p, errors,
2187 "ordinal not in range(128)"))
2188 goto onError;
2190 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2191 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2192 goto onError;
2193 return (PyObject *)v;
2195 onError:
2196 Py_XDECREF(v);
2197 return NULL;
2200 static
2201 int ascii_encoding_error(const Py_UNICODE **source,
2202 char **dest,
2203 const char *errors,
2204 const char *details)
2206 if ((errors == NULL) ||
2207 (strcmp(errors,"strict") == 0)) {
2208 PyErr_Format(PyExc_UnicodeError,
2209 "ASCII encoding error: %.400s",
2210 details);
2211 return -1;
2213 else if (strcmp(errors,"ignore") == 0) {
2214 return 0;
2216 else if (strcmp(errors,"replace") == 0) {
2217 **dest = '?';
2218 (*dest)++;
2219 return 0;
2221 else {
2222 PyErr_Format(PyExc_ValueError,
2223 "ASCII encoding error; "
2224 "unknown error handling code: %.400s",
2225 errors);
2226 return -1;
2230 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2231 int size,
2232 const char *errors)
2234 PyObject *repr;
2235 char *s, *start;
2237 repr = PyString_FromStringAndSize(NULL, size);
2238 if (repr == NULL)
2239 return NULL;
2240 if (size == 0)
2241 return repr;
2243 s = PyString_AS_STRING(repr);
2244 start = s;
2245 while (size-- > 0) {
2246 Py_UNICODE ch = *p++;
2247 if (ch >= 128) {
2248 if (ascii_encoding_error(&p, &s, errors,
2249 "ordinal not in range(128)"))
2250 goto onError;
2252 else
2253 *s++ = (char)ch;
2255 /* Resize if error handling skipped some characters */
2256 if (s - start < PyString_GET_SIZE(repr))
2257 if (_PyString_Resize(&repr, s - start))
2258 goto onError;
2259 return repr;
2261 onError:
2262 Py_DECREF(repr);
2263 return NULL;
2266 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2268 if (!PyUnicode_Check(unicode)) {
2269 PyErr_BadArgument();
2270 return NULL;
2272 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2273 PyUnicode_GET_SIZE(unicode),
2274 NULL);
2277 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2279 /* --- MBCS codecs for Windows -------------------------------------------- */
2281 PyObject *PyUnicode_DecodeMBCS(const char *s,
2282 int size,
2283 const char *errors)
2285 PyUnicodeObject *v;
2286 Py_UNICODE *p;
2288 /* First get the size of the result */
2289 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2290 if (size > 0 && usize==0)
2291 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2293 v = _PyUnicode_New(usize);
2294 if (v == NULL)
2295 return NULL;
2296 if (usize == 0)
2297 return (PyObject *)v;
2298 p = PyUnicode_AS_UNICODE(v);
2299 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2300 Py_DECREF(v);
2301 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2304 return (PyObject *)v;
2307 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2308 int size,
2309 const char *errors)
2311 PyObject *repr;
2312 char *s;
2313 DWORD mbcssize;
2315 /* If there are no characters, bail now! */
2316 if (size==0)
2317 return PyString_FromString("");
2319 /* First get the size of the result */
2320 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2321 if (mbcssize==0)
2322 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2324 repr = PyString_FromStringAndSize(NULL, mbcssize);
2325 if (repr == NULL)
2326 return NULL;
2327 if (mbcssize == 0)
2328 return repr;
2330 /* Do the conversion */
2331 s = PyString_AS_STRING(repr);
2332 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2333 Py_DECREF(repr);
2334 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2336 return repr;
2339 #endif /* MS_WIN32 */
2341 /* --- Character Mapping Codec -------------------------------------------- */
2343 static
2344 int charmap_decoding_error(const char **source,
2345 Py_UNICODE **dest,
2346 const char *errors,
2347 const char *details)
2349 if ((errors == NULL) ||
2350 (strcmp(errors,"strict") == 0)) {
2351 PyErr_Format(PyExc_UnicodeError,
2352 "charmap decoding error: %.400s",
2353 details);
2354 return -1;
2356 else if (strcmp(errors,"ignore") == 0) {
2357 return 0;
2359 else if (strcmp(errors,"replace") == 0) {
2360 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2361 (*dest)++;
2362 return 0;
2364 else {
2365 PyErr_Format(PyExc_ValueError,
2366 "charmap decoding error; "
2367 "unknown error handling code: %.400s",
2368 errors);
2369 return -1;
2373 PyObject *PyUnicode_DecodeCharmap(const char *s,
2374 int size,
2375 PyObject *mapping,
2376 const char *errors)
2378 PyUnicodeObject *v;
2379 Py_UNICODE *p;
2380 int extrachars = 0;
2382 /* Default to Latin-1 */
2383 if (mapping == NULL)
2384 return PyUnicode_DecodeLatin1(s, size, errors);
2386 v = _PyUnicode_New(size);
2387 if (v == NULL)
2388 goto onError;
2389 if (size == 0)
2390 return (PyObject *)v;
2391 p = PyUnicode_AS_UNICODE(v);
2392 while (size-- > 0) {
2393 unsigned char ch = *s++;
2394 PyObject *w, *x;
2396 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2397 w = PyInt_FromLong((long)ch);
2398 if (w == NULL)
2399 goto onError;
2400 x = PyObject_GetItem(mapping, w);
2401 Py_DECREF(w);
2402 if (x == NULL) {
2403 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2404 /* No mapping found means: mapping is undefined. */
2405 PyErr_Clear();
2406 x = Py_None;
2407 Py_INCREF(x);
2408 } else
2409 goto onError;
2412 /* Apply mapping */
2413 if (PyInt_Check(x)) {
2414 long value = PyInt_AS_LONG(x);
2415 if (value < 0 || value > 65535) {
2416 PyErr_SetString(PyExc_TypeError,
2417 "character mapping must be in range(65536)");
2418 Py_DECREF(x);
2419 goto onError;
2421 *p++ = (Py_UNICODE)value;
2423 else if (x == Py_None) {
2424 /* undefined mapping */
2425 if (charmap_decoding_error(&s, &p, errors,
2426 "character maps to <undefined>")) {
2427 Py_DECREF(x);
2428 goto onError;
2431 else if (PyUnicode_Check(x)) {
2432 int targetsize = PyUnicode_GET_SIZE(x);
2434 if (targetsize == 1)
2435 /* 1-1 mapping */
2436 *p++ = *PyUnicode_AS_UNICODE(x);
2438 else if (targetsize > 1) {
2439 /* 1-n mapping */
2440 if (targetsize > extrachars) {
2441 /* resize first */
2442 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2443 int needed = (targetsize - extrachars) + \
2444 (targetsize << 2);
2445 extrachars += needed;
2446 if (_PyUnicode_Resize(&v,
2447 PyUnicode_GET_SIZE(v) + needed)) {
2448 Py_DECREF(x);
2449 goto onError;
2451 p = PyUnicode_AS_UNICODE(v) + oldpos;
2453 Py_UNICODE_COPY(p,
2454 PyUnicode_AS_UNICODE(x),
2455 targetsize);
2456 p += targetsize;
2457 extrachars -= targetsize;
2459 /* 1-0 mapping: skip the character */
2461 else {
2462 /* wrong return value */
2463 PyErr_SetString(PyExc_TypeError,
2464 "character mapping must return integer, None or unicode");
2465 Py_DECREF(x);
2466 goto onError;
2468 Py_DECREF(x);
2470 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2471 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2472 goto onError;
2473 return (PyObject *)v;
2475 onError:
2476 Py_XDECREF(v);
2477 return NULL;
2480 static
2481 int charmap_encoding_error(const Py_UNICODE **source,
2482 char **dest,
2483 const char *errors,
2484 const char *details)
2486 if ((errors == NULL) ||
2487 (strcmp(errors,"strict") == 0)) {
2488 PyErr_Format(PyExc_UnicodeError,
2489 "charmap encoding error: %.400s",
2490 details);
2491 return -1;
2493 else if (strcmp(errors,"ignore") == 0) {
2494 return 0;
2496 else if (strcmp(errors,"replace") == 0) {
2497 **dest = '?';
2498 (*dest)++;
2499 return 0;
2501 else {
2502 PyErr_Format(PyExc_ValueError,
2503 "charmap encoding error; "
2504 "unknown error handling code: %.400s",
2505 errors);
2506 return -1;
2510 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2511 int size,
2512 PyObject *mapping,
2513 const char *errors)
2515 PyObject *v;
2516 char *s;
2517 int extrachars = 0;
2519 /* Default to Latin-1 */
2520 if (mapping == NULL)
2521 return PyUnicode_EncodeLatin1(p, size, errors);
2523 v = PyString_FromStringAndSize(NULL, size);
2524 if (v == NULL)
2525 return NULL;
2526 if (size == 0)
2527 return v;
2528 s = PyString_AS_STRING(v);
2529 while (size-- > 0) {
2530 Py_UNICODE ch = *p++;
2531 PyObject *w, *x;
2533 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2534 w = PyInt_FromLong((long)ch);
2535 if (w == NULL)
2536 goto onError;
2537 x = PyObject_GetItem(mapping, w);
2538 Py_DECREF(w);
2539 if (x == NULL) {
2540 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2541 /* No mapping found means: mapping is undefined. */
2542 PyErr_Clear();
2543 x = Py_None;
2544 Py_INCREF(x);
2545 } else
2546 goto onError;
2549 /* Apply mapping */
2550 if (PyInt_Check(x)) {
2551 long value = PyInt_AS_LONG(x);
2552 if (value < 0 || value > 255) {
2553 PyErr_SetString(PyExc_TypeError,
2554 "character mapping must be in range(256)");
2555 Py_DECREF(x);
2556 goto onError;
2558 *s++ = (char)value;
2560 else if (x == Py_None) {
2561 /* undefined mapping */
2562 if (charmap_encoding_error(&p, &s, errors,
2563 "character maps to <undefined>")) {
2564 Py_DECREF(x);
2565 goto onError;
2568 else if (PyString_Check(x)) {
2569 int targetsize = PyString_GET_SIZE(x);
2571 if (targetsize == 1)
2572 /* 1-1 mapping */
2573 *s++ = *PyString_AS_STRING(x);
2575 else if (targetsize > 1) {
2576 /* 1-n mapping */
2577 if (targetsize > extrachars) {
2578 /* resize first */
2579 int oldpos = (int)(s - PyString_AS_STRING(v));
2580 int needed = (targetsize - extrachars) + \
2581 (targetsize << 2);
2582 extrachars += needed;
2583 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2584 Py_DECREF(x);
2585 goto onError;
2587 s = PyString_AS_STRING(v) + oldpos;
2589 memcpy(s, PyString_AS_STRING(x), targetsize);
2590 s += targetsize;
2591 extrachars -= targetsize;
2593 /* 1-0 mapping: skip the character */
2595 else {
2596 /* wrong return value */
2597 PyErr_SetString(PyExc_TypeError,
2598 "character mapping must return integer, None or unicode");
2599 Py_DECREF(x);
2600 goto onError;
2602 Py_DECREF(x);
2604 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2605 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2606 goto onError;
2607 return v;
2609 onError:
2610 Py_DECREF(v);
2611 return NULL;
2614 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2615 PyObject *mapping)
2617 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2618 PyErr_BadArgument();
2619 return NULL;
2621 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2622 PyUnicode_GET_SIZE(unicode),
2623 mapping,
2624 NULL);
2627 static
2628 int translate_error(const Py_UNICODE **source,
2629 Py_UNICODE **dest,
2630 const char *errors,
2631 const char *details)
2633 if ((errors == NULL) ||
2634 (strcmp(errors,"strict") == 0)) {
2635 PyErr_Format(PyExc_UnicodeError,
2636 "translate error: %.400s",
2637 details);
2638 return -1;
2640 else if (strcmp(errors,"ignore") == 0) {
2641 return 0;
2643 else if (strcmp(errors,"replace") == 0) {
2644 **dest = '?';
2645 (*dest)++;
2646 return 0;
2648 else {
2649 PyErr_Format(PyExc_ValueError,
2650 "translate error; "
2651 "unknown error handling code: %.400s",
2652 errors);
2653 return -1;
2657 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2658 int size,
2659 PyObject *mapping,
2660 const char *errors)
2662 PyUnicodeObject *v;
2663 Py_UNICODE *p;
2665 if (mapping == NULL) {
2666 PyErr_BadArgument();
2667 return NULL;
2670 /* Output will never be longer than input */
2671 v = _PyUnicode_New(size);
2672 if (v == NULL)
2673 goto onError;
2674 if (size == 0)
2675 goto done;
2676 p = PyUnicode_AS_UNICODE(v);
2677 while (size-- > 0) {
2678 Py_UNICODE ch = *s++;
2679 PyObject *w, *x;
2681 /* Get mapping */
2682 w = PyInt_FromLong(ch);
2683 if (w == NULL)
2684 goto onError;
2685 x = PyObject_GetItem(mapping, w);
2686 Py_DECREF(w);
2687 if (x == NULL) {
2688 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2689 /* No mapping found: default to 1-1 mapping */
2690 PyErr_Clear();
2691 *p++ = ch;
2692 continue;
2694 goto onError;
2697 /* Apply mapping */
2698 if (PyInt_Check(x))
2699 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2700 else if (x == Py_None) {
2701 /* undefined mapping */
2702 if (translate_error(&s, &p, errors,
2703 "character maps to <undefined>")) {
2704 Py_DECREF(x);
2705 goto onError;
2708 else if (PyUnicode_Check(x)) {
2709 if (PyUnicode_GET_SIZE(x) != 1) {
2710 /* 1-n mapping */
2711 PyErr_SetString(PyExc_NotImplementedError,
2712 "1-n mappings are currently not implemented");
2713 Py_DECREF(x);
2714 goto onError;
2716 *p++ = *PyUnicode_AS_UNICODE(x);
2718 else {
2719 /* wrong return value */
2720 PyErr_SetString(PyExc_TypeError,
2721 "translate mapping must return integer, None or unicode");
2722 Py_DECREF(x);
2723 goto onError;
2725 Py_DECREF(x);
2727 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2728 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2729 goto onError;
2731 done:
2732 return (PyObject *)v;
2734 onError:
2735 Py_XDECREF(v);
2736 return NULL;
2739 PyObject *PyUnicode_Translate(PyObject *str,
2740 PyObject *mapping,
2741 const char *errors)
2743 PyObject *result;
2745 str = PyUnicode_FromObject(str);
2746 if (str == NULL)
2747 goto onError;
2748 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2749 PyUnicode_GET_SIZE(str),
2750 mapping,
2751 errors);
2752 Py_DECREF(str);
2753 return result;
2755 onError:
2756 Py_XDECREF(str);
2757 return NULL;
2760 /* --- Decimal Encoder ---------------------------------------------------- */
2762 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2763 int length,
2764 char *output,
2765 const char *errors)
2767 Py_UNICODE *p, *end;
2769 if (output == NULL) {
2770 PyErr_BadArgument();
2771 return -1;
2774 p = s;
2775 end = s + length;
2776 while (p < end) {
2777 register Py_UNICODE ch = *p++;
2778 int decimal;
2780 if (Py_UNICODE_ISSPACE(ch)) {
2781 *output++ = ' ';
2782 continue;
2784 decimal = Py_UNICODE_TODECIMAL(ch);
2785 if (decimal >= 0) {
2786 *output++ = '0' + decimal;
2787 continue;
2789 if (0 < ch && ch < 256) {
2790 *output++ = (char)ch;
2791 continue;
2793 /* All other characters are considered invalid */
2794 if (errors == NULL || strcmp(errors, "strict") == 0) {
2795 PyErr_SetString(PyExc_ValueError,
2796 "invalid decimal Unicode string");
2797 goto onError;
2799 else if (strcmp(errors, "ignore") == 0)
2800 continue;
2801 else if (strcmp(errors, "replace") == 0) {
2802 *output++ = '?';
2803 continue;
2806 /* 0-terminate the output string */
2807 *output++ = '\0';
2808 return 0;
2810 onError:
2811 return -1;
2814 /* --- Helpers ------------------------------------------------------------ */
2816 static
2817 int count(PyUnicodeObject *self,
2818 int start,
2819 int end,
2820 PyUnicodeObject *substring)
2822 int count = 0;
2824 if (start < 0)
2825 start += self->length;
2826 if (start < 0)
2827 start = 0;
2828 if (end > self->length)
2829 end = self->length;
2830 if (end < 0)
2831 end += self->length;
2832 if (end < 0)
2833 end = 0;
2835 if (substring->length == 0)
2836 return (end - start + 1);
2838 end -= substring->length;
2840 while (start <= end)
2841 if (Py_UNICODE_MATCH(self, start, substring)) {
2842 count++;
2843 start += substring->length;
2844 } else
2845 start++;
2847 return count;
2850 int PyUnicode_Count(PyObject *str,
2851 PyObject *substr,
2852 int start,
2853 int end)
2855 int result;
2857 str = PyUnicode_FromObject(str);
2858 if (str == NULL)
2859 return -1;
2860 substr = PyUnicode_FromObject(substr);
2861 if (substr == NULL) {
2862 Py_DECREF(str);
2863 return -1;
2866 result = count((PyUnicodeObject *)str,
2867 start, end,
2868 (PyUnicodeObject *)substr);
2870 Py_DECREF(str);
2871 Py_DECREF(substr);
2872 return result;
2875 static
2876 int findstring(PyUnicodeObject *self,
2877 PyUnicodeObject *substring,
2878 int start,
2879 int end,
2880 int direction)
2882 if (start < 0)
2883 start += self->length;
2884 if (start < 0)
2885 start = 0;
2887 if (substring->length == 0)
2888 return start;
2890 if (end > self->length)
2891 end = self->length;
2892 if (end < 0)
2893 end += self->length;
2894 if (end < 0)
2895 end = 0;
2897 end -= substring->length;
2899 if (direction < 0) {
2900 for (; end >= start; end--)
2901 if (Py_UNICODE_MATCH(self, end, substring))
2902 return end;
2903 } else {
2904 for (; start <= end; start++)
2905 if (Py_UNICODE_MATCH(self, start, substring))
2906 return start;
2909 return -1;
2912 int PyUnicode_Find(PyObject *str,
2913 PyObject *substr,
2914 int start,
2915 int end,
2916 int direction)
2918 int result;
2920 str = PyUnicode_FromObject(str);
2921 if (str == NULL)
2922 return -1;
2923 substr = PyUnicode_FromObject(substr);
2924 if (substr == NULL) {
2925 Py_DECREF(substr);
2926 return -1;
2929 result = findstring((PyUnicodeObject *)str,
2930 (PyUnicodeObject *)substr,
2931 start, end, direction);
2932 Py_DECREF(str);
2933 Py_DECREF(substr);
2934 return result;
2937 static
2938 int tailmatch(PyUnicodeObject *self,
2939 PyUnicodeObject *substring,
2940 int start,
2941 int end,
2942 int direction)
2944 if (start < 0)
2945 start += self->length;
2946 if (start < 0)
2947 start = 0;
2949 if (substring->length == 0)
2950 return 1;
2952 if (end > self->length)
2953 end = self->length;
2954 if (end < 0)
2955 end += self->length;
2956 if (end < 0)
2957 end = 0;
2959 end -= substring->length;
2960 if (end < start)
2961 return 0;
2963 if (direction > 0) {
2964 if (Py_UNICODE_MATCH(self, end, substring))
2965 return 1;
2966 } else {
2967 if (Py_UNICODE_MATCH(self, start, substring))
2968 return 1;
2971 return 0;
2974 int PyUnicode_Tailmatch(PyObject *str,
2975 PyObject *substr,
2976 int start,
2977 int end,
2978 int direction)
2980 int result;
2982 str = PyUnicode_FromObject(str);
2983 if (str == NULL)
2984 return -1;
2985 substr = PyUnicode_FromObject(substr);
2986 if (substr == NULL) {
2987 Py_DECREF(substr);
2988 return -1;
2991 result = tailmatch((PyUnicodeObject *)str,
2992 (PyUnicodeObject *)substr,
2993 start, end, direction);
2994 Py_DECREF(str);
2995 Py_DECREF(substr);
2996 return result;
2999 static
3000 const Py_UNICODE *findchar(const Py_UNICODE *s,
3001 int size,
3002 Py_UNICODE ch)
3004 /* like wcschr, but doesn't stop at NULL characters */
3006 while (size-- > 0) {
3007 if (*s == ch)
3008 return s;
3009 s++;
3012 return NULL;
3015 /* Apply fixfct filter to the Unicode object self and return a
3016 reference to the modified object */
3018 static
3019 PyObject *fixup(PyUnicodeObject *self,
3020 int (*fixfct)(PyUnicodeObject *s))
3023 PyUnicodeObject *u;
3025 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3026 if (u == NULL)
3027 return NULL;
3029 Py_UNICODE_COPY(u->str, self->str, self->length);
3031 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3032 /* fixfct should return TRUE if it modified the buffer. If
3033 FALSE, return a reference to the original buffer instead
3034 (to save space, not time) */
3035 Py_INCREF(self);
3036 Py_DECREF(u);
3037 return (PyObject*) self;
3039 return (PyObject*) u;
3042 static
3043 int fixupper(PyUnicodeObject *self)
3045 int len = self->length;
3046 Py_UNICODE *s = self->str;
3047 int status = 0;
3049 while (len-- > 0) {
3050 register Py_UNICODE ch;
3052 ch = Py_UNICODE_TOUPPER(*s);
3053 if (ch != *s) {
3054 status = 1;
3055 *s = ch;
3057 s++;
3060 return status;
3063 static
3064 int fixlower(PyUnicodeObject *self)
3066 int len = self->length;
3067 Py_UNICODE *s = self->str;
3068 int status = 0;
3070 while (len-- > 0) {
3071 register Py_UNICODE ch;
3073 ch = Py_UNICODE_TOLOWER(*s);
3074 if (ch != *s) {
3075 status = 1;
3076 *s = ch;
3078 s++;
3081 return status;
3084 static
3085 int fixswapcase(PyUnicodeObject *self)
3087 int len = self->length;
3088 Py_UNICODE *s = self->str;
3089 int status = 0;
3091 while (len-- > 0) {
3092 if (Py_UNICODE_ISUPPER(*s)) {
3093 *s = Py_UNICODE_TOLOWER(*s);
3094 status = 1;
3095 } else if (Py_UNICODE_ISLOWER(*s)) {
3096 *s = Py_UNICODE_TOUPPER(*s);
3097 status = 1;
3099 s++;
3102 return status;
3105 static
3106 int fixcapitalize(PyUnicodeObject *self)
3108 int len = self->length;
3109 Py_UNICODE *s = self->str;
3110 int status = 0;
3112 if (len == 0)
3113 return 0;
3114 if (Py_UNICODE_ISLOWER(*s)) {
3115 *s = Py_UNICODE_TOUPPER(*s);
3116 status = 1;
3118 s++;
3119 while (--len > 0) {
3120 if (Py_UNICODE_ISUPPER(*s)) {
3121 *s = Py_UNICODE_TOLOWER(*s);
3122 status = 1;
3124 s++;
3126 return status;
3129 static
3130 int fixtitle(PyUnicodeObject *self)
3132 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3133 register Py_UNICODE *e;
3134 int previous_is_cased;
3136 /* Shortcut for single character strings */
3137 if (PyUnicode_GET_SIZE(self) == 1) {
3138 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3139 if (*p != ch) {
3140 *p = ch;
3141 return 1;
3143 else
3144 return 0;
3147 e = p + PyUnicode_GET_SIZE(self);
3148 previous_is_cased = 0;
3149 for (; p < e; p++) {
3150 register const Py_UNICODE ch = *p;
3152 if (previous_is_cased)
3153 *p = Py_UNICODE_TOLOWER(ch);
3154 else
3155 *p = Py_UNICODE_TOTITLE(ch);
3157 if (Py_UNICODE_ISLOWER(ch) ||
3158 Py_UNICODE_ISUPPER(ch) ||
3159 Py_UNICODE_ISTITLE(ch))
3160 previous_is_cased = 1;
3161 else
3162 previous_is_cased = 0;
3164 return 1;
3167 PyObject *PyUnicode_Join(PyObject *separator,
3168 PyObject *seq)
3170 Py_UNICODE *sep;
3171 int seplen;
3172 PyUnicodeObject *res = NULL;
3173 int reslen = 0;
3174 Py_UNICODE *p;
3175 int sz = 100;
3176 int i;
3177 PyObject *it;
3179 it = PyObject_GetIter(seq);
3180 if (it == NULL)
3181 return NULL;
3183 if (separator == NULL) {
3184 Py_UNICODE blank = ' ';
3185 sep = &blank;
3186 seplen = 1;
3188 else {
3189 separator = PyUnicode_FromObject(separator);
3190 if (separator == NULL)
3191 goto onError;
3192 sep = PyUnicode_AS_UNICODE(separator);
3193 seplen = PyUnicode_GET_SIZE(separator);
3196 res = _PyUnicode_New(sz);
3197 if (res == NULL)
3198 goto onError;
3199 p = PyUnicode_AS_UNICODE(res);
3200 reslen = 0;
3202 for (i = 0; ; ++i) {
3203 int itemlen;
3204 PyObject *item = PyIter_Next(it);
3205 if (item == NULL) {
3206 if (PyErr_Occurred())
3207 goto onError;
3208 break;
3210 if (!PyUnicode_Check(item)) {
3211 PyObject *v;
3212 if (!PyString_Check(item)) {
3213 PyErr_Format(PyExc_TypeError,
3214 "sequence item %i: expected string or Unicode,"
3215 " %.80s found",
3216 i, item->ob_type->tp_name);
3217 Py_DECREF(item);
3218 goto onError;
3220 v = PyUnicode_FromObject(item);
3221 Py_DECREF(item);
3222 item = v;
3223 if (item == NULL)
3224 goto onError;
3226 itemlen = PyUnicode_GET_SIZE(item);
3227 while (reslen + itemlen + seplen >= sz) {
3228 if (_PyUnicode_Resize(&res, sz*2)) {
3229 Py_DECREF(item);
3230 goto onError;
3232 sz *= 2;
3233 p = PyUnicode_AS_UNICODE(res) + reslen;
3235 if (i > 0) {
3236 Py_UNICODE_COPY(p, sep, seplen);
3237 p += seplen;
3238 reslen += seplen;
3240 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3241 p += itemlen;
3242 reslen += itemlen;
3243 Py_DECREF(item);
3245 if (_PyUnicode_Resize(&res, reslen))
3246 goto onError;
3248 Py_XDECREF(separator);
3249 Py_DECREF(it);
3250 return (PyObject *)res;
3252 onError:
3253 Py_XDECREF(separator);
3254 Py_XDECREF(res);
3255 Py_DECREF(it);
3256 return NULL;
3259 static
3260 PyUnicodeObject *pad(PyUnicodeObject *self,
3261 int left,
3262 int right,
3263 Py_UNICODE fill)
3265 PyUnicodeObject *u;
3267 if (left < 0)
3268 left = 0;
3269 if (right < 0)
3270 right = 0;
3272 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3273 Py_INCREF(self);
3274 return self;
3277 u = _PyUnicode_New(left + self->length + right);
3278 if (u) {
3279 if (left)
3280 Py_UNICODE_FILL(u->str, fill, left);
3281 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3282 if (right)
3283 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3286 return u;
/* Append the slice data[left:right] to `list` as a new Unicode object.
   The enclosing function must provide a `PyObject *str` scratch
   variable, the `list` being filled and an `onError` label, which is
   jumped to when creating or appending the slice fails. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3300 static
3301 PyObject *split_whitespace(PyUnicodeObject *self,
3302 PyObject *list,
3303 int maxcount)
3305 register int i;
3306 register int j;
3307 int len = self->length;
3308 PyObject *str;
3310 for (i = j = 0; i < len; ) {
3311 /* find a token */
3312 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3313 i++;
3314 j = i;
3315 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3316 i++;
3317 if (j < i) {
3318 if (maxcount-- <= 0)
3319 break;
3320 SPLIT_APPEND(self->str, j, i);
3321 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3322 i++;
3323 j = i;
3326 if (j < len) {
3327 SPLIT_APPEND(self->str, j, len);
3329 return list;
3331 onError:
3332 Py_DECREF(list);
3333 return NULL;
3336 PyObject *PyUnicode_Splitlines(PyObject *string,
3337 int keepends)
3339 register int i;
3340 register int j;
3341 int len;
3342 PyObject *list;
3343 PyObject *str;
3344 Py_UNICODE *data;
3346 string = PyUnicode_FromObject(string);
3347 if (string == NULL)
3348 return NULL;
3349 data = PyUnicode_AS_UNICODE(string);
3350 len = PyUnicode_GET_SIZE(string);
3352 list = PyList_New(0);
3353 if (!list)
3354 goto onError;
3356 for (i = j = 0; i < len; ) {
3357 int eol;
3359 /* Find a line and append it */
3360 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3361 i++;
3363 /* Skip the line break reading CRLF as one line break */
3364 eol = i;
3365 if (i < len) {
3366 if (data[i] == '\r' && i + 1 < len &&
3367 data[i+1] == '\n')
3368 i += 2;
3369 else
3370 i++;
3371 if (keepends)
3372 eol = i;
3374 SPLIT_APPEND(data, j, eol);
3375 j = i;
3377 if (j < len) {
3378 SPLIT_APPEND(data, j, len);
3381 Py_DECREF(string);
3382 return list;
3384 onError:
3385 Py_DECREF(list);
3386 Py_DECREF(string);
3387 return NULL;
3390 static
3391 PyObject *split_char(PyUnicodeObject *self,
3392 PyObject *list,
3393 Py_UNICODE ch,
3394 int maxcount)
3396 register int i;
3397 register int j;
3398 int len = self->length;
3399 PyObject *str;
3401 for (i = j = 0; i < len; ) {
3402 if (self->str[i] == ch) {
3403 if (maxcount-- <= 0)
3404 break;
3405 SPLIT_APPEND(self->str, j, i);
3406 i = j = i + 1;
3407 } else
3408 i++;
3410 if (j <= len) {
3411 SPLIT_APPEND(self->str, j, len);
3413 return list;
3415 onError:
3416 Py_DECREF(list);
3417 return NULL;
3420 static
3421 PyObject *split_substring(PyUnicodeObject *self,
3422 PyObject *list,
3423 PyUnicodeObject *substring,
3424 int maxcount)
3426 register int i;
3427 register int j;
3428 int len = self->length;
3429 int sublen = substring->length;
3430 PyObject *str;
3432 for (i = j = 0; i <= len - sublen; ) {
3433 if (Py_UNICODE_MATCH(self, i, substring)) {
3434 if (maxcount-- <= 0)
3435 break;
3436 SPLIT_APPEND(self->str, j, i);
3437 i = j = i + sublen;
3438 } else
3439 i++;
3441 if (j <= len) {
3442 SPLIT_APPEND(self->str, j, len);
3444 return list;
3446 onError:
3447 Py_DECREF(list);
3448 return NULL;
3451 #undef SPLIT_APPEND
3453 static
3454 PyObject *split(PyUnicodeObject *self,
3455 PyUnicodeObject *substring,
3456 int maxcount)
3458 PyObject *list;
3460 if (maxcount < 0)
3461 maxcount = INT_MAX;
3463 list = PyList_New(0);
3464 if (!list)
3465 return NULL;
3467 if (substring == NULL)
3468 return split_whitespace(self,list,maxcount);
3470 else if (substring->length == 1)
3471 return split_char(self,list,substring->str[0],maxcount);
3473 else if (substring->length == 0) {
3474 Py_DECREF(list);
3475 PyErr_SetString(PyExc_ValueError, "empty separator");
3476 return NULL;
3478 else
3479 return split_substring(self,list,substring,maxcount);
3482 static
3483 PyObject *strip(PyUnicodeObject *self,
3484 int left,
3485 int right)
3487 Py_UNICODE *p = self->str;
3488 int start = 0;
3489 int end = self->length;
3491 if (left)
3492 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3493 start++;
3495 if (right)
3496 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3497 end--;
3499 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3500 /* couldn't strip anything off, return original string */
3501 Py_INCREF(self);
3502 return (PyObject*) self;
3505 return (PyObject*) PyUnicode_FromUnicode(
3506 self->str + start,
3507 end - start
3511 static
3512 PyObject *replace(PyUnicodeObject *self,
3513 PyUnicodeObject *str1,
3514 PyUnicodeObject *str2,
3515 int maxcount)
3517 PyUnicodeObject *u;
3519 if (maxcount < 0)
3520 maxcount = INT_MAX;
3522 if (str1->length == 1 && str2->length == 1) {
3523 int i;
3525 /* replace characters */
3526 if (!findchar(self->str, self->length, str1->str[0]) &&
3527 PyUnicode_CheckExact(self)) {
3528 /* nothing to replace, return original string */
3529 Py_INCREF(self);
3530 u = self;
3531 } else {
3532 Py_UNICODE u1 = str1->str[0];
3533 Py_UNICODE u2 = str2->str[0];
3535 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3536 NULL,
3537 self->length
3539 if (u != NULL) {
3540 Py_UNICODE_COPY(u->str, self->str,
3541 self->length);
3542 for (i = 0; i < u->length; i++)
3543 if (u->str[i] == u1) {
3544 if (--maxcount < 0)
3545 break;
3546 u->str[i] = u2;
3551 } else {
3552 int n, i;
3553 Py_UNICODE *p;
3555 /* replace strings */
3556 n = count(self, 0, self->length, str1);
3557 if (n > maxcount)
3558 n = maxcount;
3559 if (n == 0 && PyUnicode_CheckExact(self)) {
3560 /* nothing to replace, return original string */
3561 Py_INCREF(self);
3562 u = self;
3563 } else {
3564 u = _PyUnicode_New(
3565 self->length + n * (str2->length - str1->length));
3566 if (u) {
3567 i = 0;
3568 p = u->str;
3569 while (i <= self->length - str1->length)
3570 if (Py_UNICODE_MATCH(self, i, str1)) {
3571 /* replace string segment */
3572 Py_UNICODE_COPY(p, str2->str, str2->length);
3573 p += str2->length;
3574 i += str1->length;
3575 if (--n <= 0) {
3576 /* copy remaining part */
3577 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3578 break;
3580 } else
3581 *p++ = self->str[i++];
3586 return (PyObject *) u;
3589 /* --- Unicode Object Methods --------------------------------------------- */
3591 static char title__doc__[] =
3592 "S.title() -> unicode\n\
3594 Return a titlecased version of S, i.e. words start with title case\n\
3595 characters, all remaining cased characters have lower case.";
3597 static PyObject*
3598 unicode_title(PyUnicodeObject *self)
3600 return fixup(self, fixtitle);
3603 static char capitalize__doc__[] =
3604 "S.capitalize() -> unicode\n\
3606 Return a capitalized version of S, i.e. make the first character\n\
3607 have upper case.";
3609 static PyObject*
3610 unicode_capitalize(PyUnicodeObject *self)
3612 return fixup(self, fixcapitalize);
#if 0
/* Disabled: not compiled. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Split `self` on whitespace, capitalize every word with
   fixup()/fixcapitalize() and join the words again with single blanks.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item = NULL;
    int i;
    int failed = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL) {
            failed = 1;
            break;
        }
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Join the words to form a new string */
    if (!failed)
        item = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3653 static char center__doc__[] =
3654 "S.center(width) -> unicode\n\
3656 Return S centered in a Unicode string of length width. Padding is done\n\
3657 using spaces.";
3659 static PyObject *
3660 unicode_center(PyUnicodeObject *self, PyObject *args)
3662 int marg, left;
3663 int width;
3665 if (!PyArg_ParseTuple(args, "i:center", &width))
3666 return NULL;
3668 if (self->length >= width && PyUnicode_CheckExact(self)) {
3669 Py_INCREF(self);
3670 return (PyObject*) self;
3673 marg = width - self->length;
3674 left = marg / 2 + (marg & width & 1);
3676 return (PyObject*) pad(self, left, marg - left, ' ');
3679 #if 0
3681 /* This code should go into some future Unicode collation support
3682 module. The basic comparison should compare ordinals on a naive
3683 basis (this is what Java does and thus JPython too). */
3685 /* speedy UTF-16 code point order comparison */
3686 /* gleaned from: */
3687 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3689 static short utf16Fixup[32] =
3691 0, 0, 0, 0, 0, 0, 0, 0,
3692 0, 0, 0, 0, 0, 0, 0, 0,
3693 0, 0, 0, 0, 0, 0, 0, 0,
3694 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3697 static int
3698 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3700 int len1, len2;
3702 Py_UNICODE *s1 = str1->str;
3703 Py_UNICODE *s2 = str2->str;
3705 len1 = str1->length;
3706 len2 = str2->length;
3708 while (len1 > 0 && len2 > 0) {
3709 Py_UNICODE c1, c2;
3711 c1 = *s1++;
3712 c2 = *s2++;
3714 if (c1 > (1<<11) * 26)
3715 c1 += utf16Fixup[c1>>11];
3716 if (c2 > (1<<11) * 26)
3717 c2 += utf16Fixup[c2>>11];
3718 /* now c1 and c2 are in UTF-32-compatible order */
3720 if (c1 != c2)
3721 return (c1 < c2) ? -1 : 1;
3723 len1--; len2--;
3726 return (len1 < len2) ? -1 : (len1 != len2);
3729 #else
3731 static int
3732 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3734 register int len1, len2;
3736 Py_UNICODE *s1 = str1->str;
3737 Py_UNICODE *s2 = str2->str;
3739 len1 = str1->length;
3740 len2 = str2->length;
3742 while (len1 > 0 && len2 > 0) {
3743 Py_UNICODE c1, c2;
3745 c1 = *s1++;
3746 c2 = *s2++;
3748 if (c1 != c2)
3749 return (c1 < c2) ? -1 : 1;
3751 len1--; len2--;
3754 return (len1 < len2) ? -1 : (len1 != len2);
3757 #endif
3759 int PyUnicode_Compare(PyObject *left,
3760 PyObject *right)
3762 PyUnicodeObject *u = NULL, *v = NULL;
3763 int result;
3765 /* Coerce the two arguments */
3766 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3767 if (u == NULL)
3768 goto onError;
3769 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3770 if (v == NULL)
3771 goto onError;
3773 /* Shortcut for empty or interned objects */
3774 if (v == u) {
3775 Py_DECREF(u);
3776 Py_DECREF(v);
3777 return 0;
3780 result = unicode_compare(u, v);
3782 Py_DECREF(u);
3783 Py_DECREF(v);
3784 return result;
3786 onError:
3787 Py_XDECREF(u);
3788 Py_XDECREF(v);
3789 return -1;
3792 int PyUnicode_Contains(PyObject *container,
3793 PyObject *element)
3795 PyUnicodeObject *u = NULL, *v = NULL;
3796 int result;
3797 register const Py_UNICODE *p, *e;
3798 register Py_UNICODE ch;
3800 /* Coerce the two arguments */
3801 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3802 if (v == NULL) {
3803 PyErr_SetString(PyExc_TypeError,
3804 "'in <string>' requires character as left operand");
3805 goto onError;
3807 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3808 if (u == NULL) {
3809 Py_DECREF(v);
3810 goto onError;
3813 /* Check v in u */
3814 if (PyUnicode_GET_SIZE(v) != 1) {
3815 PyErr_SetString(PyExc_TypeError,
3816 "'in <string>' requires character as left operand");
3817 goto onError;
3819 ch = *PyUnicode_AS_UNICODE(v);
3820 p = PyUnicode_AS_UNICODE(u);
3821 e = p + PyUnicode_GET_SIZE(u);
3822 result = 0;
3823 while (p < e) {
3824 if (*p++ == ch) {
3825 result = 1;
3826 break;
3830 Py_DECREF(u);
3831 Py_DECREF(v);
3832 return result;
3834 onError:
3835 Py_XDECREF(u);
3836 Py_XDECREF(v);
3837 return -1;
3840 /* Concat to string or Unicode object giving a new Unicode object. */
3842 PyObject *PyUnicode_Concat(PyObject *left,
3843 PyObject *right)
3845 PyUnicodeObject *u = NULL, *v = NULL, *w;
3847 /* Coerce the two arguments */
3848 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3849 if (u == NULL)
3850 goto onError;
3851 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3852 if (v == NULL)
3853 goto onError;
3855 /* Shortcuts */
3856 if (v == unicode_empty) {
3857 Py_DECREF(v);
3858 return (PyObject *)u;
3860 if (u == unicode_empty) {
3861 Py_DECREF(u);
3862 return (PyObject *)v;
3865 /* Concat the two Unicode strings */
3866 w = _PyUnicode_New(u->length + v->length);
3867 if (w == NULL)
3868 goto onError;
3869 Py_UNICODE_COPY(w->str, u->str, u->length);
3870 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3872 Py_DECREF(u);
3873 Py_DECREF(v);
3874 return (PyObject *)w;
3876 onError:
3877 Py_XDECREF(u);
3878 Py_XDECREF(v);
3879 return NULL;
3882 static char count__doc__[] =
3883 "S.count(sub[, start[, end]]) -> int\n\
3885 Return the number of occurrences of substring sub in Unicode string\n\
3886 S[start:end]. Optional arguments start and end are\n\
3887 interpreted as in slice notation.";
3889 static PyObject *
3890 unicode_count(PyUnicodeObject *self, PyObject *args)
3892 PyUnicodeObject *substring;
3893 int start = 0;
3894 int end = INT_MAX;
3895 PyObject *result;
3897 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3898 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3899 return NULL;
3901 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3902 (PyObject *)substring);
3903 if (substring == NULL)
3904 return NULL;
3906 if (start < 0)
3907 start += self->length;
3908 if (start < 0)
3909 start = 0;
3910 if (end > self->length)
3911 end = self->length;
3912 if (end < 0)
3913 end += self->length;
3914 if (end < 0)
3915 end = 0;
3917 result = PyInt_FromLong((long) count(self, start, end, substring));
3919 Py_DECREF(substring);
3920 return result;
3923 static char encode__doc__[] =
3924 "S.encode([encoding[,errors]]) -> string\n\
3926 Return an encoded string version of S. Default encoding is the current\n\
3927 default string encoding. errors may be given to set a different error\n\
3928 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3929 a ValueError. Other possible values are 'ignore' and 'replace'.";
3931 static PyObject *
3932 unicode_encode(PyUnicodeObject *self, PyObject *args)
3934 char *encoding = NULL;
3935 char *errors = NULL;
3936 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3937 return NULL;
3938 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3941 static char expandtabs__doc__[] =
3942 "S.expandtabs([tabsize]) -> unicode\n\
3944 Return a copy of S where all tab characters are expanded using spaces.\n\
3945 If tabsize is not given, a tab size of 8 characters is assumed.";
3947 static PyObject*
3948 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3950 Py_UNICODE *e;
3951 Py_UNICODE *p;
3952 Py_UNICODE *q;
3953 int i, j;
3954 PyUnicodeObject *u;
3955 int tabsize = 8;
3957 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3958 return NULL;
3960 /* First pass: determine size of output string */
3961 i = j = 0;
3962 e = self->str + self->length;
3963 for (p = self->str; p < e; p++)
3964 if (*p == '\t') {
3965 if (tabsize > 0)
3966 j += tabsize - (j % tabsize);
3968 else {
3969 j++;
3970 if (*p == '\n' || *p == '\r') {
3971 i += j;
3972 j = 0;
3976 /* Second pass: create output string and fill it */
3977 u = _PyUnicode_New(i + j);
3978 if (!u)
3979 return NULL;
3981 j = 0;
3982 q = u->str;
3984 for (p = self->str; p < e; p++)
3985 if (*p == '\t') {
3986 if (tabsize > 0) {
3987 i = tabsize - (j % tabsize);
3988 j += i;
3989 while (i--)
3990 *q++ = ' ';
3993 else {
3994 j++;
3995 *q++ = *p;
3996 if (*p == '\n' || *p == '\r')
3997 j = 0;
4000 return (PyObject*) u;
4003 static char find__doc__[] =
4004 "S.find(sub [,start [,end]]) -> int\n\
4006 Return the lowest index in S where substring sub is found,\n\
4007 such that sub is contained within s[start,end]. Optional\n\
4008 arguments start and end are interpreted as in slice notation.\n\
4010 Return -1 on failure.";
4012 static PyObject *
4013 unicode_find(PyUnicodeObject *self, PyObject *args)
4015 PyUnicodeObject *substring;
4016 int start = 0;
4017 int end = INT_MAX;
4018 PyObject *result;
4020 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4021 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4022 return NULL;
4023 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4024 (PyObject *)substring);
4025 if (substring == NULL)
4026 return NULL;
4028 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4030 Py_DECREF(substring);
4031 return result;
4034 static PyObject *
4035 unicode_getitem(PyUnicodeObject *self, int index)
4037 if (index < 0 || index >= self->length) {
4038 PyErr_SetString(PyExc_IndexError, "string index out of range");
4039 return NULL;
4042 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4045 static long
4046 unicode_hash(PyUnicodeObject *self)
4048 /* Since Unicode objects compare equal to their ASCII string
4049 counterparts, they should use the individual character values
4050 as basis for their hash value. This is needed to assure that
4051 strings and Unicode objects behave in the same way as
4052 dictionary keys. */
4054 register int len;
4055 register Py_UNICODE *p;
4056 register long x;
4058 if (self->hash != -1)
4059 return self->hash;
4060 len = PyUnicode_GET_SIZE(self);
4061 p = PyUnicode_AS_UNICODE(self);
4062 x = *p << 7;
4063 while (--len >= 0)
4064 x = (1000003*x) ^ *p++;
4065 x ^= PyUnicode_GET_SIZE(self);
4066 if (x == -1)
4067 x = -2;
4068 self->hash = x;
4069 return x;
4072 static char index__doc__[] =
4073 "S.index(sub [,start [,end]]) -> int\n\
4075 Like S.find() but raise ValueError when the substring is not found.";
4077 static PyObject *
4078 unicode_index(PyUnicodeObject *self, PyObject *args)
4080 int result;
4081 PyUnicodeObject *substring;
4082 int start = 0;
4083 int end = INT_MAX;
4085 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4086 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4087 return NULL;
4089 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4090 (PyObject *)substring);
4091 if (substring == NULL)
4092 return NULL;
4094 result = findstring(self, substring, start, end, 1);
4096 Py_DECREF(substring);
4097 if (result < 0) {
4098 PyErr_SetString(PyExc_ValueError, "substring not found");
4099 return NULL;
4101 return PyInt_FromLong(result);
4104 static char islower__doc__[] =
4105 "S.islower() -> int\n\
4107 Return 1 if all cased characters in S are lowercase and there is\n\
4108 at least one cased character in S, 0 otherwise.";
4110 static PyObject*
4111 unicode_islower(PyUnicodeObject *self)
4113 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4114 register const Py_UNICODE *e;
4115 int cased;
4117 /* Shortcut for single character strings */
4118 if (PyUnicode_GET_SIZE(self) == 1)
4119 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4121 /* Special case for empty strings */
4122 if (PyString_GET_SIZE(self) == 0)
4123 return PyInt_FromLong(0);
4125 e = p + PyUnicode_GET_SIZE(self);
4126 cased = 0;
4127 for (; p < e; p++) {
4128 register const Py_UNICODE ch = *p;
4130 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4131 return PyInt_FromLong(0);
4132 else if (!cased && Py_UNICODE_ISLOWER(ch))
4133 cased = 1;
4135 return PyInt_FromLong(cased);
4138 static char isupper__doc__[] =
4139 "S.isupper() -> int\n\
4141 Return 1 if all cased characters in S are uppercase and there is\n\
4142 at least one cased character in S, 0 otherwise.";
4144 static PyObject*
4145 unicode_isupper(PyUnicodeObject *self)
4147 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4148 register const Py_UNICODE *e;
4149 int cased;
4151 /* Shortcut for single character strings */
4152 if (PyUnicode_GET_SIZE(self) == 1)
4153 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4155 /* Special case for empty strings */
4156 if (PyString_GET_SIZE(self) == 0)
4157 return PyInt_FromLong(0);
4159 e = p + PyUnicode_GET_SIZE(self);
4160 cased = 0;
4161 for (; p < e; p++) {
4162 register const Py_UNICODE ch = *p;
4164 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4165 return PyInt_FromLong(0);
4166 else if (!cased && Py_UNICODE_ISUPPER(ch))
4167 cased = 1;
4169 return PyInt_FromLong(cased);
4172 static char istitle__doc__[] =
4173 "S.istitle() -> int\n\
4175 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4176 may only follow uncased characters and lowercase characters only cased\n\
4177 ones. Return 0 otherwise.";
4179 static PyObject*
4180 unicode_istitle(PyUnicodeObject *self)
4182 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4183 register const Py_UNICODE *e;
4184 int cased, previous_is_cased;
4186 /* Shortcut for single character strings */
4187 if (PyUnicode_GET_SIZE(self) == 1)
4188 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4189 (Py_UNICODE_ISUPPER(*p) != 0));
4191 /* Special case for empty strings */
4192 if (PyString_GET_SIZE(self) == 0)
4193 return PyInt_FromLong(0);
4195 e = p + PyUnicode_GET_SIZE(self);
4196 cased = 0;
4197 previous_is_cased = 0;
4198 for (; p < e; p++) {
4199 register const Py_UNICODE ch = *p;
4201 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4202 if (previous_is_cased)
4203 return PyInt_FromLong(0);
4204 previous_is_cased = 1;
4205 cased = 1;
4207 else if (Py_UNICODE_ISLOWER(ch)) {
4208 if (!previous_is_cased)
4209 return PyInt_FromLong(0);
4210 previous_is_cased = 1;
4211 cased = 1;
4213 else
4214 previous_is_cased = 0;
4216 return PyInt_FromLong(cased);
4219 static char isspace__doc__[] =
4220 "S.isspace() -> int\n\
4222 Return 1 if there are only whitespace characters in S,\n\
4223 0 otherwise.";
4225 static PyObject*
4226 unicode_isspace(PyUnicodeObject *self)
4228 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4229 register const Py_UNICODE *e;
4231 /* Shortcut for single character strings */
4232 if (PyUnicode_GET_SIZE(self) == 1 &&
4233 Py_UNICODE_ISSPACE(*p))
4234 return PyInt_FromLong(1);
4236 /* Special case for empty strings */
4237 if (PyString_GET_SIZE(self) == 0)
4238 return PyInt_FromLong(0);
4240 e = p + PyUnicode_GET_SIZE(self);
4241 for (; p < e; p++) {
4242 if (!Py_UNICODE_ISSPACE(*p))
4243 return PyInt_FromLong(0);
4245 return PyInt_FromLong(1);
4248 static char isalpha__doc__[] =
4249 "S.isalpha() -> int\n\
4251 Return 1 if all characters in S are alphabetic\n\
4252 and there is at least one character in S, 0 otherwise.";
4254 static PyObject*
4255 unicode_isalpha(PyUnicodeObject *self)
4257 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4258 register const Py_UNICODE *e;
4260 /* Shortcut for single character strings */
4261 if (PyUnicode_GET_SIZE(self) == 1 &&
4262 Py_UNICODE_ISALPHA(*p))
4263 return PyInt_FromLong(1);
4265 /* Special case for empty strings */
4266 if (PyString_GET_SIZE(self) == 0)
4267 return PyInt_FromLong(0);
4269 e = p + PyUnicode_GET_SIZE(self);
4270 for (; p < e; p++) {
4271 if (!Py_UNICODE_ISALPHA(*p))
4272 return PyInt_FromLong(0);
4274 return PyInt_FromLong(1);
4277 static char isalnum__doc__[] =
4278 "S.isalnum() -> int\n\
4280 Return 1 if all characters in S are alphanumeric\n\
4281 and there is at least one character in S, 0 otherwise.";
4283 static PyObject*
4284 unicode_isalnum(PyUnicodeObject *self)
4286 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4287 register const Py_UNICODE *e;
4289 /* Shortcut for single character strings */
4290 if (PyUnicode_GET_SIZE(self) == 1 &&
4291 Py_UNICODE_ISALNUM(*p))
4292 return PyInt_FromLong(1);
4294 /* Special case for empty strings */
4295 if (PyString_GET_SIZE(self) == 0)
4296 return PyInt_FromLong(0);
4298 e = p + PyUnicode_GET_SIZE(self);
4299 for (; p < e; p++) {
4300 if (!Py_UNICODE_ISALNUM(*p))
4301 return PyInt_FromLong(0);
4303 return PyInt_FromLong(1);
4306 static char isdecimal__doc__[] =
4307 "S.isdecimal() -> int\n\
4309 Return 1 if there are only decimal characters in S,\n\
4310 0 otherwise.";
4312 static PyObject*
4313 unicode_isdecimal(PyUnicodeObject *self)
4315 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4316 register const Py_UNICODE *e;
4318 /* Shortcut for single character strings */
4319 if (PyUnicode_GET_SIZE(self) == 1 &&
4320 Py_UNICODE_ISDECIMAL(*p))
4321 return PyInt_FromLong(1);
4323 /* Special case for empty strings */
4324 if (PyString_GET_SIZE(self) == 0)
4325 return PyInt_FromLong(0);
4327 e = p + PyUnicode_GET_SIZE(self);
4328 for (; p < e; p++) {
4329 if (!Py_UNICODE_ISDECIMAL(*p))
4330 return PyInt_FromLong(0);
4332 return PyInt_FromLong(1);
4335 static char isdigit__doc__[] =
4336 "S.isdigit() -> int\n\
4338 Return 1 if there are only digit characters in S,\n\
4339 0 otherwise.";
4341 static PyObject*
4342 unicode_isdigit(PyUnicodeObject *self)
4344 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4345 register const Py_UNICODE *e;
4347 /* Shortcut for single character strings */
4348 if (PyUnicode_GET_SIZE(self) == 1 &&
4349 Py_UNICODE_ISDIGIT(*p))
4350 return PyInt_FromLong(1);
4352 /* Special case for empty strings */
4353 if (PyString_GET_SIZE(self) == 0)
4354 return PyInt_FromLong(0);
4356 e = p + PyUnicode_GET_SIZE(self);
4357 for (; p < e; p++) {
4358 if (!Py_UNICODE_ISDIGIT(*p))
4359 return PyInt_FromLong(0);
4361 return PyInt_FromLong(1);
4364 static char isnumeric__doc__[] =
4365 "S.isnumeric() -> int\n\
4367 Return 1 if there are only numeric characters in S,\n\
4368 0 otherwise.";
4370 static PyObject*
4371 unicode_isnumeric(PyUnicodeObject *self)
4373 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4374 register const Py_UNICODE *e;
4376 /* Shortcut for single character strings */
4377 if (PyUnicode_GET_SIZE(self) == 1 &&
4378 Py_UNICODE_ISNUMERIC(*p))
4379 return PyInt_FromLong(1);
4381 /* Special case for empty strings */
4382 if (PyString_GET_SIZE(self) == 0)
4383 return PyInt_FromLong(0);
4385 e = p + PyUnicode_GET_SIZE(self);
4386 for (; p < e; p++) {
4387 if (!Py_UNICODE_ISNUMERIC(*p))
4388 return PyInt_FromLong(0);
4390 return PyInt_FromLong(1);
4393 static char join__doc__[] =
4394 "S.join(sequence) -> unicode\n\
4396 Return a string which is the concatenation of the strings in the\n\
4397 sequence. The separator between elements is S.";
4399 static PyObject*
4400 unicode_join(PyObject *self, PyObject *data)
4402 return PyUnicode_Join(self, data);
4405 static int
4406 unicode_length(PyUnicodeObject *self)
4408 return self->length;
4411 static char ljust__doc__[] =
4412 "S.ljust(width) -> unicode\n\
4414 Return S left justified in a Unicode string of length width. Padding is\n\
4415 done using spaces.";
4417 static PyObject *
4418 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4420 int width;
4421 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4422 return NULL;
4424 if (self->length >= width && PyUnicode_CheckExact(self)) {
4425 Py_INCREF(self);
4426 return (PyObject*) self;
4429 return (PyObject*) pad(self, 0, width - self->length, ' ');
4432 static char lower__doc__[] =
4433 "S.lower() -> unicode\n\
4435 Return a copy of the string S converted to lowercase.";
4437 static PyObject*
4438 unicode_lower(PyUnicodeObject *self)
4440 return fixup(self, fixlower);
4443 static char lstrip__doc__[] =
4444 "S.lstrip() -> unicode\n\
4446 Return a copy of the string S with leading whitespace removed.";
4448 static PyObject *
4449 unicode_lstrip(PyUnicodeObject *self)
4451 return strip(self, 1, 0);
4454 static PyObject*
4455 unicode_repeat(PyUnicodeObject *str, int len)
4457 PyUnicodeObject *u;
4458 Py_UNICODE *p;
4459 int nchars;
4460 size_t nbytes;
4462 if (len < 0)
4463 len = 0;
4465 if (len == 1 && PyUnicode_CheckExact(str)) {
4466 /* no repeat, return original string */
4467 Py_INCREF(str);
4468 return (PyObject*) str;
4471 /* ensure # of chars needed doesn't overflow int and # of bytes
4472 * needed doesn't overflow size_t
4474 nchars = len * str->length;
4475 if (len && nchars / len != str->length) {
4476 PyErr_SetString(PyExc_OverflowError,
4477 "repeated string is too long");
4478 return NULL;
4480 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4481 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4482 PyErr_SetString(PyExc_OverflowError,
4483 "repeated string is too long");
4484 return NULL;
4486 u = _PyUnicode_New(nchars);
4487 if (!u)
4488 return NULL;
4490 p = u->str;
4492 while (len-- > 0) {
4493 Py_UNICODE_COPY(p, str->str, str->length);
4494 p += str->length;
4497 return (PyObject*) u;
4500 PyObject *PyUnicode_Replace(PyObject *obj,
4501 PyObject *subobj,
4502 PyObject *replobj,
4503 int maxcount)
4505 PyObject *self;
4506 PyObject *str1;
4507 PyObject *str2;
4508 PyObject *result;
4510 self = PyUnicode_FromObject(obj);
4511 if (self == NULL)
4512 return NULL;
4513 str1 = PyUnicode_FromObject(subobj);
4514 if (str1 == NULL) {
4515 Py_DECREF(self);
4516 return NULL;
4518 str2 = PyUnicode_FromObject(replobj);
4519 if (str2 == NULL) {
4520 Py_DECREF(self);
4521 Py_DECREF(str1);
4522 return NULL;
4524 result = replace((PyUnicodeObject *)self,
4525 (PyUnicodeObject *)str1,
4526 (PyUnicodeObject *)str2,
4527 maxcount);
4528 Py_DECREF(self);
4529 Py_DECREF(str1);
4530 Py_DECREF(str2);
4531 return result;
4534 static char replace__doc__[] =
4535 "S.replace (old, new[, maxsplit]) -> unicode\n\
4537 Return a copy of S with all occurrences of substring\n\
4538 old replaced by new. If the optional argument maxsplit is\n\
4539 given, only the first maxsplit occurrences are replaced.";
4541 static PyObject*
4542 unicode_replace(PyUnicodeObject *self, PyObject *args)
4544 PyUnicodeObject *str1;
4545 PyUnicodeObject *str2;
4546 int maxcount = -1;
4547 PyObject *result;
4549 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4550 return NULL;
4551 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4552 if (str1 == NULL)
4553 return NULL;
4554 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4555 if (str2 == NULL)
4556 return NULL;
4558 result = replace(self, str1, str2, maxcount);
4560 Py_DECREF(str1);
4561 Py_DECREF(str2);
4562 return result;
4565 static
4566 PyObject *unicode_repr(PyObject *unicode)
4568 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4569 PyUnicode_GET_SIZE(unicode),
4573 static char rfind__doc__[] =
4574 "S.rfind(sub [,start [,end]]) -> int\n\
4576 Return the highest index in S where substring sub is found,\n\
4577 such that sub is contained within s[start,end]. Optional\n\
4578 arguments start and end are interpreted as in slice notation.\n\
4580 Return -1 on failure.";
4582 static PyObject *
4583 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4585 PyUnicodeObject *substring;
4586 int start = 0;
4587 int end = INT_MAX;
4588 PyObject *result;
4590 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4591 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4592 return NULL;
4593 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4594 (PyObject *)substring);
4595 if (substring == NULL)
4596 return NULL;
4598 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4600 Py_DECREF(substring);
4601 return result;
4604 static char rindex__doc__[] =
4605 "S.rindex(sub [,start [,end]]) -> int\n\
4607 Like S.rfind() but raise ValueError when the substring is not found.";
4609 static PyObject *
4610 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4612 int result;
4613 PyUnicodeObject *substring;
4614 int start = 0;
4615 int end = INT_MAX;
4617 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4618 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4619 return NULL;
4620 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4621 (PyObject *)substring);
4622 if (substring == NULL)
4623 return NULL;
4625 result = findstring(self, substring, start, end, -1);
4627 Py_DECREF(substring);
4628 if (result < 0) {
4629 PyErr_SetString(PyExc_ValueError, "substring not found");
4630 return NULL;
4632 return PyInt_FromLong(result);
4635 static char rjust__doc__[] =
4636 "S.rjust(width) -> unicode\n\
4638 Return S right justified in a Unicode string of length width. Padding is\n\
4639 done using spaces.";
4641 static PyObject *
4642 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4644 int width;
4645 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4646 return NULL;
4648 if (self->length >= width && PyUnicode_CheckExact(self)) {
4649 Py_INCREF(self);
4650 return (PyObject*) self;
4653 return (PyObject*) pad(self, width - self->length, 0, ' ');
4656 static char rstrip__doc__[] =
4657 "S.rstrip() -> unicode\n\
4659 Return a copy of the string S with trailing whitespace removed.";
4661 static PyObject *
4662 unicode_rstrip(PyUnicodeObject *self)
4664 return strip(self, 0, 1);
4667 static PyObject*
4668 unicode_slice(PyUnicodeObject *self, int start, int end)
4670 /* standard clamping */
4671 if (start < 0)
4672 start = 0;
4673 if (end < 0)
4674 end = 0;
4675 if (end > self->length)
4676 end = self->length;
4677 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4678 /* full slice, return original string */
4679 Py_INCREF(self);
4680 return (PyObject*) self;
4682 if (start > end)
4683 start = end;
4684 /* copy slice */
4685 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4686 end - start);
4689 PyObject *PyUnicode_Split(PyObject *s,
4690 PyObject *sep,
4691 int maxsplit)
4693 PyObject *result;
4695 s = PyUnicode_FromObject(s);
4696 if (s == NULL)
4697 return NULL;
4698 if (sep != NULL) {
4699 sep = PyUnicode_FromObject(sep);
4700 if (sep == NULL) {
4701 Py_DECREF(s);
4702 return NULL;
4706 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4708 Py_DECREF(s);
4709 Py_XDECREF(sep);
4710 return result;
4713 static char split__doc__[] =
4714 "S.split([sep [,maxsplit]]) -> list of strings\n\
4716 Return a list of the words in S, using sep as the\n\
4717 delimiter string. If maxsplit is given, at most maxsplit\n\
4718 splits are done. If sep is not specified, any whitespace string\n\
4719 is a separator.";
4721 static PyObject*
4722 unicode_split(PyUnicodeObject *self, PyObject *args)
4724 PyObject *substring = Py_None;
4725 int maxcount = -1;
4727 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4728 return NULL;
4730 if (substring == Py_None)
4731 return split(self, NULL, maxcount);
4732 else if (PyUnicode_Check(substring))
4733 return split(self, (PyUnicodeObject *)substring, maxcount);
4734 else
4735 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4738 static char splitlines__doc__[] =
4739 "S.splitlines([keepends]]) -> list of strings\n\
4741 Return a list of the lines in S, breaking at line boundaries.\n\
4742 Line breaks are not included in the resulting list unless keepends\n\
4743 is given and true.";
4745 static PyObject*
4746 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4748 int keepends = 0;
4750 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4751 return NULL;
4753 return PyUnicode_Splitlines((PyObject *)self, keepends);
4756 static
4757 PyObject *unicode_str(PyUnicodeObject *self)
4759 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4762 static char strip__doc__[] =
4763 "S.strip() -> unicode\n\
4765 Return a copy of S with leading and trailing whitespace removed.";
4767 static PyObject *
4768 unicode_strip(PyUnicodeObject *self)
4770 return strip(self, 1, 1);
4773 static char swapcase__doc__[] =
4774 "S.swapcase() -> unicode\n\
4776 Return a copy of S with uppercase characters converted to lowercase\n\
4777 and vice versa.";
4779 static PyObject*
4780 unicode_swapcase(PyUnicodeObject *self)
4782 return fixup(self, fixswapcase);
4785 static char translate__doc__[] =
4786 "S.translate(table) -> unicode\n\
4788 Return a copy of the string S, where all characters have been mapped\n\
4789 through the given translation table, which must be a mapping of\n\
4790 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4791 are left untouched. Characters mapped to None are deleted.";
4793 static PyObject*
4794 unicode_translate(PyUnicodeObject *self, PyObject *table)
4796 return PyUnicode_TranslateCharmap(self->str,
4797 self->length,
4798 table,
4799 "ignore");
4802 static char upper__doc__[] =
4803 "S.upper() -> unicode\n\
4805 Return a copy of S converted to uppercase.";
4807 static PyObject*
4808 unicode_upper(PyUnicodeObject *self)
4810 return fixup(self, fixupper);
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of S.zfill(width) (currently compiled out).
 *
 * A string already at least `width` long is returned unchanged (new
 * reference).  Otherwise the string is left-padded with '0' via pad()
 * and a leading '+' or '-' is moved in front of the padding.  Returns
 * NULL if argument parsing or the padding allocation fails.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() result must be checked before u->str is touched */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
#if 0
/* Debugging aid (compiled out): return the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4857 static char startswith__doc__[] =
4858 "S.startswith(prefix[, start[, end]]) -> int\n\
4860 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4861 optional start, test S beginning at that position. With optional end, stop\n\
4862 comparing S at that position.";
4864 static PyObject *
4865 unicode_startswith(PyUnicodeObject *self,
4866 PyObject *args)
4868 PyUnicodeObject *substring;
4869 int start = 0;
4870 int end = INT_MAX;
4871 PyObject *result;
4873 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4874 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4875 return NULL;
4876 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4877 (PyObject *)substring);
4878 if (substring == NULL)
4879 return NULL;
4881 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4883 Py_DECREF(substring);
4884 return result;
4888 static char endswith__doc__[] =
4889 "S.endswith(suffix[, start[, end]]) -> int\n\
4891 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4892 optional start, test S beginning at that position. With optional end, stop\n\
4893 comparing S at that position.";
4895 static PyObject *
4896 unicode_endswith(PyUnicodeObject *self,
4897 PyObject *args)
4899 PyUnicodeObject *substring;
4900 int start = 0;
4901 int end = INT_MAX;
4902 PyObject *result;
4904 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4905 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4906 return NULL;
4907 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4908 (PyObject *)substring);
4909 if (substring == NULL)
4910 return NULL;
4912 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4914 Py_DECREF(substring);
4915 return result;
4919 static PyMethodDef unicode_methods[] = {
4921 /* Order is according to common usage: often used methods should
4922 appear first, since lookup is done sequentially. */
4924 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4925 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4926 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4927 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4928 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4929 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4930 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4931 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4932 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4933 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4934 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4935 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4936 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4937 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4938 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4939 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4940 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4941 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4942 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4943 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4944 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4945 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4946 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4947 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4948 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4949 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4950 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4951 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4952 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4953 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4954 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4955 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4956 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4957 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4958 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
4959 #if 0
4960 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4961 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
4962 #endif
4964 #if 0
4965 /* This one is just used for debugging the implementation. */
4966 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
4967 #endif
4969 {NULL, NULL}
4972 static PySequenceMethods unicode_as_sequence = {
4973 (inquiry) unicode_length, /* sq_length */
4974 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4975 (intargfunc) unicode_repeat, /* sq_repeat */
4976 (intargfunc) unicode_getitem, /* sq_item */
4977 (intintargfunc) unicode_slice, /* sq_slice */
4978 0, /* sq_ass_item */
4979 0, /* sq_ass_slice */
4980 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4983 static int
4984 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4985 int index,
4986 const void **ptr)
4988 if (index != 0) {
4989 PyErr_SetString(PyExc_SystemError,
4990 "accessing non-existent unicode segment");
4991 return -1;
4993 *ptr = (void *) self->str;
4994 return PyUnicode_GET_DATA_SIZE(self);
4997 static int
4998 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4999 const void **ptr)
5001 PyErr_SetString(PyExc_TypeError,
5002 "cannot use unicode as modifyable buffer");
5003 return -1;
5006 static int
5007 unicode_buffer_getsegcount(PyUnicodeObject *self,
5008 int *lenp)
5010 if (lenp)
5011 *lenp = PyUnicode_GET_DATA_SIZE(self);
5012 return 1;
5015 static int
5016 unicode_buffer_getcharbuf(PyUnicodeObject *self,
5017 int index,
5018 const void **ptr)
5020 PyObject *str;
5022 if (index != 0) {
5023 PyErr_SetString(PyExc_SystemError,
5024 "accessing non-existent unicode segment");
5025 return -1;
5027 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5028 if (str == NULL)
5029 return -1;
5030 *ptr = (void *) PyString_AS_STRING(str);
5031 return PyString_GET_SIZE(str);
5034 /* Helpers for PyUnicode_Format() */
5036 static PyObject *
5037 getnextarg(PyObject *args, int arglen, int *p_argidx)
5039 int argidx = *p_argidx;
5040 if (argidx < arglen) {
5041 (*p_argidx)++;
5042 if (arglen < 0)
5043 return args;
5044 else
5045 return PyTuple_GetItem(args, argidx);
5047 PyErr_SetString(PyExc_TypeError,
5048 "not enough arguments for format string");
5049 return NULL;
/* Conversion flag bits collected while parsing a format specifier */
#define F_LJUST 0x01    /* '-' flag */
#define F_SIGN  0x02    /* '+' flag */
#define F_BLANK 0x04    /* ' ' flag */
#define F_ALT   0x08    /* '#' flag */
#define F_ZERO  0x10    /* '0' flag */
5058 static
5059 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5061 register int i;
5062 int len;
5063 va_list va;
5064 char *charbuffer;
5065 va_start(va, format);
5067 /* First, format the string as char array, then expand to Py_UNICODE
5068 array. */
5069 charbuffer = (char *)buffer;
5070 len = vsprintf(charbuffer, format, va);
5071 for (i = len - 1; i >= 0; i--)
5072 buffer[i] = (Py_UNICODE) charbuffer[i];
5074 va_end(va);
5075 return len;
5078 static int
5079 formatfloat(Py_UNICODE *buf,
5080 size_t buflen,
5081 int flags,
5082 int prec,
5083 int type,
5084 PyObject *v)
5086 /* fmt = '%#.' + `prec` + `type`
5087 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5088 char fmt[20];
5089 double x;
5091 x = PyFloat_AsDouble(v);
5092 if (x == -1.0 && PyErr_Occurred())
5093 return -1;
5094 if (prec < 0)
5095 prec = 6;
5096 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5097 type = 'g';
5098 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5099 (flags & F_ALT) ? "#" : "", prec, type);
5100 /* worst case length calc to ensure no buffer overrun:
5101 fmt = %#.<prec>g
5102 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5103 for any double rep.)
5104 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5105 If prec=0 the effective precision is 1 (the leading digit is
5106 always given), therefore increase by one to 10+prec. */
5107 if (buflen <= (size_t)10 + (size_t)prec) {
5108 PyErr_SetString(PyExc_OverflowError,
5109 "formatted float is too long (precision too long?)");
5110 return -1;
5112 return usprintf(buf, fmt, x);
5115 static PyObject*
5116 formatlong(PyObject *val, int flags, int prec, int type)
5118 char *buf;
5119 int i, len;
5120 PyObject *str; /* temporary string object. */
5121 PyUnicodeObject *result;
5123 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5124 if (!str)
5125 return NULL;
5126 result = _PyUnicode_New(len);
5127 for (i = 0; i < len; i++)
5128 result->str[i] = buf[i];
5129 result->str[len] = 0;
5130 Py_DECREF(str);
5131 return (PyObject*)result;
5134 static int
5135 formatint(Py_UNICODE *buf,
5136 size_t buflen,
5137 int flags,
5138 int prec,
5139 int type,
5140 PyObject *v)
5142 /* fmt = '%#.' + `prec` + 'l' + `type`
5143 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5144 + 1 + 1 = 24*/
5145 char fmt[64]; /* plenty big enough! */
5146 long x;
5147 int use_native_c_format = 1;
5149 x = PyInt_AsLong(v);
5150 if (x == -1 && PyErr_Occurred())
5151 return -1;
5152 if (prec < 0)
5153 prec = 1;
5154 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5155 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5156 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5157 PyErr_SetString(PyExc_OverflowError,
5158 "formatted integer is too long (precision too long?)");
5159 return -1;
5161 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5162 * but we want it (for consistency with other %#x conversions, and
5163 * for consistency with Python's hex() function).
5164 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5165 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5166 * So add it only if the platform doesn't already.
5168 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5169 /* Only way to know what the platform does is to try it. */
5170 PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
5171 if (fmt[1] != (char)type) {
5172 /* Supply our own leading 0x/0X -- needed under std C */
5173 use_native_c_format = 0;
5174 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
5177 if (use_native_c_format)
5178 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5179 (flags & F_ALT) ? "#" : "", prec, type);
5180 return usprintf(buf, fmt, x);
5183 static int
5184 formatchar(Py_UNICODE *buf,
5185 size_t buflen,
5186 PyObject *v)
5188 /* presume that the buffer is at least 2 characters long */
5189 if (PyUnicode_Check(v)) {
5190 if (PyUnicode_GET_SIZE(v) != 1)
5191 goto onError;
5192 buf[0] = PyUnicode_AS_UNICODE(v)[0];
5195 else if (PyString_Check(v)) {
5196 if (PyString_GET_SIZE(v) != 1)
5197 goto onError;
5198 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5201 else {
5202 /* Integer input truncated to a character */
5203 long x;
5204 x = PyInt_AsLong(v);
5205 if (x == -1 && PyErr_Occurred())
5206 goto onError;
5207 buf[0] = (char) x;
5209 buf[1] = '\0';
5210 return 1;
5212 onError:
5213 PyErr_SetString(PyExc_TypeError,
5214 "%c requires int or char");
5215 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch
   buffer in which floats, ints and chars are formatted.

   XXX This is a magic number.  Each formatting routine does its own
   bounds checking to ensure no overflow, but a better solution may be
   to malloc a buffer of appropriate size for each format.  For now,
   the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
5228 PyObject *PyUnicode_Format(PyObject *format,
5229 PyObject *args)
5231 Py_UNICODE *fmt, *res;
5232 int fmtcnt, rescnt, reslen, arglen, argidx;
5233 int args_owned = 0;
5234 PyUnicodeObject *result = NULL;
5235 PyObject *dict = NULL;
5236 PyObject *uformat;
5238 if (format == NULL || args == NULL) {
5239 PyErr_BadInternalCall();
5240 return NULL;
5242 uformat = PyUnicode_FromObject(format);
5243 if (uformat == NULL)
5244 return NULL;
5245 fmt = PyUnicode_AS_UNICODE(uformat);
5246 fmtcnt = PyUnicode_GET_SIZE(uformat);
5248 reslen = rescnt = fmtcnt + 100;
5249 result = _PyUnicode_New(reslen);
5250 if (result == NULL)
5251 goto onError;
5252 res = PyUnicode_AS_UNICODE(result);
5254 if (PyTuple_Check(args)) {
5255 arglen = PyTuple_Size(args);
5256 argidx = 0;
5258 else {
5259 arglen = -1;
5260 argidx = -2;
5262 if (args->ob_type->tp_as_mapping)
5263 dict = args;
5265 while (--fmtcnt >= 0) {
5266 if (*fmt != '%') {
5267 if (--rescnt < 0) {
5268 rescnt = fmtcnt + 100;
5269 reslen += rescnt;
5270 if (_PyUnicode_Resize(&result, reslen) < 0)
5271 return NULL;
5272 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5273 --rescnt;
5275 *res++ = *fmt++;
5277 else {
5278 /* Got a format specifier */
5279 int flags = 0;
5280 int width = -1;
5281 int prec = -1;
5282 Py_UNICODE c = '\0';
5283 Py_UNICODE fill;
5284 PyObject *v = NULL;
5285 PyObject *temp = NULL;
5286 Py_UNICODE *pbuf;
5287 Py_UNICODE sign;
5288 int len;
5289 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5291 fmt++;
5292 if (*fmt == '(') {
5293 Py_UNICODE *keystart;
5294 int keylen;
5295 PyObject *key;
5296 int pcount = 1;
5298 if (dict == NULL) {
5299 PyErr_SetString(PyExc_TypeError,
5300 "format requires a mapping");
5301 goto onError;
5303 ++fmt;
5304 --fmtcnt;
5305 keystart = fmt;
5306 /* Skip over balanced parentheses */
5307 while (pcount > 0 && --fmtcnt >= 0) {
5308 if (*fmt == ')')
5309 --pcount;
5310 else if (*fmt == '(')
5311 ++pcount;
5312 fmt++;
5314 keylen = fmt - keystart - 1;
5315 if (fmtcnt < 0 || pcount > 0) {
5316 PyErr_SetString(PyExc_ValueError,
5317 "incomplete format key");
5318 goto onError;
5320 #if 0
5321 /* keys are converted to strings using UTF-8 and
5322 then looked up since Python uses strings to hold
5323 variables names etc. in its namespaces and we
5324 wouldn't want to break common idioms. */
5325 key = PyUnicode_EncodeUTF8(keystart,
5326 keylen,
5327 NULL);
5328 #else
5329 key = PyUnicode_FromUnicode(keystart, keylen);
5330 #endif
5331 if (key == NULL)
5332 goto onError;
5333 if (args_owned) {
5334 Py_DECREF(args);
5335 args_owned = 0;
5337 args = PyObject_GetItem(dict, key);
5338 Py_DECREF(key);
5339 if (args == NULL) {
5340 goto onError;
5342 args_owned = 1;
5343 arglen = -1;
5344 argidx = -2;
5346 while (--fmtcnt >= 0) {
5347 switch (c = *fmt++) {
5348 case '-': flags |= F_LJUST; continue;
5349 case '+': flags |= F_SIGN; continue;
5350 case ' ': flags |= F_BLANK; continue;
5351 case '#': flags |= F_ALT; continue;
5352 case '0': flags |= F_ZERO; continue;
5354 break;
5356 if (c == '*') {
5357 v = getnextarg(args, arglen, &argidx);
5358 if (v == NULL)
5359 goto onError;
5360 if (!PyInt_Check(v)) {
5361 PyErr_SetString(PyExc_TypeError,
5362 "* wants int");
5363 goto onError;
5365 width = PyInt_AsLong(v);
5366 if (width < 0) {
5367 flags |= F_LJUST;
5368 width = -width;
5370 if (--fmtcnt >= 0)
5371 c = *fmt++;
5373 else if (c >= '0' && c <= '9') {
5374 width = c - '0';
5375 while (--fmtcnt >= 0) {
5376 c = *fmt++;
5377 if (c < '0' || c > '9')
5378 break;
5379 if ((width*10) / 10 != width) {
5380 PyErr_SetString(PyExc_ValueError,
5381 "width too big");
5382 goto onError;
5384 width = width*10 + (c - '0');
5387 if (c == '.') {
5388 prec = 0;
5389 if (--fmtcnt >= 0)
5390 c = *fmt++;
5391 if (c == '*') {
5392 v = getnextarg(args, arglen, &argidx);
5393 if (v == NULL)
5394 goto onError;
5395 if (!PyInt_Check(v)) {
5396 PyErr_SetString(PyExc_TypeError,
5397 "* wants int");
5398 goto onError;
5400 prec = PyInt_AsLong(v);
5401 if (prec < 0)
5402 prec = 0;
5403 if (--fmtcnt >= 0)
5404 c = *fmt++;
5406 else if (c >= '0' && c <= '9') {
5407 prec = c - '0';
5408 while (--fmtcnt >= 0) {
5409 c = Py_CHARMASK(*fmt++);
5410 if (c < '0' || c > '9')
5411 break;
5412 if ((prec*10) / 10 != prec) {
5413 PyErr_SetString(PyExc_ValueError,
5414 "prec too big");
5415 goto onError;
5417 prec = prec*10 + (c - '0');
5420 } /* prec */
5421 if (fmtcnt >= 0) {
5422 if (c == 'h' || c == 'l' || c == 'L') {
5423 if (--fmtcnt >= 0)
5424 c = *fmt++;
5427 if (fmtcnt < 0) {
5428 PyErr_SetString(PyExc_ValueError,
5429 "incomplete format");
5430 goto onError;
5432 if (c != '%') {
5433 v = getnextarg(args, arglen, &argidx);
5434 if (v == NULL)
5435 goto onError;
5437 sign = 0;
5438 fill = ' ';
5439 switch (c) {
5441 case '%':
5442 pbuf = formatbuf;
5443 /* presume that buffer length is at least 1 */
5444 pbuf[0] = '%';
5445 len = 1;
5446 break;
5448 case 's':
5449 case 'r':
5450 if (PyUnicode_Check(v) && c == 's') {
5451 temp = v;
5452 Py_INCREF(temp);
5454 else {
5455 PyObject *unicode;
5456 if (c == 's')
5457 temp = PyObject_Str(v);
5458 else
5459 temp = PyObject_Repr(v);
5460 if (temp == NULL)
5461 goto onError;
5462 if (!PyString_Check(temp)) {
5463 /* XXX Note: this should never happen, since
5464 PyObject_Repr() and PyObject_Str() assure
5465 this */
5466 Py_DECREF(temp);
5467 PyErr_SetString(PyExc_TypeError,
5468 "%s argument has non-string str()");
5469 goto onError;
5471 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5472 PyString_GET_SIZE(temp),
5473 NULL,
5474 "strict");
5475 Py_DECREF(temp);
5476 temp = unicode;
5477 if (temp == NULL)
5478 goto onError;
5480 pbuf = PyUnicode_AS_UNICODE(temp);
5481 len = PyUnicode_GET_SIZE(temp);
5482 if (prec >= 0 && len > prec)
5483 len = prec;
5484 break;
5486 case 'i':
5487 case 'd':
5488 case 'u':
5489 case 'o':
5490 case 'x':
5491 case 'X':
5492 if (c == 'i')
5493 c = 'd';
5494 if (PyLong_Check(v)) {
5495 temp = formatlong(v, flags, prec, c);
5496 if (!temp)
5497 goto onError;
5498 pbuf = PyUnicode_AS_UNICODE(temp);
5499 len = PyUnicode_GET_SIZE(temp);
5500 /* unbounded ints can always produce
5501 a sign character! */
5502 sign = 1;
5504 else {
5505 pbuf = formatbuf;
5506 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5507 flags, prec, c, v);
5508 if (len < 0)
5509 goto onError;
5510 /* only d conversion is signed */
5511 sign = c == 'd';
5513 if (flags & F_ZERO)
5514 fill = '0';
5515 break;
5517 case 'e':
5518 case 'E':
5519 case 'f':
5520 case 'g':
5521 case 'G':
5522 pbuf = formatbuf;
5523 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5524 flags, prec, c, v);
5525 if (len < 0)
5526 goto onError;
5527 sign = 1;
5528 if (flags & F_ZERO)
5529 fill = '0';
5530 break;
5532 case 'c':
5533 pbuf = formatbuf;
5534 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5535 if (len < 0)
5536 goto onError;
5537 break;
5539 default:
5540 PyErr_Format(PyExc_ValueError,
5541 "unsupported format character '%c' (0x%x) "
5542 "at index %i",
5543 (31<=c && c<=126) ? (int)c : '?',
5544 (int)c, (fmt -1 - PyUnicode_AS_UNICODE(uformat)));
5545 goto onError;
5547 if (sign) {
5548 if (*pbuf == '-' || *pbuf == '+') {
5549 sign = *pbuf++;
5550 len--;
5552 else if (flags & F_SIGN)
5553 sign = '+';
5554 else if (flags & F_BLANK)
5555 sign = ' ';
5556 else
5557 sign = 0;
5559 if (width < len)
5560 width = len;
5561 if (rescnt < width + (sign != 0)) {
5562 reslen -= rescnt;
5563 rescnt = width + fmtcnt + 100;
5564 reslen += rescnt;
5565 if (_PyUnicode_Resize(&result, reslen) < 0)
5566 return NULL;
5567 res = PyUnicode_AS_UNICODE(result)
5568 + reslen - rescnt;
5570 if (sign) {
5571 if (fill != ' ')
5572 *res++ = sign;
5573 rescnt--;
5574 if (width > len)
5575 width--;
5577 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5578 assert(pbuf[0] == '0');
5579 assert(pbuf[1] == c);
5580 if (fill != ' ') {
5581 *res++ = *pbuf++;
5582 *res++ = *pbuf++;
5584 rescnt -= 2;
5585 width -= 2;
5586 if (width < 0)
5587 width = 0;
5588 len -= 2;
5590 if (width > len && !(flags & F_LJUST)) {
5591 do {
5592 --rescnt;
5593 *res++ = fill;
5594 } while (--width > len);
5596 if (fill == ' ') {
5597 if (sign)
5598 *res++ = sign;
5599 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5600 assert(pbuf[0] == '0');
5601 assert(pbuf[1] == c);
5602 *res++ = *pbuf++;
5603 *res++ = *pbuf++;
5606 Py_UNICODE_COPY(res, pbuf, len);
5607 res += len;
5608 rescnt -= len;
5609 while (--width >= len) {
5610 --rescnt;
5611 *res++ = ' ';
5613 if (dict && (argidx < arglen) && c != '%') {
5614 PyErr_SetString(PyExc_TypeError,
5615 "not all arguments converted");
5616 goto onError;
5618 Py_XDECREF(temp);
5619 } /* '%' */
5620 } /* until end */
5621 if (argidx < arglen && !dict) {
5622 PyErr_SetString(PyExc_TypeError,
5623 "not all arguments converted");
5624 goto onError;
5627 if (args_owned) {
5628 Py_DECREF(args);
5630 Py_DECREF(uformat);
5631 if (_PyUnicode_Resize(&result, reslen - rescnt))
5632 goto onError;
5633 return (PyObject *)result;
5635 onError:
5636 Py_XDECREF(result);
5637 Py_DECREF(uformat);
5638 if (args_owned) {
5639 Py_DECREF(args);
5641 return NULL;
5644 static PyBufferProcs unicode_as_buffer = {
5645 (getreadbufferproc) unicode_buffer_getreadbuf,
5646 (getwritebufferproc) unicode_buffer_getwritebuf,
5647 (getsegcountproc) unicode_buffer_getsegcount,
5648 (getcharbufferproc) unicode_buffer_getcharbuf,
5651 staticforward PyObject *
5652 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5654 static PyObject *
5655 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5657 PyObject *x = NULL;
5658 static char *kwlist[] = {"string", "encoding", "errors", 0};
5659 char *encoding = NULL;
5660 char *errors = NULL;
5662 if (type != &PyUnicode_Type)
5663 return unicode_subtype_new(type, args, kwds);
5664 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5665 kwlist, &x, &encoding, &errors))
5666 return NULL;
5667 if (x == NULL)
5668 return (PyObject *)_PyUnicode_New(0);
5669 if (encoding == NULL && errors == NULL)
5670 return PyObject_Unicode(x);
5671 else
5672 return PyUnicode_FromEncodedObject(x, encoding, errors);
5675 static PyObject *
5676 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5678 PyUnicodeObject *tmp, *pnew;
5679 int n;
5681 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5682 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5683 if (tmp == NULL)
5684 return NULL;
5685 assert(PyUnicode_Check(tmp));
5686 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5687 if (pnew == NULL)
5688 return NULL;
5689 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5690 if (pnew->str == NULL) {
5691 _Py_ForgetReference((PyObject *)pnew);
5692 PyObject_DEL(pnew);
5693 return NULL;
5695 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5696 pnew->length = n;
5697 pnew->hash = tmp->hash;
5698 Py_DECREF(tmp);
5699 return (PyObject *)pnew;
/* Documentation string installed in the tp_doc slot of PyUnicode_Type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5709 PyTypeObject PyUnicode_Type = {
5710 PyObject_HEAD_INIT(&PyType_Type)
5711 0, /* ob_size */
5712 "unicode", /* tp_name */
5713 sizeof(PyUnicodeObject), /* tp_size */
5714 0, /* tp_itemsize */
5715 /* Slots */
5716 (destructor)unicode_dealloc, /* tp_dealloc */
5717 0, /* tp_print */
5718 0, /* tp_getattr */
5719 0, /* tp_setattr */
5720 (cmpfunc) unicode_compare, /* tp_compare */
5721 (reprfunc) unicode_repr, /* tp_repr */
5722 0, /* tp_as_number */
5723 &unicode_as_sequence, /* tp_as_sequence */
5724 0, /* tp_as_mapping */
5725 (hashfunc) unicode_hash, /* tp_hash*/
5726 0, /* tp_call*/
5727 (reprfunc) unicode_str, /* tp_str */
5728 PyObject_GenericGetAttr, /* tp_getattro */
5729 0, /* tp_setattro */
5730 &unicode_as_buffer, /* tp_as_buffer */
5731 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5732 unicode_doc, /* tp_doc */
5733 0, /* tp_traverse */
5734 0, /* tp_clear */
5735 0, /* tp_richcompare */
5736 0, /* tp_weaklistoffset */
5737 0, /* tp_iter */
5738 0, /* tp_iternext */
5739 unicode_methods, /* tp_methods */
5740 0, /* tp_members */
5741 0, /* tp_getset */
5742 0, /* tp_base */
5743 0, /* tp_dict */
5744 0, /* tp_descr_get */
5745 0, /* tp_descr_set */
5746 0, /* tp_dictoffset */
5747 0, /* tp_init */
5748 0, /* tp_alloc */
5749 unicode_new, /* tp_new */
5750 _PyObject_Del, /* tp_free */
5753 /* Initialize the Unicode implementation */
5755 void _PyUnicode_Init(void)
5757 int i;
5759 /* Init the implementation */
5760 unicode_freelist = NULL;
5761 unicode_freelist_size = 0;
5762 unicode_empty = _PyUnicode_New(0);
5763 strcpy(unicode_default_encoding, "ascii");
5764 for (i = 0; i < 256; i++)
5765 unicode_latin1[i] = NULL;
5768 /* Finalize the Unicode implementation */
5770 void
5771 _PyUnicode_Fini(void)
5773 PyUnicodeObject *u;
5774 int i;
5776 Py_XDECREF(unicode_empty);
5777 unicode_empty = NULL;
5779 for (i = 0; i < 256; i++) {
5780 if (unicode_latin1[i]) {
5781 Py_DECREF(unicode_latin1[i]);
5782 unicode_latin1[i] = NULL;
5786 for (u = unicode_freelist; u != NULL;) {
5787 PyUnicodeObject *v = u;
5788 u = *(PyUnicodeObject **)u;
5789 if (v->str)
5790 PyMem_DEL(v->str);
5791 Py_XDECREF(v->defenc);
5792 PyObject_DEL(v);
5794 unicode_freelist = NULL;
5795 unicode_freelist_size = 0;