Merged release21-maint changes.
[python/dscho.git] / Objects / unicodeobject.c
blob8bd1287124c9b99b9cada71f02548f0c2b338245
1 /*
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
26 permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
39 #include "Python.h"
41 #include "unicodeobject.h"
42 #include "ucnhash.h"
44 #ifdef MS_WIN32
45 #include <windows.h>
46 #endif
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects placed on the free list keep their allocated character
   buffer as long as their size is below this limit, which cuts down on
   malloc() traffic for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this feature is experimental. If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is the default. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject *unicode_freelist;
88 static int unicode_freelist_size;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject *unicode_empty;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject *unicode_latin1[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding[100];
106 Py_UNICODE
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
110 return 0x10FFFF;
111 #else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115 #endif
118 /* --- Unicode Object ----------------------------------------------------- */
120 static
121 int unicode_resize(register PyUnicodeObject *unicode,
122 int length)
124 void *oldstr;
126 /* Shortcut if there's nothing much to do. */
127 if (unicode->length == length)
128 goto reset;
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
137 PyErr_SetString(PyExc_SystemError,
138 "can't resize shared unicode objects");
139 return -1;
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
151 unicode->str[length] = 0;
152 unicode->length = length;
154 reset:
155 /* Reset the object caches */
156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
160 unicode->hash = -1;
162 return 0;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
173 static
174 PyUnicodeObject *_PyUnicode_New(int length)
176 register PyUnicodeObject *unicode;
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
187 unicode_freelist = *(PyUnicodeObject **)unicode;
188 unicode_freelist_size--;
189 if (unicode->str) {
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
193 unicode_resize(unicode, length)) {
194 PyMem_DEL(unicode->str);
195 goto onError;
198 else {
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
201 PyObject_INIT(unicode, &PyUnicode_Type);
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
210 if (!unicode->str) {
211 PyErr_NoMemory();
212 goto onError;
214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
217 unicode->defenc = NULL;
218 return unicode;
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
222 PyObject_DEL(unicode);
223 return NULL;
226 static
227 void _PyUnicode_Free(register PyUnicodeObject *unicode)
229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
232 PyMem_DEL(unicode->str);
233 unicode->str = NULL;
234 unicode->length = 0;
236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
240 /* Add to free list */
241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
245 else {
246 PyMem_DEL(unicode->str);
247 Py_XDECREF(unicode->defenc);
248 PyObject_DEL(unicode);
252 int PyUnicode_Resize(PyObject **unicode,
253 int length)
255 register PyUnicodeObject *v;
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
/* Convenience wrapper around PyUnicode_Resize() that takes care of the
   PyObject ** cast. Internal API for use in unicodeobject.c only ! */
#define _PyUnicode_Resize(unicodevar, length)                   \
    PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
294 PyUnicodeObject *unicode;
296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
312 if (!unicode)
313 return NULL;
314 unicode->str[0] = *u;
315 unicode_latin1[*u] = unicode;
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
328 Py_UNICODE_COPY(unicode->str, u, size);
330 return (PyObject *)unicode;
333 #ifdef HAVE_WCHAR_H
335 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
338 PyUnicodeObject *unicode;
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
349 /* Copy the wchar_t data into the new object */
350 #ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352 #else
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
360 #endif
362 return (PyObject *)unicode;
365 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375 #ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377 #else
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
385 #endif
387 return size;
390 #endif
392 PyObject *PyUnicode_FromObject(register PyObject *obj)
394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
397 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
401 const char *s;
402 int len;
403 int owned = 0;
404 PyObject *v;
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
426 if (PyUnicode_Check(obj)) {
427 Py_INCREF(obj);
428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
434 goto done;
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
448 goto onError;
451 /* Convert to Unicode */
452 if (len == 0) {
453 Py_INCREF(unicode_empty);
454 v = (PyObject *)unicode_empty;
456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
459 done:
460 if (owned) {
461 Py_DECREF(obj);
463 return v;
465 onError:
466 if (owned) {
467 Py_DECREF(obj);
469 return NULL;
472 PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
477 PyObject *buffer = NULL, *unicode;
479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
484 return PyUnicode_DecodeUTF8(s, size, errors);
485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
499 "decoder did not return an unicode object (type=%.400s)",
500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
504 Py_DECREF(buffer);
505 return unicode;
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
512 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
517 PyObject *v, *unicode;
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
527 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
531 PyObject *v;
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
544 return PyUnicode_AsUTF8String(unicode);
545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
558 "encoder did not return a string object (type=%.400s)",
559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
563 return v;
565 onError:
566 return NULL;
569 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
570 const char *errors)
572 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
574 if (v)
575 return v;
576 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
577 if (v && errors == NULL)
578 ((PyUnicodeObject *)unicode)->defenc = v;
579 return v;
582 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
584 if (!PyUnicode_Check(unicode)) {
585 PyErr_BadArgument();
586 goto onError;
588 return PyUnicode_AS_UNICODE(unicode);
590 onError:
591 return NULL;
594 int PyUnicode_GetSize(PyObject *unicode)
596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
600 return PyUnicode_GET_SIZE(unicode);
602 onError:
603 return -1;
606 const char *PyUnicode_GetDefaultEncoding(void)
608 return unicode_default_encoding;
611 int PyUnicode_SetDefaultEncoding(const char *encoding)
613 PyObject *v;
615 /* Make sure the encoding is valid. As side effect, this also
616 loads the encoding into the codec registry cache. */
617 v = _PyCodec_Lookup(encoding);
618 if (v == NULL)
619 goto onError;
620 Py_DECREF(v);
621 strncpy(unicode_default_encoding,
622 encoding,
623 sizeof(unicode_default_encoding));
624 return 0;
626 onError:
627 return -1;
630 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Lookup table mapping the first byte of a UTF-8 sequence to the
   length of that sequence. A zero entry marks an illegal prefix byte.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
654 static
655 int utf8_decoding_error(const char **source,
656 Py_UNICODE **dest,
657 const char *errors,
658 const char *details)
660 if ((errors == NULL) ||
661 (strcmp(errors,"strict") == 0)) {
662 PyErr_Format(PyExc_UnicodeError,
663 "UTF-8 decoding error: %.400s",
664 details);
665 return -1;
667 else if (strcmp(errors,"ignore") == 0) {
668 (*source)++;
669 return 0;
671 else if (strcmp(errors,"replace") == 0) {
672 (*source)++;
673 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
674 (*dest)++;
675 return 0;
677 else {
678 PyErr_Format(PyExc_ValueError,
679 "UTF-8 decoding error; unknown error handling code: %.400s",
680 errors);
681 return -1;
685 PyObject *PyUnicode_DecodeUTF8(const char *s,
686 int size,
687 const char *errors)
689 int n;
690 const char *e;
691 PyUnicodeObject *unicode;
692 Py_UNICODE *p;
693 const char *errmsg = "";
695 /* Note: size will always be longer than the resulting Unicode
696 character count */
697 unicode = _PyUnicode_New(size);
698 if (!unicode)
699 return NULL;
700 if (size == 0)
701 return (PyObject *)unicode;
703 /* Unpack UTF-8 encoded data */
704 p = unicode->str;
705 e = s + size;
707 while (s < e) {
708 Py_UCS4 ch = (unsigned char)*s;
710 if (ch < 0x80) {
711 *p++ = (Py_UNICODE)ch;
712 s++;
713 continue;
716 n = utf8_code_length[ch];
718 if (s + n > e) {
719 errmsg = "unexpected end of data";
720 goto utf8Error;
723 switch (n) {
725 case 0:
726 errmsg = "unexpected code byte";
727 goto utf8Error;
729 case 1:
730 errmsg = "internal error";
731 goto utf8Error;
733 case 2:
734 if ((s[1] & 0xc0) != 0x80) {
735 errmsg = "invalid data";
736 goto utf8Error;
738 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
739 if (ch < 0x80) {
740 errmsg = "illegal encoding";
741 goto utf8Error;
743 else
744 *p++ = (Py_UNICODE)ch;
745 break;
747 case 3:
748 if ((s[1] & 0xc0) != 0x80 ||
749 (s[2] & 0xc0) != 0x80) {
750 errmsg = "invalid data";
751 goto utf8Error;
753 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
754 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
755 errmsg = "illegal encoding";
756 goto utf8Error;
758 else
759 *p++ = (Py_UNICODE)ch;
760 break;
762 case 4:
763 if ((s[1] & 0xc0) != 0x80 ||
764 (s[2] & 0xc0) != 0x80 ||
765 (s[3] & 0xc0) != 0x80) {
766 errmsg = "invalid data";
767 goto utf8Error;
769 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
770 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
771 /* validate and convert to UTF-16 */
772 if ((ch < 0x10000) /* minimum value allowed for 4
773 byte encoding */
774 || (ch > 0x10ffff)) /* maximum value allowed for
775 UTF-16 */
777 errmsg = "illegal encoding";
778 goto utf8Error;
780 #ifdef Py_UNICODE_WIDE
781 *p++ = (Py_UNICODE)ch;
782 #else
783 /* compute and append the two surrogates: */
785 /* translate from 10000..10FFFF to 0..FFFF */
786 ch -= 0x10000;
788 /* high surrogate = top 10 bits added to D800 */
789 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
791 /* low surrogate = bottom 10 bits added to DC00 */
792 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
793 #endif
794 break;
796 default:
797 /* Other sizes are only needed for UCS-4 */
798 errmsg = "unsupported Unicode code range";
799 goto utf8Error;
801 s += n;
802 continue;
804 utf8Error:
805 if (utf8_decoding_error(&s, &p, errors, errmsg))
806 goto onError;
809 /* Adjust length */
810 if (_PyUnicode_Resize(&unicode, p - unicode->str))
811 goto onError;
813 return (PyObject *)unicode;
815 onError:
816 Py_DECREF(unicode);
817 return NULL;
820 /* Not used anymore, now that the encoder supports UTF-16
821 surrogates. */
822 #if 0
823 static
824 int utf8_encoding_error(const Py_UNICODE **source,
825 char **dest,
826 const char *errors,
827 const char *details)
829 if ((errors == NULL) ||
830 (strcmp(errors,"strict") == 0)) {
831 PyErr_Format(PyExc_UnicodeError,
832 "UTF-8 encoding error: %.400s",
833 details);
834 return -1;
836 else if (strcmp(errors,"ignore") == 0) {
837 return 0;
839 else if (strcmp(errors,"replace") == 0) {
840 **dest = '?';
841 (*dest)++;
842 return 0;
844 else {
845 PyErr_Format(PyExc_ValueError,
846 "UTF-8 encoding error; "
847 "unknown error handling code: %.400s",
848 errors);
849 return -1;
852 #endif
854 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
855 int size,
856 const char *errors)
858 PyObject *v;
859 char *p;
860 char *q;
861 Py_UCS4 ch2;
862 unsigned int cbAllocated = 3 * size;
863 unsigned int cbWritten = 0;
864 int i = 0;
866 v = PyString_FromStringAndSize(NULL, cbAllocated);
867 if (v == NULL)
868 return NULL;
869 if (size == 0)
870 return v;
872 p = q = PyString_AS_STRING(v);
873 while (i < size) {
874 Py_UCS4 ch = s[i++];
875 if (ch < 0x80) {
876 *p++ = (char) ch;
877 cbWritten++;
879 else if (ch < 0x0800) {
880 *p++ = 0xc0 | (ch >> 6);
881 *p++ = 0x80 | (ch & 0x3f);
882 cbWritten += 2;
884 else if (ch < 0x10000) {
885 /* Check for high surrogate */
886 if (0xD800 <= ch && ch <= 0xDBFF) {
887 if (i != size) {
888 ch2 = s[i];
889 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
891 if (cbWritten >= (cbAllocated - 4)) {
892 /* Provide enough room for some more
893 surrogates */
894 cbAllocated += 4*10;
895 if (_PyString_Resize(&v, cbAllocated))
896 goto onError;
899 /* combine the two values */
900 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
902 *p++ = (char)((ch >> 18) | 0xf0);
903 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
904 i++;
905 cbWritten += 4;
909 else {
910 *p++ = (char)(0xe0 | (ch >> 12));
911 cbWritten += 3;
913 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
914 *p++ = (char)(0x80 | (ch & 0x3f));
915 } else {
916 *p++ = 0xf0 | (ch>>18);
917 *p++ = 0x80 | ((ch>>12) & 0x3f);
918 *p++ = 0x80 | ((ch>>6) & 0x3f);
919 *p++ = 0x80 | (ch & 0x3f);
920 cbWritten += 4;
923 *p = '\0';
924 if (_PyString_Resize(&v, p - q))
925 goto onError;
926 return v;
928 onError:
929 Py_DECREF(v);
930 return NULL;
933 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
935 if (!PyUnicode_Check(unicode)) {
936 PyErr_BadArgument();
937 return NULL;
939 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
940 PyUnicode_GET_SIZE(unicode),
941 NULL);
944 /* --- UTF-16 Codec ------------------------------------------------------- */
946 static
947 int utf16_decoding_error(Py_UNICODE **dest,
948 const char *errors,
949 const char *details)
951 if ((errors == NULL) ||
952 (strcmp(errors,"strict") == 0)) {
953 PyErr_Format(PyExc_UnicodeError,
954 "UTF-16 decoding error: %.400s",
955 details);
956 return -1;
958 else if (strcmp(errors,"ignore") == 0) {
959 return 0;
961 else if (strcmp(errors,"replace") == 0) {
962 if (dest) {
963 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
964 (*dest)++;
966 return 0;
968 else {
969 PyErr_Format(PyExc_ValueError,
970 "UTF-16 decoding error; "
971 "unknown error handling code: %.400s",
972 errors);
973 return -1;
977 PyObject *
978 PyUnicode_DecodeUTF16(const char *s,
979 int size,
980 const char *errors,
981 int *byteorder)
983 PyUnicodeObject *unicode;
984 Py_UNICODE *p;
985 const unsigned char *q, *e;
986 int bo = 0; /* assume native ordering by default */
987 const char *errmsg = "";
988 /* Offsets from q for retrieving byte pairs in the right order. */
989 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
990 int ihi = 1, ilo = 0;
991 #else
992 int ihi = 0, ilo = 1;
993 #endif
995 /* size should be an even number */
996 if (size & 1) {
997 if (utf16_decoding_error(NULL, errors, "truncated data"))
998 return NULL;
999 --size; /* else ignore the oddball byte */
1002 /* Note: size will always be longer than the resulting Unicode
1003 character count */
1004 unicode = _PyUnicode_New(size);
1005 if (!unicode)
1006 return NULL;
1007 if (size == 0)
1008 return (PyObject *)unicode;
1010 /* Unpack UTF-16 encoded data */
1011 p = unicode->str;
1012 q = (unsigned char *)s;
1013 e = q + size;
1015 if (byteorder)
1016 bo = *byteorder;
1018 /* Check for BOM marks (U+FEFF) in the input and adjust current
1019 byte order setting accordingly. In native mode, the leading BOM
1020 mark is skipped, in all other modes, it is copied to the output
1021 stream as-is (giving a ZWNBSP character). */
1022 if (bo == 0) {
1023 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1024 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1025 if (bom == 0xFEFF) {
1026 q += 2;
1027 bo = -1;
1029 else if (bom == 0xFFFE) {
1030 q += 2;
1031 bo = 1;
1033 #else
1034 if (bom == 0xFEFF) {
1035 q += 2;
1036 bo = 1;
1038 else if (bom == 0xFFFE) {
1039 q += 2;
1040 bo = -1;
1042 #endif
1045 if (bo == -1) {
1046 /* force LE */
1047 ihi = 1;
1048 ilo = 0;
1050 else if (bo == 1) {
1051 /* force BE */
1052 ihi = 0;
1053 ilo = 1;
1056 while (q < e) {
1057 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1058 q += 2;
1060 if (ch < 0xD800 || ch > 0xDFFF) {
1061 *p++ = ch;
1062 continue;
1065 /* UTF-16 code pair: */
1066 if (q >= e) {
1067 errmsg = "unexpected end of data";
1068 goto utf16Error;
1070 if (0xD800 <= ch && ch <= 0xDBFF) {
1071 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1072 q += 2;
1073 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1074 #ifndef Py_UNICODE_WIDE
1075 *p++ = ch;
1076 *p++ = ch2;
1077 #else
1078 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1079 #endif
1080 continue;
1082 else {
1083 errmsg = "illegal UTF-16 surrogate";
1084 goto utf16Error;
1088 errmsg = "illegal encoding";
1089 /* Fall through to report the error */
1091 utf16Error:
1092 if (utf16_decoding_error(&p, errors, errmsg))
1093 goto onError;
1096 if (byteorder)
1097 *byteorder = bo;
1099 /* Adjust length */
1100 if (_PyUnicode_Resize(&unicode, p - unicode->str))
1101 goto onError;
1103 return (PyObject *)unicode;
1105 onError:
1106 Py_DECREF(unicode);
1107 return NULL;
1110 PyObject *
1111 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1112 int size,
1113 const char *errors,
1114 int byteorder)
1116 PyObject *v;
1117 unsigned char *p;
1118 int i, pairs;
1119 /* Offsets from p for storing byte pairs in the right order. */
1120 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1121 int ihi = 1, ilo = 0;
1122 #else
1123 int ihi = 0, ilo = 1;
1124 #endif
1126 #define STORECHAR(CH) \
1127 do { \
1128 p[ihi] = ((CH) >> 8) & 0xff; \
1129 p[ilo] = (CH) & 0xff; \
1130 p += 2; \
1131 } while(0)
1133 for (i = pairs = 0; i < size; i++)
1134 if (s[i] >= 0x10000)
1135 pairs++;
1136 v = PyString_FromStringAndSize(NULL,
1137 2 * (size + pairs + (byteorder == 0)));
1138 if (v == NULL)
1139 return NULL;
1141 p = (unsigned char *)PyString_AS_STRING(v);
1142 if (byteorder == 0)
1143 STORECHAR(0xFEFF);
1144 if (size == 0)
1145 return v;
1147 if (byteorder == -1) {
1148 /* force LE */
1149 ihi = 1;
1150 ilo = 0;
1152 else if (byteorder == 1) {
1153 /* force BE */
1154 ihi = 0;
1155 ilo = 1;
1158 while (size-- > 0) {
1159 Py_UNICODE ch = *s++;
1160 Py_UNICODE ch2 = 0;
1161 if (ch >= 0x10000) {
1162 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1163 ch = 0xD800 | ((ch-0x10000) >> 10);
1165 STORECHAR(ch);
1166 if (ch2)
1167 STORECHAR(ch2);
1169 return v;
1170 #undef STORECHAR
1173 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 return NULL;
1179 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1180 PyUnicode_GET_SIZE(unicode),
1181 NULL,
1185 /* --- Unicode Escape Codec ----------------------------------------------- */
1187 static
1188 int unicodeescape_decoding_error(const char **source,
1189 Py_UNICODE *x,
1190 const char *errors,
1191 const char *details)
1193 if ((errors == NULL) ||
1194 (strcmp(errors,"strict") == 0)) {
1195 PyErr_Format(PyExc_UnicodeError,
1196 "Unicode-Escape decoding error: %.400s",
1197 details);
1198 return -1;
1200 else if (strcmp(errors,"ignore") == 0) {
1201 return 0;
1203 else if (strcmp(errors,"replace") == 0) {
1204 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1205 return 0;
1207 else {
1208 PyErr_Format(PyExc_ValueError,
1209 "Unicode-Escape decoding error; "
1210 "unknown error handling code: %.400s",
1211 errors);
1212 return -1;
1216 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1218 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1219 int size,
1220 const char *errors)
1222 PyUnicodeObject *v;
1223 Py_UNICODE *p, *buf;
1224 const char *end;
1225 char* message;
1226 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1228 /* Escaped strings will always be longer than the resulting
1229 Unicode string, so we start with size here and then reduce the
1230 length after conversion to the true value. */
1231 v = _PyUnicode_New(size);
1232 if (v == NULL)
1233 goto onError;
1234 if (size == 0)
1235 return (PyObject *)v;
1237 p = buf = PyUnicode_AS_UNICODE(v);
1238 end = s + size;
1240 while (s < end) {
1241 unsigned char c;
1242 Py_UNICODE x;
1243 int i, digits;
1245 /* Non-escape characters are interpreted as Unicode ordinals */
1246 if (*s != '\\') {
1247 *p++ = (unsigned char) *s++;
1248 continue;
1251 /* \ - Escapes */
1252 s++;
1253 switch (*s++) {
1255 /* \x escapes */
1256 case '\n': break;
1257 case '\\': *p++ = '\\'; break;
1258 case '\'': *p++ = '\''; break;
1259 case '\"': *p++ = '\"'; break;
1260 case 'b': *p++ = '\b'; break;
1261 case 'f': *p++ = '\014'; break; /* FF */
1262 case 't': *p++ = '\t'; break;
1263 case 'n': *p++ = '\n'; break;
1264 case 'r': *p++ = '\r'; break;
1265 case 'v': *p++ = '\013'; break; /* VT */
1266 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1268 /* \OOO (octal) escapes */
1269 case '0': case '1': case '2': case '3':
1270 case '4': case '5': case '6': case '7':
1271 x = s[-1] - '0';
1272 if ('0' <= *s && *s <= '7') {
1273 x = (x<<3) + *s++ - '0';
1274 if ('0' <= *s && *s <= '7')
1275 x = (x<<3) + *s++ - '0';
1277 *p++ = x;
1278 break;
1280 /* hex escapes */
1281 /* \xXX */
1282 case 'x':
1283 digits = 2;
1284 message = "truncated \\xXX escape";
1285 goto hexescape;
1287 /* \uXXXX */
1288 case 'u':
1289 digits = 4;
1290 message = "truncated \\uXXXX escape";
1291 goto hexescape;
1293 /* \UXXXXXXXX */
1294 case 'U':
1295 digits = 8;
1296 message = "truncated \\UXXXXXXXX escape";
1297 hexescape:
1298 chr = 0;
1299 for (i = 0; i < digits; i++) {
1300 c = (unsigned char) s[i];
1301 if (!isxdigit(c)) {
1302 if (unicodeescape_decoding_error(&s, &x, errors, message))
1303 goto onError;
1304 chr = x;
1305 i++;
1306 break;
1308 chr = (chr<<4) & ~0xF;
1309 if (c >= '0' && c <= '9')
1310 chr += c - '0';
1311 else if (c >= 'a' && c <= 'f')
1312 chr += 10 + c - 'a';
1313 else
1314 chr += 10 + c - 'A';
1316 s += i;
1317 store:
1318 /* when we get here, chr is a 32-bit unicode character */
1319 if (chr <= 0xffff)
1320 /* UCS-2 character */
1321 *p++ = (Py_UNICODE) chr;
1322 else if (chr <= 0x10ffff) {
1323 /* UCS-4 character. Either store directly, or as
1324 surrogate pair. */
1325 #ifdef Py_UNICODE_WIDE
1326 *p++ = chr;
1327 #else
1328 chr -= 0x10000L;
1329 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1330 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1331 #endif
1332 } else {
1333 if (unicodeescape_decoding_error(
1334 &s, &x, errors,
1335 "illegal Unicode character")
1337 goto onError;
1338 *p++ = x; /* store replacement character */
1340 break;
1342 /* \N{name} */
1343 case 'N':
1344 message = "malformed \\N character escape";
1345 if (ucnhash_CAPI == NULL) {
1346 /* load the unicode data module */
1347 PyObject *m, *v;
1348 m = PyImport_ImportModule("unicodedata");
1349 if (m == NULL)
1350 goto ucnhashError;
1351 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1352 Py_DECREF(m);
1353 if (v == NULL)
1354 goto ucnhashError;
1355 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1356 Py_DECREF(v);
1357 if (ucnhash_CAPI == NULL)
1358 goto ucnhashError;
1360 if (*s == '{') {
1361 const char *start = s+1;
1362 /* look for the closing brace */
1363 while (*s != '}' && s < end)
1364 s++;
1365 if (s > start && s < end && *s == '}') {
1366 /* found a name. look it up in the unicode database */
1367 message = "unknown Unicode character name";
1368 s++;
1369 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1370 goto store;
1373 if (unicodeescape_decoding_error(&s, &x, errors, message))
1374 goto onError;
1375 *p++ = x;
1376 break;
1378 default:
1379 *p++ = '\\';
1380 *p++ = (unsigned char)s[-1];
1381 break;
1384 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1385 goto onError;
1386 return (PyObject *)v;
1388 ucnhashError:
1389 PyErr_SetString(
1390 PyExc_UnicodeError,
1391 "\\N escapes not supported (can't load unicodedata module)"
1393 return NULL;
1395 onError:
1396 Py_XDECREF(v);
1397 return NULL;
1400 /* Return a Unicode-Escape string version of the Unicode object.
1402 If quotes is true, the string is enclosed in u"" or u'' quotes as
1403 appropriate.
1407 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1408 int size,
1409 Py_UNICODE ch);
1411 static
1412 PyObject *unicodeescape_string(const Py_UNICODE *s,
1413 int size,
1414 int quotes)
1416 PyObject *repr;
1417 char *p;
1419 static const char *hexdigit = "0123456789abcdef";
1421 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1422 if (repr == NULL)
1423 return NULL;
1425 p = PyString_AS_STRING(repr);
1427 if (quotes) {
1428 *p++ = 'u';
1429 *p++ = (findchar(s, size, '\'') &&
1430 !findchar(s, size, '"')) ? '"' : '\'';
1432 while (size-- > 0) {
1433 Py_UNICODE ch = *s++;
1435 /* Escape quotes */
1436 if (quotes &&
1437 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1438 *p++ = '\\';
1439 *p++ = (char) ch;
1442 #ifdef Py_UNICODE_WIDE
1443 /* Map 21-bit characters to '\U00xxxxxx' */
1444 else if (ch >= 0x10000) {
1445 int offset = p - PyString_AS_STRING(repr);
1447 /* Resize the string if necessary */
1448 if (offset + 12 > PyString_GET_SIZE(repr)) {
1449 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1450 goto onError;
1451 p = PyString_AS_STRING(repr) + offset;
1454 *p++ = '\\';
1455 *p++ = 'U';
1456 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1457 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1458 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1459 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1460 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1461 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1462 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1463 *p++ = hexdigit[ch & 0x0000000F];
1464 continue;
1466 #endif
1467 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1468 else if (ch >= 0xD800 && ch < 0xDC00) {
1469 Py_UNICODE ch2;
1470 Py_UCS4 ucs;
1472 ch2 = *s++;
1473 size--;
1474 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1475 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1476 *p++ = '\\';
1477 *p++ = 'U';
1478 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1479 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1480 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1481 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1482 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1483 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1484 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1485 *p++ = hexdigit[ucs & 0x0000000F];
1486 continue;
1488 /* Fall through: isolated surrogates are copied as-is */
1489 s--;
1490 size++;
1493 /* Map 16-bit characters to '\uxxxx' */
1494 if (ch >= 256) {
1495 *p++ = '\\';
1496 *p++ = 'u';
1497 *p++ = hexdigit[(ch >> 12) & 0x000F];
1498 *p++ = hexdigit[(ch >> 8) & 0x000F];
1499 *p++ = hexdigit[(ch >> 4) & 0x000F];
1500 *p++ = hexdigit[ch & 0x000F];
1503 /* Map special whitespace to '\t', \n', '\r' */
1504 else if (ch == '\t') {
1505 *p++ = '\\';
1506 *p++ = 't';
1508 else if (ch == '\n') {
1509 *p++ = '\\';
1510 *p++ = 'n';
1512 else if (ch == '\r') {
1513 *p++ = '\\';
1514 *p++ = 'r';
1517 /* Map non-printable US ASCII to '\xhh' */
1518 else if (ch < ' ' || ch >= 128) {
1519 *p++ = '\\';
1520 *p++ = 'x';
1521 *p++ = hexdigit[(ch >> 4) & 0x000F];
1522 *p++ = hexdigit[ch & 0x000F];
1525 /* Copy everything else as-is */
1526 else
1527 *p++ = (char) ch;
1529 if (quotes)
1530 *p++ = PyString_AS_STRING(repr)[1];
1532 *p = '\0';
1533 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1534 goto onError;
1536 return repr;
1538 onError:
1539 Py_DECREF(repr);
1540 return NULL;
1543 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1544 int size)
1546 return unicodeescape_string(s, size, 0);
1549 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1551 if (!PyUnicode_Check(unicode)) {
1552 PyErr_BadArgument();
1553 return NULL;
1555 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1556 PyUnicode_GET_SIZE(unicode));
1559 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1561 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1562 int size,
1563 const char *errors)
1565 PyUnicodeObject *v;
1566 Py_UNICODE *p, *buf;
1567 const char *end;
1568 const char *bs;
1570 /* Escaped strings will always be longer than the resulting
1571 Unicode string, so we start with size here and then reduce the
1572 length after conversion to the true value. */
1573 v = _PyUnicode_New(size);
1574 if (v == NULL)
1575 goto onError;
1576 if (size == 0)
1577 return (PyObject *)v;
1578 p = buf = PyUnicode_AS_UNICODE(v);
1579 end = s + size;
1580 while (s < end) {
1581 unsigned char c;
1582 Py_UNICODE x;
1583 int i;
1585 /* Non-escape characters are interpreted as Unicode ordinals */
1586 if (*s != '\\') {
1587 *p++ = (unsigned char)*s++;
1588 continue;
1591 /* \u-escapes are only interpreted iff the number of leading
1592 backslashes if odd */
1593 bs = s;
1594 for (;s < end;) {
1595 if (*s != '\\')
1596 break;
1597 *p++ = (unsigned char)*s++;
1599 if (((s - bs) & 1) == 0 ||
1600 s >= end ||
1601 *s != 'u') {
1602 continue;
1604 p--;
1605 s++;
1607 /* \uXXXX with 4 hex digits */
1608 for (x = 0, i = 0; i < 4; i++) {
1609 c = (unsigned char)s[i];
1610 if (!isxdigit(c)) {
1611 if (unicodeescape_decoding_error(&s, &x, errors,
1612 "truncated \\uXXXX"))
1613 goto onError;
1614 i++;
1615 break;
1617 x = (x<<4) & ~0xF;
1618 if (c >= '0' && c <= '9')
1619 x += c - '0';
1620 else if (c >= 'a' && c <= 'f')
1621 x += 10 + c - 'a';
1622 else
1623 x += 10 + c - 'A';
1625 s += i;
1626 *p++ = x;
1628 if (_PyUnicode_Resize(&v, (int)(p - buf)))
1629 goto onError;
1630 return (PyObject *)v;
1632 onError:
1633 Py_XDECREF(v);
1634 return NULL;
1637 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1638 int size)
1640 PyObject *repr;
1641 char *p;
1642 char *q;
1644 static const char *hexdigit = "0123456789abcdef";
1646 repr = PyString_FromStringAndSize(NULL, 6 * size);
1647 if (repr == NULL)
1648 return NULL;
1649 if (size == 0)
1650 return repr;
1652 p = q = PyString_AS_STRING(repr);
1653 while (size-- > 0) {
1654 Py_UNICODE ch = *s++;
1655 /* Map 16-bit characters to '\uxxxx' */
1656 if (ch >= 256) {
1657 *p++ = '\\';
1658 *p++ = 'u';
1659 *p++ = hexdigit[(ch >> 12) & 0xf];
1660 *p++ = hexdigit[(ch >> 8) & 0xf];
1661 *p++ = hexdigit[(ch >> 4) & 0xf];
1662 *p++ = hexdigit[ch & 15];
1664 /* Copy everything else as-is */
1665 else
1666 *p++ = (char) ch;
1668 *p = '\0';
1669 if (_PyString_Resize(&repr, p - q))
1670 goto onError;
1672 return repr;
1674 onError:
1675 Py_DECREF(repr);
1676 return NULL;
1679 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1681 if (!PyUnicode_Check(unicode)) {
1682 PyErr_BadArgument();
1683 return NULL;
1685 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1686 PyUnicode_GET_SIZE(unicode));
1689 /* --- Latin-1 Codec ------------------------------------------------------ */
1691 PyObject *PyUnicode_DecodeLatin1(const char *s,
1692 int size,
1693 const char *errors)
1695 PyUnicodeObject *v;
1696 Py_UNICODE *p;
1698 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1699 if (size == 1 && *(unsigned char*)s < 256) {
1700 Py_UNICODE r = *(unsigned char*)s;
1701 return PyUnicode_FromUnicode(&r, 1);
1704 v = _PyUnicode_New(size);
1705 if (v == NULL)
1706 goto onError;
1707 if (size == 0)
1708 return (PyObject *)v;
1709 p = PyUnicode_AS_UNICODE(v);
1710 while (size-- > 0)
1711 *p++ = (unsigned char)*s++;
1712 return (PyObject *)v;
1714 onError:
1715 Py_XDECREF(v);
1716 return NULL;
1719 static
1720 int latin1_encoding_error(const Py_UNICODE **source,
1721 char **dest,
1722 const char *errors,
1723 const char *details)
1725 if ((errors == NULL) ||
1726 (strcmp(errors,"strict") == 0)) {
1727 PyErr_Format(PyExc_UnicodeError,
1728 "Latin-1 encoding error: %.400s",
1729 details);
1730 return -1;
1732 else if (strcmp(errors,"ignore") == 0) {
1733 return 0;
1735 else if (strcmp(errors,"replace") == 0) {
1736 **dest = '?';
1737 (*dest)++;
1738 return 0;
1740 else {
1741 PyErr_Format(PyExc_ValueError,
1742 "Latin-1 encoding error; "
1743 "unknown error handling code: %.400s",
1744 errors);
1745 return -1;
1749 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1750 int size,
1751 const char *errors)
1753 PyObject *repr;
1754 char *s, *start;
1756 repr = PyString_FromStringAndSize(NULL, size);
1757 if (repr == NULL)
1758 return NULL;
1759 if (size == 0)
1760 return repr;
1762 s = PyString_AS_STRING(repr);
1763 start = s;
1764 while (size-- > 0) {
1765 Py_UNICODE ch = *p++;
1766 if (ch >= 256) {
1767 if (latin1_encoding_error(&p, &s, errors,
1768 "ordinal not in range(256)"))
1769 goto onError;
1771 else
1772 *s++ = (char)ch;
1774 /* Resize if error handling skipped some characters */
1775 if (s - start < PyString_GET_SIZE(repr))
1776 if (_PyString_Resize(&repr, s - start))
1777 goto onError;
1778 return repr;
1780 onError:
1781 Py_DECREF(repr);
1782 return NULL;
1785 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1787 if (!PyUnicode_Check(unicode)) {
1788 PyErr_BadArgument();
1789 return NULL;
1791 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1792 PyUnicode_GET_SIZE(unicode),
1793 NULL);
1796 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1798 static
1799 int ascii_decoding_error(const char **source,
1800 Py_UNICODE **dest,
1801 const char *errors,
1802 const char *details)
1804 if ((errors == NULL) ||
1805 (strcmp(errors,"strict") == 0)) {
1806 PyErr_Format(PyExc_UnicodeError,
1807 "ASCII decoding error: %.400s",
1808 details);
1809 return -1;
1811 else if (strcmp(errors,"ignore") == 0) {
1812 return 0;
1814 else if (strcmp(errors,"replace") == 0) {
1815 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1816 (*dest)++;
1817 return 0;
1819 else {
1820 PyErr_Format(PyExc_ValueError,
1821 "ASCII decoding error; "
1822 "unknown error handling code: %.400s",
1823 errors);
1824 return -1;
1828 PyObject *PyUnicode_DecodeASCII(const char *s,
1829 int size,
1830 const char *errors)
1832 PyUnicodeObject *v;
1833 Py_UNICODE *p;
1835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1836 if (size == 1 && *(unsigned char*)s < 128) {
1837 Py_UNICODE r = *(unsigned char*)s;
1838 return PyUnicode_FromUnicode(&r, 1);
1841 v = _PyUnicode_New(size);
1842 if (v == NULL)
1843 goto onError;
1844 if (size == 0)
1845 return (PyObject *)v;
1846 p = PyUnicode_AS_UNICODE(v);
1847 while (size-- > 0) {
1848 register unsigned char c;
1850 c = (unsigned char)*s++;
1851 if (c < 128)
1852 *p++ = c;
1853 else if (ascii_decoding_error(&s, &p, errors,
1854 "ordinal not in range(128)"))
1855 goto onError;
1857 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1858 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1859 goto onError;
1860 return (PyObject *)v;
1862 onError:
1863 Py_XDECREF(v);
1864 return NULL;
1867 static
1868 int ascii_encoding_error(const Py_UNICODE **source,
1869 char **dest,
1870 const char *errors,
1871 const char *details)
1873 if ((errors == NULL) ||
1874 (strcmp(errors,"strict") == 0)) {
1875 PyErr_Format(PyExc_UnicodeError,
1876 "ASCII encoding error: %.400s",
1877 details);
1878 return -1;
1880 else if (strcmp(errors,"ignore") == 0) {
1881 return 0;
1883 else if (strcmp(errors,"replace") == 0) {
1884 **dest = '?';
1885 (*dest)++;
1886 return 0;
1888 else {
1889 PyErr_Format(PyExc_ValueError,
1890 "ASCII encoding error; "
1891 "unknown error handling code: %.400s",
1892 errors);
1893 return -1;
1897 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1898 int size,
1899 const char *errors)
1901 PyObject *repr;
1902 char *s, *start;
1904 repr = PyString_FromStringAndSize(NULL, size);
1905 if (repr == NULL)
1906 return NULL;
1907 if (size == 0)
1908 return repr;
1910 s = PyString_AS_STRING(repr);
1911 start = s;
1912 while (size-- > 0) {
1913 Py_UNICODE ch = *p++;
1914 if (ch >= 128) {
1915 if (ascii_encoding_error(&p, &s, errors,
1916 "ordinal not in range(128)"))
1917 goto onError;
1919 else
1920 *s++ = (char)ch;
1922 /* Resize if error handling skipped some characters */
1923 if (s - start < PyString_GET_SIZE(repr))
1924 if (_PyString_Resize(&repr, s - start))
1925 goto onError;
1926 return repr;
1928 onError:
1929 Py_DECREF(repr);
1930 return NULL;
1933 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1935 if (!PyUnicode_Check(unicode)) {
1936 PyErr_BadArgument();
1937 return NULL;
1939 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1940 PyUnicode_GET_SIZE(unicode),
1941 NULL);
1944 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
1946 /* --- MBCS codecs for Windows -------------------------------------------- */
1948 PyObject *PyUnicode_DecodeMBCS(const char *s,
1949 int size,
1950 const char *errors)
1952 PyUnicodeObject *v;
1953 Py_UNICODE *p;
1955 /* First get the size of the result */
1956 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1957 if (size > 0 && usize==0)
1958 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1960 v = _PyUnicode_New(usize);
1961 if (v == NULL)
1962 return NULL;
1963 if (usize == 0)
1964 return (PyObject *)v;
1965 p = PyUnicode_AS_UNICODE(v);
1966 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1967 Py_DECREF(v);
1968 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1971 return (PyObject *)v;
1974 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1975 int size,
1976 const char *errors)
1978 PyObject *repr;
1979 char *s;
1980 DWORD mbcssize;
1982 /* If there are no characters, bail now! */
1983 if (size==0)
1984 return PyString_FromString("");
1986 /* First get the size of the result */
1987 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1988 if (mbcssize==0)
1989 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1991 repr = PyString_FromStringAndSize(NULL, mbcssize);
1992 if (repr == NULL)
1993 return NULL;
1994 if (mbcssize == 0)
1995 return repr;
1997 /* Do the conversion */
1998 s = PyString_AS_STRING(repr);
1999 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2000 Py_DECREF(repr);
2001 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2003 return repr;
2006 #endif /* MS_WIN32 */
2008 /* --- Character Mapping Codec -------------------------------------------- */
2010 static
2011 int charmap_decoding_error(const char **source,
2012 Py_UNICODE **dest,
2013 const char *errors,
2014 const char *details)
2016 if ((errors == NULL) ||
2017 (strcmp(errors,"strict") == 0)) {
2018 PyErr_Format(PyExc_UnicodeError,
2019 "charmap decoding error: %.400s",
2020 details);
2021 return -1;
2023 else if (strcmp(errors,"ignore") == 0) {
2024 return 0;
2026 else if (strcmp(errors,"replace") == 0) {
2027 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2028 (*dest)++;
2029 return 0;
2031 else {
2032 PyErr_Format(PyExc_ValueError,
2033 "charmap decoding error; "
2034 "unknown error handling code: %.400s",
2035 errors);
2036 return -1;
2040 PyObject *PyUnicode_DecodeCharmap(const char *s,
2041 int size,
2042 PyObject *mapping,
2043 const char *errors)
2045 PyUnicodeObject *v;
2046 Py_UNICODE *p;
2047 int extrachars = 0;
2049 /* Default to Latin-1 */
2050 if (mapping == NULL)
2051 return PyUnicode_DecodeLatin1(s, size, errors);
2053 v = _PyUnicode_New(size);
2054 if (v == NULL)
2055 goto onError;
2056 if (size == 0)
2057 return (PyObject *)v;
2058 p = PyUnicode_AS_UNICODE(v);
2059 while (size-- > 0) {
2060 unsigned char ch = *s++;
2061 PyObject *w, *x;
2063 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2064 w = PyInt_FromLong((long)ch);
2065 if (w == NULL)
2066 goto onError;
2067 x = PyObject_GetItem(mapping, w);
2068 Py_DECREF(w);
2069 if (x == NULL) {
2070 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2071 /* No mapping found means: mapping is undefined. */
2072 PyErr_Clear();
2073 x = Py_None;
2074 Py_INCREF(x);
2075 } else
2076 goto onError;
2079 /* Apply mapping */
2080 if (PyInt_Check(x)) {
2081 long value = PyInt_AS_LONG(x);
2082 if (value < 0 || value > 65535) {
2083 PyErr_SetString(PyExc_TypeError,
2084 "character mapping must be in range(65536)");
2085 Py_DECREF(x);
2086 goto onError;
2088 *p++ = (Py_UNICODE)value;
2090 else if (x == Py_None) {
2091 /* undefined mapping */
2092 if (charmap_decoding_error(&s, &p, errors,
2093 "character maps to <undefined>")) {
2094 Py_DECREF(x);
2095 goto onError;
2098 else if (PyUnicode_Check(x)) {
2099 int targetsize = PyUnicode_GET_SIZE(x);
2101 if (targetsize == 1)
2102 /* 1-1 mapping */
2103 *p++ = *PyUnicode_AS_UNICODE(x);
2105 else if (targetsize > 1) {
2106 /* 1-n mapping */
2107 if (targetsize > extrachars) {
2108 /* resize first */
2109 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2110 int needed = (targetsize - extrachars) + \
2111 (targetsize << 2);
2112 extrachars += needed;
2113 if (_PyUnicode_Resize(&v,
2114 PyUnicode_GET_SIZE(v) + needed)) {
2115 Py_DECREF(x);
2116 goto onError;
2118 p = PyUnicode_AS_UNICODE(v) + oldpos;
2120 Py_UNICODE_COPY(p,
2121 PyUnicode_AS_UNICODE(x),
2122 targetsize);
2123 p += targetsize;
2124 extrachars -= targetsize;
2126 /* 1-0 mapping: skip the character */
2128 else {
2129 /* wrong return value */
2130 PyErr_SetString(PyExc_TypeError,
2131 "character mapping must return integer, None or unicode");
2132 Py_DECREF(x);
2133 goto onError;
2135 Py_DECREF(x);
2137 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2138 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2139 goto onError;
2140 return (PyObject *)v;
2142 onError:
2143 Py_XDECREF(v);
2144 return NULL;
2147 static
2148 int charmap_encoding_error(const Py_UNICODE **source,
2149 char **dest,
2150 const char *errors,
2151 const char *details)
2153 if ((errors == NULL) ||
2154 (strcmp(errors,"strict") == 0)) {
2155 PyErr_Format(PyExc_UnicodeError,
2156 "charmap encoding error: %.400s",
2157 details);
2158 return -1;
2160 else if (strcmp(errors,"ignore") == 0) {
2161 return 0;
2163 else if (strcmp(errors,"replace") == 0) {
2164 **dest = '?';
2165 (*dest)++;
2166 return 0;
2168 else {
2169 PyErr_Format(PyExc_ValueError,
2170 "charmap encoding error; "
2171 "unknown error handling code: %.400s",
2172 errors);
2173 return -1;
2177 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2178 int size,
2179 PyObject *mapping,
2180 const char *errors)
2182 PyObject *v;
2183 char *s;
2184 int extrachars = 0;
2186 /* Default to Latin-1 */
2187 if (mapping == NULL)
2188 return PyUnicode_EncodeLatin1(p, size, errors);
2190 v = PyString_FromStringAndSize(NULL, size);
2191 if (v == NULL)
2192 return NULL;
2193 if (size == 0)
2194 return v;
2195 s = PyString_AS_STRING(v);
2196 while (size-- > 0) {
2197 Py_UNICODE ch = *p++;
2198 PyObject *w, *x;
2200 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2201 w = PyInt_FromLong((long)ch);
2202 if (w == NULL)
2203 goto onError;
2204 x = PyObject_GetItem(mapping, w);
2205 Py_DECREF(w);
2206 if (x == NULL) {
2207 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2208 /* No mapping found means: mapping is undefined. */
2209 PyErr_Clear();
2210 x = Py_None;
2211 Py_INCREF(x);
2212 } else
2213 goto onError;
2216 /* Apply mapping */
2217 if (PyInt_Check(x)) {
2218 long value = PyInt_AS_LONG(x);
2219 if (value < 0 || value > 255) {
2220 PyErr_SetString(PyExc_TypeError,
2221 "character mapping must be in range(256)");
2222 Py_DECREF(x);
2223 goto onError;
2225 *s++ = (char)value;
2227 else if (x == Py_None) {
2228 /* undefined mapping */
2229 if (charmap_encoding_error(&p, &s, errors,
2230 "character maps to <undefined>")) {
2231 Py_DECREF(x);
2232 goto onError;
2235 else if (PyString_Check(x)) {
2236 int targetsize = PyString_GET_SIZE(x);
2238 if (targetsize == 1)
2239 /* 1-1 mapping */
2240 *s++ = *PyString_AS_STRING(x);
2242 else if (targetsize > 1) {
2243 /* 1-n mapping */
2244 if (targetsize > extrachars) {
2245 /* resize first */
2246 int oldpos = (int)(s - PyString_AS_STRING(v));
2247 int needed = (targetsize - extrachars) + \
2248 (targetsize << 2);
2249 extrachars += needed;
2250 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2251 Py_DECREF(x);
2252 goto onError;
2254 s = PyString_AS_STRING(v) + oldpos;
2256 memcpy(s, PyString_AS_STRING(x), targetsize);
2257 s += targetsize;
2258 extrachars -= targetsize;
2260 /* 1-0 mapping: skip the character */
2262 else {
2263 /* wrong return value */
2264 PyErr_SetString(PyExc_TypeError,
2265 "character mapping must return integer, None or unicode");
2266 Py_DECREF(x);
2267 goto onError;
2269 Py_DECREF(x);
2271 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2272 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2273 goto onError;
2274 return v;
2276 onError:
2277 Py_DECREF(v);
2278 return NULL;
2281 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2282 PyObject *mapping)
2284 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2285 PyErr_BadArgument();
2286 return NULL;
2288 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2289 PyUnicode_GET_SIZE(unicode),
2290 mapping,
2291 NULL);
2294 static
2295 int translate_error(const Py_UNICODE **source,
2296 Py_UNICODE **dest,
2297 const char *errors,
2298 const char *details)
2300 if ((errors == NULL) ||
2301 (strcmp(errors,"strict") == 0)) {
2302 PyErr_Format(PyExc_UnicodeError,
2303 "translate error: %.400s",
2304 details);
2305 return -1;
2307 else if (strcmp(errors,"ignore") == 0) {
2308 return 0;
2310 else if (strcmp(errors,"replace") == 0) {
2311 **dest = '?';
2312 (*dest)++;
2313 return 0;
2315 else {
2316 PyErr_Format(PyExc_ValueError,
2317 "translate error; "
2318 "unknown error handling code: %.400s",
2319 errors);
2320 return -1;
2324 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2325 int size,
2326 PyObject *mapping,
2327 const char *errors)
2329 PyUnicodeObject *v;
2330 Py_UNICODE *p;
2332 if (mapping == NULL) {
2333 PyErr_BadArgument();
2334 return NULL;
2337 /* Output will never be longer than input */
2338 v = _PyUnicode_New(size);
2339 if (v == NULL)
2340 goto onError;
2341 if (size == 0)
2342 goto done;
2343 p = PyUnicode_AS_UNICODE(v);
2344 while (size-- > 0) {
2345 Py_UNICODE ch = *s++;
2346 PyObject *w, *x;
2348 /* Get mapping */
2349 w = PyInt_FromLong(ch);
2350 if (w == NULL)
2351 goto onError;
2352 x = PyObject_GetItem(mapping, w);
2353 Py_DECREF(w);
2354 if (x == NULL) {
2355 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2356 /* No mapping found: default to 1-1 mapping */
2357 PyErr_Clear();
2358 *p++ = ch;
2359 continue;
2361 goto onError;
2364 /* Apply mapping */
2365 if (PyInt_Check(x))
2366 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2367 else if (x == Py_None) {
2368 /* undefined mapping */
2369 if (translate_error(&s, &p, errors,
2370 "character maps to <undefined>")) {
2371 Py_DECREF(x);
2372 goto onError;
2375 else if (PyUnicode_Check(x)) {
2376 if (PyUnicode_GET_SIZE(x) != 1) {
2377 /* 1-n mapping */
2378 PyErr_SetString(PyExc_NotImplementedError,
2379 "1-n mappings are currently not implemented");
2380 Py_DECREF(x);
2381 goto onError;
2383 *p++ = *PyUnicode_AS_UNICODE(x);
2385 else {
2386 /* wrong return value */
2387 PyErr_SetString(PyExc_TypeError,
2388 "translate mapping must return integer, None or unicode");
2389 Py_DECREF(x);
2390 goto onError;
2392 Py_DECREF(x);
2394 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2395 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2396 goto onError;
2398 done:
2399 return (PyObject *)v;
2401 onError:
2402 Py_XDECREF(v);
2403 return NULL;
2406 PyObject *PyUnicode_Translate(PyObject *str,
2407 PyObject *mapping,
2408 const char *errors)
2410 PyObject *result;
2412 str = PyUnicode_FromObject(str);
2413 if (str == NULL)
2414 goto onError;
2415 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2416 PyUnicode_GET_SIZE(str),
2417 mapping,
2418 errors);
2419 Py_DECREF(str);
2420 return result;
2422 onError:
2423 Py_XDECREF(str);
2424 return NULL;
2427 /* --- Decimal Encoder ---------------------------------------------------- */
2429 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2430 int length,
2431 char *output,
2432 const char *errors)
2434 Py_UNICODE *p, *end;
2436 if (output == NULL) {
2437 PyErr_BadArgument();
2438 return -1;
2441 p = s;
2442 end = s + length;
2443 while (p < end) {
2444 register Py_UNICODE ch = *p++;
2445 int decimal;
2447 if (Py_UNICODE_ISSPACE(ch)) {
2448 *output++ = ' ';
2449 continue;
2451 decimal = Py_UNICODE_TODECIMAL(ch);
2452 if (decimal >= 0) {
2453 *output++ = '0' + decimal;
2454 continue;
2456 if (0 < ch && ch < 256) {
2457 *output++ = (char)ch;
2458 continue;
2460 /* All other characters are considered invalid */
2461 if (errors == NULL || strcmp(errors, "strict") == 0) {
2462 PyErr_SetString(PyExc_ValueError,
2463 "invalid decimal Unicode string");
2464 goto onError;
2466 else if (strcmp(errors, "ignore") == 0)
2467 continue;
2468 else if (strcmp(errors, "replace") == 0) {
2469 *output++ = '?';
2470 continue;
2473 /* 0-terminate the output string */
2474 *output++ = '\0';
2475 return 0;
2477 onError:
2478 return -1;
2481 /* --- Helpers ------------------------------------------------------------ */
2483 static
2484 int count(PyUnicodeObject *self,
2485 int start,
2486 int end,
2487 PyUnicodeObject *substring)
2489 int count = 0;
2491 if (start < 0)
2492 start += self->length;
2493 if (start < 0)
2494 start = 0;
2495 if (end > self->length)
2496 end = self->length;
2497 if (end < 0)
2498 end += self->length;
2499 if (end < 0)
2500 end = 0;
2502 if (substring->length == 0)
2503 return (end - start + 1);
2505 end -= substring->length;
2507 while (start <= end)
2508 if (Py_UNICODE_MATCH(self, start, substring)) {
2509 count++;
2510 start += substring->length;
2511 } else
2512 start++;
2514 return count;
2517 int PyUnicode_Count(PyObject *str,
2518 PyObject *substr,
2519 int start,
2520 int end)
2522 int result;
2524 str = PyUnicode_FromObject(str);
2525 if (str == NULL)
2526 return -1;
2527 substr = PyUnicode_FromObject(substr);
2528 if (substr == NULL) {
2529 Py_DECREF(str);
2530 return -1;
2533 result = count((PyUnicodeObject *)str,
2534 start, end,
2535 (PyUnicodeObject *)substr);
2537 Py_DECREF(str);
2538 Py_DECREF(substr);
2539 return result;
2542 static
2543 int findstring(PyUnicodeObject *self,
2544 PyUnicodeObject *substring,
2545 int start,
2546 int end,
2547 int direction)
2549 if (start < 0)
2550 start += self->length;
2551 if (start < 0)
2552 start = 0;
2554 if (substring->length == 0)
2555 return start;
2557 if (end > self->length)
2558 end = self->length;
2559 if (end < 0)
2560 end += self->length;
2561 if (end < 0)
2562 end = 0;
2564 end -= substring->length;
2566 if (direction < 0) {
2567 for (; end >= start; end--)
2568 if (Py_UNICODE_MATCH(self, end, substring))
2569 return end;
2570 } else {
2571 for (; start <= end; start++)
2572 if (Py_UNICODE_MATCH(self, start, substring))
2573 return start;
2576 return -1;
2579 int PyUnicode_Find(PyObject *str,
2580 PyObject *substr,
2581 int start,
2582 int end,
2583 int direction)
2585 int result;
2587 str = PyUnicode_FromObject(str);
2588 if (str == NULL)
2589 return -1;
2590 substr = PyUnicode_FromObject(substr);
2591 if (substr == NULL) {
2592 Py_DECREF(substr);
2593 return -1;
2596 result = findstring((PyUnicodeObject *)str,
2597 (PyUnicodeObject *)substr,
2598 start, end, direction);
2599 Py_DECREF(str);
2600 Py_DECREF(substr);
2601 return result;
2604 static
2605 int tailmatch(PyUnicodeObject *self,
2606 PyUnicodeObject *substring,
2607 int start,
2608 int end,
2609 int direction)
2611 if (start < 0)
2612 start += self->length;
2613 if (start < 0)
2614 start = 0;
2616 if (substring->length == 0)
2617 return 1;
2619 if (end > self->length)
2620 end = self->length;
2621 if (end < 0)
2622 end += self->length;
2623 if (end < 0)
2624 end = 0;
2626 end -= substring->length;
2627 if (end < start)
2628 return 0;
2630 if (direction > 0) {
2631 if (Py_UNICODE_MATCH(self, end, substring))
2632 return 1;
2633 } else {
2634 if (Py_UNICODE_MATCH(self, start, substring))
2635 return 1;
2638 return 0;
2641 int PyUnicode_Tailmatch(PyObject *str,
2642 PyObject *substr,
2643 int start,
2644 int end,
2645 int direction)
2647 int result;
2649 str = PyUnicode_FromObject(str);
2650 if (str == NULL)
2651 return -1;
2652 substr = PyUnicode_FromObject(substr);
2653 if (substr == NULL) {
2654 Py_DECREF(substr);
2655 return -1;
2658 result = tailmatch((PyUnicodeObject *)str,
2659 (PyUnicodeObject *)substr,
2660 start, end, direction);
2661 Py_DECREF(str);
2662 Py_DECREF(substr);
2663 return result;
2666 static
2667 const Py_UNICODE *findchar(const Py_UNICODE *s,
2668 int size,
2669 Py_UNICODE ch)
2671 /* like wcschr, but doesn't stop at NULL characters */
2673 while (size-- > 0) {
2674 if (*s == ch)
2675 return s;
2676 s++;
2679 return NULL;
2682 /* Apply fixfct filter to the Unicode object self and return a
2683 reference to the modified object */
2685 static
2686 PyObject *fixup(PyUnicodeObject *self,
2687 int (*fixfct)(PyUnicodeObject *s))
2690 PyUnicodeObject *u;
2692 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2693 if (u == NULL)
2694 return NULL;
2696 Py_UNICODE_COPY(u->str, self->str, self->length);
2698 if (!fixfct(u)) {
2699 /* fixfct should return TRUE if it modified the buffer. If
2700 FALSE, return a reference to the original buffer instead
2701 (to save space, not time) */
2702 Py_INCREF(self);
2703 Py_DECREF(u);
2704 return (PyObject*) self;
2706 return (PyObject*) u;
2709 static
2710 int fixupper(PyUnicodeObject *self)
2712 int len = self->length;
2713 Py_UNICODE *s = self->str;
2714 int status = 0;
2716 while (len-- > 0) {
2717 register Py_UNICODE ch;
2719 ch = Py_UNICODE_TOUPPER(*s);
2720 if (ch != *s) {
2721 status = 1;
2722 *s = ch;
2724 s++;
2727 return status;
2730 static
2731 int fixlower(PyUnicodeObject *self)
2733 int len = self->length;
2734 Py_UNICODE *s = self->str;
2735 int status = 0;
2737 while (len-- > 0) {
2738 register Py_UNICODE ch;
2740 ch = Py_UNICODE_TOLOWER(*s);
2741 if (ch != *s) {
2742 status = 1;
2743 *s = ch;
2745 s++;
2748 return status;
2751 static
2752 int fixswapcase(PyUnicodeObject *self)
2754 int len = self->length;
2755 Py_UNICODE *s = self->str;
2756 int status = 0;
2758 while (len-- > 0) {
2759 if (Py_UNICODE_ISUPPER(*s)) {
2760 *s = Py_UNICODE_TOLOWER(*s);
2761 status = 1;
2762 } else if (Py_UNICODE_ISLOWER(*s)) {
2763 *s = Py_UNICODE_TOUPPER(*s);
2764 status = 1;
2766 s++;
2769 return status;
2772 static
2773 int fixcapitalize(PyUnicodeObject *self)
2775 int len = self->length;
2776 Py_UNICODE *s = self->str;
2777 int status = 0;
2779 if (len == 0)
2780 return 0;
2781 if (Py_UNICODE_ISLOWER(*s)) {
2782 *s = Py_UNICODE_TOUPPER(*s);
2783 status = 1;
2785 s++;
2786 while (--len > 0) {
2787 if (Py_UNICODE_ISUPPER(*s)) {
2788 *s = Py_UNICODE_TOLOWER(*s);
2789 status = 1;
2791 s++;
2793 return status;
2796 static
2797 int fixtitle(PyUnicodeObject *self)
2799 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2800 register Py_UNICODE *e;
2801 int previous_is_cased;
2803 /* Shortcut for single character strings */
2804 if (PyUnicode_GET_SIZE(self) == 1) {
2805 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2806 if (*p != ch) {
2807 *p = ch;
2808 return 1;
2810 else
2811 return 0;
2814 e = p + PyUnicode_GET_SIZE(self);
2815 previous_is_cased = 0;
2816 for (; p < e; p++) {
2817 register const Py_UNICODE ch = *p;
2819 if (previous_is_cased)
2820 *p = Py_UNICODE_TOLOWER(ch);
2821 else
2822 *p = Py_UNICODE_TOTITLE(ch);
2824 if (Py_UNICODE_ISLOWER(ch) ||
2825 Py_UNICODE_ISUPPER(ch) ||
2826 Py_UNICODE_ISTITLE(ch))
2827 previous_is_cased = 1;
2828 else
2829 previous_is_cased = 0;
2831 return 1;
2834 PyObject *PyUnicode_Join(PyObject *separator,
2835 PyObject *seq)
2837 Py_UNICODE *sep;
2838 int seplen;
2839 PyUnicodeObject *res = NULL;
2840 int reslen = 0;
2841 Py_UNICODE *p;
2842 int sz = 100;
2843 int i;
2844 PyObject *it;
2846 it = PyObject_GetIter(seq);
2847 if (it == NULL)
2848 return NULL;
2850 if (separator == NULL) {
2851 Py_UNICODE blank = ' ';
2852 sep = &blank;
2853 seplen = 1;
2855 else {
2856 separator = PyUnicode_FromObject(separator);
2857 if (separator == NULL)
2858 goto onError;
2859 sep = PyUnicode_AS_UNICODE(separator);
2860 seplen = PyUnicode_GET_SIZE(separator);
2863 res = _PyUnicode_New(sz);
2864 if (res == NULL)
2865 goto onError;
2866 p = PyUnicode_AS_UNICODE(res);
2867 reslen = 0;
2869 for (i = 0; ; ++i) {
2870 int itemlen;
2871 PyObject *item = PyIter_Next(it);
2872 if (item == NULL) {
2873 if (PyErr_Occurred())
2874 goto onError;
2875 break;
2877 if (!PyUnicode_Check(item)) {
2878 PyObject *v;
2879 v = PyUnicode_FromObject(item);
2880 Py_DECREF(item);
2881 item = v;
2882 if (item == NULL)
2883 goto onError;
2885 itemlen = PyUnicode_GET_SIZE(item);
2886 while (reslen + itemlen + seplen >= sz) {
2887 if (_PyUnicode_Resize(&res, sz*2))
2888 goto onError;
2889 sz *= 2;
2890 p = PyUnicode_AS_UNICODE(res) + reslen;
2892 if (i > 0) {
2893 Py_UNICODE_COPY(p, sep, seplen);
2894 p += seplen;
2895 reslen += seplen;
2897 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
2898 p += itemlen;
2899 reslen += itemlen;
2900 Py_DECREF(item);
2902 if (_PyUnicode_Resize(&res, reslen))
2903 goto onError;
2905 Py_XDECREF(separator);
2906 Py_DECREF(it);
2907 return (PyObject *)res;
2909 onError:
2910 Py_XDECREF(separator);
2911 Py_XDECREF(res);
2912 Py_DECREF(it);
2913 return NULL;
2916 static
2917 PyUnicodeObject *pad(PyUnicodeObject *self,
2918 int left,
2919 int right,
2920 Py_UNICODE fill)
2922 PyUnicodeObject *u;
2924 if (left < 0)
2925 left = 0;
2926 if (right < 0)
2927 right = 0;
2929 if (left == 0 && right == 0) {
2930 Py_INCREF(self);
2931 return self;
2934 u = _PyUnicode_New(left + self->length + right);
2935 if (u) {
2936 if (left)
2937 Py_UNICODE_FILL(u->str, fill, left);
2938 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2939 if (right)
2940 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2943 return u;
/* Append the slice data[left:right] to the list being built.

   Relies on names from the expanding function: a PyObject *str scratch
   variable, the PyObject *list result list and an onError label, which
   is jumped to when the slice cannot be created or appended.

   Wrapped in do { } while (0) so that an invocation followed by ';'
   behaves as exactly one statement, and with parenthesized arguments
   so that expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2957 static
2958 PyObject *split_whitespace(PyUnicodeObject *self,
2959 PyObject *list,
2960 int maxcount)
2962 register int i;
2963 register int j;
2964 int len = self->length;
2965 PyObject *str;
2967 for (i = j = 0; i < len; ) {
2968 /* find a token */
2969 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2970 i++;
2971 j = i;
2972 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2973 i++;
2974 if (j < i) {
2975 if (maxcount-- <= 0)
2976 break;
2977 SPLIT_APPEND(self->str, j, i);
2978 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2979 i++;
2980 j = i;
2983 if (j < len) {
2984 SPLIT_APPEND(self->str, j, len);
2986 return list;
2988 onError:
2989 Py_DECREF(list);
2990 return NULL;
2993 PyObject *PyUnicode_Splitlines(PyObject *string,
2994 int keepends)
2996 register int i;
2997 register int j;
2998 int len;
2999 PyObject *list;
3000 PyObject *str;
3001 Py_UNICODE *data;
3003 string = PyUnicode_FromObject(string);
3004 if (string == NULL)
3005 return NULL;
3006 data = PyUnicode_AS_UNICODE(string);
3007 len = PyUnicode_GET_SIZE(string);
3009 list = PyList_New(0);
3010 if (!list)
3011 goto onError;
3013 for (i = j = 0; i < len; ) {
3014 int eol;
3016 /* Find a line and append it */
3017 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3018 i++;
3020 /* Skip the line break reading CRLF as one line break */
3021 eol = i;
3022 if (i < len) {
3023 if (data[i] == '\r' && i + 1 < len &&
3024 data[i+1] == '\n')
3025 i += 2;
3026 else
3027 i++;
3028 if (keepends)
3029 eol = i;
3031 SPLIT_APPEND(data, j, eol);
3032 j = i;
3034 if (j < len) {
3035 SPLIT_APPEND(data, j, len);
3038 Py_DECREF(string);
3039 return list;
3041 onError:
3042 Py_DECREF(list);
3043 Py_DECREF(string);
3044 return NULL;
3047 static
3048 PyObject *split_char(PyUnicodeObject *self,
3049 PyObject *list,
3050 Py_UNICODE ch,
3051 int maxcount)
3053 register int i;
3054 register int j;
3055 int len = self->length;
3056 PyObject *str;
3058 for (i = j = 0; i < len; ) {
3059 if (self->str[i] == ch) {
3060 if (maxcount-- <= 0)
3061 break;
3062 SPLIT_APPEND(self->str, j, i);
3063 i = j = i + 1;
3064 } else
3065 i++;
3067 if (j <= len) {
3068 SPLIT_APPEND(self->str, j, len);
3070 return list;
3072 onError:
3073 Py_DECREF(list);
3074 return NULL;
3077 static
3078 PyObject *split_substring(PyUnicodeObject *self,
3079 PyObject *list,
3080 PyUnicodeObject *substring,
3081 int maxcount)
3083 register int i;
3084 register int j;
3085 int len = self->length;
3086 int sublen = substring->length;
3087 PyObject *str;
3089 for (i = j = 0; i <= len - sublen; ) {
3090 if (Py_UNICODE_MATCH(self, i, substring)) {
3091 if (maxcount-- <= 0)
3092 break;
3093 SPLIT_APPEND(self->str, j, i);
3094 i = j = i + sublen;
3095 } else
3096 i++;
3098 if (j <= len) {
3099 SPLIT_APPEND(self->str, j, len);
3101 return list;
3103 onError:
3104 Py_DECREF(list);
3105 return NULL;
3108 #undef SPLIT_APPEND
3110 static
3111 PyObject *split(PyUnicodeObject *self,
3112 PyUnicodeObject *substring,
3113 int maxcount)
3115 PyObject *list;
3117 if (maxcount < 0)
3118 maxcount = INT_MAX;
3120 list = PyList_New(0);
3121 if (!list)
3122 return NULL;
3124 if (substring == NULL)
3125 return split_whitespace(self,list,maxcount);
3127 else if (substring->length == 1)
3128 return split_char(self,list,substring->str[0],maxcount);
3130 else if (substring->length == 0) {
3131 Py_DECREF(list);
3132 PyErr_SetString(PyExc_ValueError, "empty separator");
3133 return NULL;
3135 else
3136 return split_substring(self,list,substring,maxcount);
3139 static
3140 PyObject *strip(PyUnicodeObject *self,
3141 int left,
3142 int right)
3144 Py_UNICODE *p = self->str;
3145 int start = 0;
3146 int end = self->length;
3148 if (left)
3149 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3150 start++;
3152 if (right)
3153 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3154 end--;
3156 if (start == 0 && end == self->length) {
3157 /* couldn't strip anything off, return original string */
3158 Py_INCREF(self);
3159 return (PyObject*) self;
3162 return (PyObject*) PyUnicode_FromUnicode(
3163 self->str + start,
3164 end - start
3168 static
3169 PyObject *replace(PyUnicodeObject *self,
3170 PyUnicodeObject *str1,
3171 PyUnicodeObject *str2,
3172 int maxcount)
3174 PyUnicodeObject *u;
3176 if (maxcount < 0)
3177 maxcount = INT_MAX;
3179 if (str1->length == 1 && str2->length == 1) {
3180 int i;
3182 /* replace characters */
3183 if (!findchar(self->str, self->length, str1->str[0])) {
3184 /* nothing to replace, return original string */
3185 Py_INCREF(self);
3186 u = self;
3187 } else {
3188 Py_UNICODE u1 = str1->str[0];
3189 Py_UNICODE u2 = str2->str[0];
3191 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3192 NULL,
3193 self->length
3195 if (u != NULL) {
3196 Py_UNICODE_COPY(u->str, self->str,
3197 self->length);
3198 for (i = 0; i < u->length; i++)
3199 if (u->str[i] == u1) {
3200 if (--maxcount < 0)
3201 break;
3202 u->str[i] = u2;
3207 } else {
3208 int n, i;
3209 Py_UNICODE *p;
3211 /* replace strings */
3212 n = count(self, 0, self->length, str1);
3213 if (n > maxcount)
3214 n = maxcount;
3215 if (n == 0) {
3216 /* nothing to replace, return original string */
3217 Py_INCREF(self);
3218 u = self;
3219 } else {
3220 u = _PyUnicode_New(
3221 self->length + n * (str2->length - str1->length));
3222 if (u) {
3223 i = 0;
3224 p = u->str;
3225 while (i <= self->length - str1->length)
3226 if (Py_UNICODE_MATCH(self, i, str1)) {
3227 /* replace string segment */
3228 Py_UNICODE_COPY(p, str2->str, str2->length);
3229 p += str2->length;
3230 i += str1->length;
3231 if (--n <= 0) {
3232 /* copy remaining part */
3233 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3234 break;
3236 } else
3237 *p++ = self->str[i++];
3242 return (PyObject *) u;
3245 /* --- Unicode Object Methods --------------------------------------------- */
3247 static char title__doc__[] =
3248 "S.title() -> unicode\n\
3250 Return a titlecased version of S, i.e. words start with title case\n\
3251 characters, all remaining cased characters have lower case.";
3253 static PyObject*
3254 unicode_title(PyUnicodeObject *self, PyObject *args)
3256 if (!PyArg_NoArgs(args))
3257 return NULL;
3258 return fixup(self, fixtitle);
3261 static char capitalize__doc__[] =
3262 "S.capitalize() -> unicode\n\
3264 Return a capitalized version of S, i.e. make the first character\n\
3265 have upper case.";
3267 static PyObject*
3268 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3270 if (!PyArg_NoArgs(args))
3271 return NULL;
3272 return fixup(self, fixcapitalize);
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled: S.capwords().  Splits on whitespace, capitalizes each
   word and joins the words again with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *item;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

 onError:
    /* item is NULL here when a word could not be capitalized */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3316 static char center__doc__[] =
3317 "S.center(width) -> unicode\n\
3319 Return S centered in a Unicode string of length width. Padding is done\n\
3320 using spaces.";
3322 static PyObject *
3323 unicode_center(PyUnicodeObject *self, PyObject *args)
3325 int marg, left;
3326 int width;
3328 if (!PyArg_ParseTuple(args, "i:center", &width))
3329 return NULL;
3331 if (self->length >= width) {
3332 Py_INCREF(self);
3333 return (PyObject*) self;
3336 marg = width - self->length;
3337 left = marg / 2 + (marg & width & 1);
3339 return (PyObject*) pad(self, left, marg - left, ' ');
3342 #if 0
3344 /* This code should go into some future Unicode collation support
3345 module. The basic comparison should compare ordinals on a naive
3346 basis (this is what Java does and thus JPython too). */
3348 /* speedy UTF-16 code point order comparison */
3349 /* gleaned from: */
3350 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3352 static short utf16Fixup[32] =
3354 0, 0, 0, 0, 0, 0, 0, 0,
3355 0, 0, 0, 0, 0, 0, 0, 0,
3356 0, 0, 0, 0, 0, 0, 0, 0,
3357 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3360 static int
3361 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3363 int len1, len2;
3365 Py_UNICODE *s1 = str1->str;
3366 Py_UNICODE *s2 = str2->str;
3368 len1 = str1->length;
3369 len2 = str2->length;
3371 while (len1 > 0 && len2 > 0) {
3372 Py_UNICODE c1, c2;
3374 c1 = *s1++;
3375 c2 = *s2++;
3377 if (c1 > (1<<11) * 26)
3378 c1 += utf16Fixup[c1>>11];
3379 if (c2 > (1<<11) * 26)
3380 c2 += utf16Fixup[c2>>11];
3381 /* now c1 and c2 are in UTF-32-compatible order */
3383 if (c1 != c2)
3384 return (c1 < c2) ? -1 : 1;
3386 len1--; len2--;
3389 return (len1 < len2) ? -1 : (len1 != len2);
3392 #else
3394 static int
3395 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3397 register int len1, len2;
3399 Py_UNICODE *s1 = str1->str;
3400 Py_UNICODE *s2 = str2->str;
3402 len1 = str1->length;
3403 len2 = str2->length;
3405 while (len1 > 0 && len2 > 0) {
3406 Py_UNICODE c1, c2;
3408 c1 = *s1++;
3409 c2 = *s2++;
3411 if (c1 != c2)
3412 return (c1 < c2) ? -1 : 1;
3414 len1--; len2--;
3417 return (len1 < len2) ? -1 : (len1 != len2);
3420 #endif
3422 int PyUnicode_Compare(PyObject *left,
3423 PyObject *right)
3425 PyUnicodeObject *u = NULL, *v = NULL;
3426 int result;
3428 /* Coerce the two arguments */
3429 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3430 if (u == NULL)
3431 goto onError;
3432 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3433 if (v == NULL)
3434 goto onError;
3436 /* Shortcut for empty or interned objects */
3437 if (v == u) {
3438 Py_DECREF(u);
3439 Py_DECREF(v);
3440 return 0;
3443 result = unicode_compare(u, v);
3445 Py_DECREF(u);
3446 Py_DECREF(v);
3447 return result;
3449 onError:
3450 Py_XDECREF(u);
3451 Py_XDECREF(v);
3452 return -1;
3455 int PyUnicode_Contains(PyObject *container,
3456 PyObject *element)
3458 PyUnicodeObject *u = NULL, *v = NULL;
3459 int result;
3460 register const Py_UNICODE *p, *e;
3461 register Py_UNICODE ch;
3463 /* Coerce the two arguments */
3464 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3465 if (v == NULL) {
3466 PyErr_SetString(PyExc_TypeError,
3467 "'in <string>' requires character as left operand");
3468 goto onError;
3470 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3471 if (u == NULL) {
3472 Py_DECREF(v);
3473 goto onError;
3476 /* Check v in u */
3477 if (PyUnicode_GET_SIZE(v) != 1) {
3478 PyErr_SetString(PyExc_TypeError,
3479 "'in <string>' requires character as left operand");
3480 goto onError;
3482 ch = *PyUnicode_AS_UNICODE(v);
3483 p = PyUnicode_AS_UNICODE(u);
3484 e = p + PyUnicode_GET_SIZE(u);
3485 result = 0;
3486 while (p < e) {
3487 if (*p++ == ch) {
3488 result = 1;
3489 break;
3493 Py_DECREF(u);
3494 Py_DECREF(v);
3495 return result;
3497 onError:
3498 Py_XDECREF(u);
3499 Py_XDECREF(v);
3500 return -1;
3503 /* Concat to string or Unicode object giving a new Unicode object. */
3505 PyObject *PyUnicode_Concat(PyObject *left,
3506 PyObject *right)
3508 PyUnicodeObject *u = NULL, *v = NULL, *w;
3510 /* Coerce the two arguments */
3511 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3512 if (u == NULL)
3513 goto onError;
3514 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3515 if (v == NULL)
3516 goto onError;
3518 /* Shortcuts */
3519 if (v == unicode_empty) {
3520 Py_DECREF(v);
3521 return (PyObject *)u;
3523 if (u == unicode_empty) {
3524 Py_DECREF(u);
3525 return (PyObject *)v;
3528 /* Concat the two Unicode strings */
3529 w = _PyUnicode_New(u->length + v->length);
3530 if (w == NULL)
3531 goto onError;
3532 Py_UNICODE_COPY(w->str, u->str, u->length);
3533 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3535 Py_DECREF(u);
3536 Py_DECREF(v);
3537 return (PyObject *)w;
3539 onError:
3540 Py_XDECREF(u);
3541 Py_XDECREF(v);
3542 return NULL;
3545 static char count__doc__[] =
3546 "S.count(sub[, start[, end]]) -> int\n\
3548 Return the number of occurrences of substring sub in Unicode string\n\
3549 S[start:end]. Optional arguments start and end are\n\
3550 interpreted as in slice notation.";
3552 static PyObject *
3553 unicode_count(PyUnicodeObject *self, PyObject *args)
3555 PyUnicodeObject *substring;
3556 int start = 0;
3557 int end = INT_MAX;
3558 PyObject *result;
3560 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3561 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3562 return NULL;
3564 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3565 (PyObject *)substring);
3566 if (substring == NULL)
3567 return NULL;
3569 if (start < 0)
3570 start += self->length;
3571 if (start < 0)
3572 start = 0;
3573 if (end > self->length)
3574 end = self->length;
3575 if (end < 0)
3576 end += self->length;
3577 if (end < 0)
3578 end = 0;
3580 result = PyInt_FromLong((long) count(self, start, end, substring));
3582 Py_DECREF(substring);
3583 return result;
3586 static char encode__doc__[] =
3587 "S.encode([encoding[,errors]]) -> string\n\
3589 Return an encoded string version of S. Default encoding is the current\n\
3590 default string encoding. errors may be given to set a different error\n\
3591 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3592 a ValueError. Other possible values are 'ignore' and 'replace'.";
3594 static PyObject *
3595 unicode_encode(PyUnicodeObject *self, PyObject *args)
3597 char *encoding = NULL;
3598 char *errors = NULL;
3599 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3600 return NULL;
3601 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3604 static char expandtabs__doc__[] =
3605 "S.expandtabs([tabsize]) -> unicode\n\
3607 Return a copy of S where all tab characters are expanded using spaces.\n\
3608 If tabsize is not given, a tab size of 8 characters is assumed.";
3610 static PyObject*
3611 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3613 Py_UNICODE *e;
3614 Py_UNICODE *p;
3615 Py_UNICODE *q;
3616 int i, j;
3617 PyUnicodeObject *u;
3618 int tabsize = 8;
3620 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3621 return NULL;
3623 /* First pass: determine size of output string */
3624 i = j = 0;
3625 e = self->str + self->length;
3626 for (p = self->str; p < e; p++)
3627 if (*p == '\t') {
3628 if (tabsize > 0)
3629 j += tabsize - (j % tabsize);
3631 else {
3632 j++;
3633 if (*p == '\n' || *p == '\r') {
3634 i += j;
3635 j = 0;
3639 /* Second pass: create output string and fill it */
3640 u = _PyUnicode_New(i + j);
3641 if (!u)
3642 return NULL;
3644 j = 0;
3645 q = u->str;
3647 for (p = self->str; p < e; p++)
3648 if (*p == '\t') {
3649 if (tabsize > 0) {
3650 i = tabsize - (j % tabsize);
3651 j += i;
3652 while (i--)
3653 *q++ = ' ';
3656 else {
3657 j++;
3658 *q++ = *p;
3659 if (*p == '\n' || *p == '\r')
3660 j = 0;
3663 return (PyObject*) u;
3666 static char find__doc__[] =
3667 "S.find(sub [,start [,end]]) -> int\n\
3669 Return the lowest index in S where substring sub is found,\n\
3670 such that sub is contained within s[start,end]. Optional\n\
3671 arguments start and end are interpreted as in slice notation.\n\
3673 Return -1 on failure.";
3675 static PyObject *
3676 unicode_find(PyUnicodeObject *self, PyObject *args)
3678 PyUnicodeObject *substring;
3679 int start = 0;
3680 int end = INT_MAX;
3681 PyObject *result;
3683 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3684 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3685 return NULL;
3686 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3687 (PyObject *)substring);
3688 if (substring == NULL)
3689 return NULL;
3691 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3693 Py_DECREF(substring);
3694 return result;
3697 static PyObject *
3698 unicode_getitem(PyUnicodeObject *self, int index)
3700 if (index < 0 || index >= self->length) {
3701 PyErr_SetString(PyExc_IndexError, "string index out of range");
3702 return NULL;
3705 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3708 static long
3709 unicode_hash(PyUnicodeObject *self)
3711 /* Since Unicode objects compare equal to their ASCII string
3712 counterparts, they should use the individual character values
3713 as basis for their hash value. This is needed to assure that
3714 strings and Unicode objects behave in the same way as
3715 dictionary keys. */
3717 register int len;
3718 register Py_UNICODE *p;
3719 register long x;
3721 if (self->hash != -1)
3722 return self->hash;
3723 len = PyUnicode_GET_SIZE(self);
3724 p = PyUnicode_AS_UNICODE(self);
3725 x = *p << 7;
3726 while (--len >= 0)
3727 x = (1000003*x) ^ *p++;
3728 x ^= PyUnicode_GET_SIZE(self);
3729 if (x == -1)
3730 x = -2;
3731 self->hash = x;
3732 return x;
3735 static char index__doc__[] =
3736 "S.index(sub [,start [,end]]) -> int\n\
3738 Like S.find() but raise ValueError when the substring is not found.";
3740 static PyObject *
3741 unicode_index(PyUnicodeObject *self, PyObject *args)
3743 int result;
3744 PyUnicodeObject *substring;
3745 int start = 0;
3746 int end = INT_MAX;
3748 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3749 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3750 return NULL;
3752 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3753 (PyObject *)substring);
3754 if (substring == NULL)
3755 return NULL;
3757 result = findstring(self, substring, start, end, 1);
3759 Py_DECREF(substring);
3760 if (result < 0) {
3761 PyErr_SetString(PyExc_ValueError, "substring not found");
3762 return NULL;
3764 return PyInt_FromLong(result);
3767 static char islower__doc__[] =
3768 "S.islower() -> int\n\
3770 Return 1 if all cased characters in S are lowercase and there is\n\
3771 at least one cased character in S, 0 otherwise.";
3773 static PyObject*
3774 unicode_islower(PyUnicodeObject *self, PyObject *args)
3776 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3777 register const Py_UNICODE *e;
3778 int cased;
3780 if (!PyArg_NoArgs(args))
3781 return NULL;
3783 /* Shortcut for single character strings */
3784 if (PyUnicode_GET_SIZE(self) == 1)
3785 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3787 /* Special case for empty strings */
3788 if (PyString_GET_SIZE(self) == 0)
3789 return PyInt_FromLong(0);
3791 e = p + PyUnicode_GET_SIZE(self);
3792 cased = 0;
3793 for (; p < e; p++) {
3794 register const Py_UNICODE ch = *p;
3796 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3797 return PyInt_FromLong(0);
3798 else if (!cased && Py_UNICODE_ISLOWER(ch))
3799 cased = 1;
3801 return PyInt_FromLong(cased);
3804 static char isupper__doc__[] =
3805 "S.isupper() -> int\n\
3807 Return 1 if all cased characters in S are uppercase and there is\n\
3808 at least one cased character in S, 0 otherwise.";
3810 static PyObject*
3811 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3813 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3814 register const Py_UNICODE *e;
3815 int cased;
3817 if (!PyArg_NoArgs(args))
3818 return NULL;
3820 /* Shortcut for single character strings */
3821 if (PyUnicode_GET_SIZE(self) == 1)
3822 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3824 /* Special case for empty strings */
3825 if (PyString_GET_SIZE(self) == 0)
3826 return PyInt_FromLong(0);
3828 e = p + PyUnicode_GET_SIZE(self);
3829 cased = 0;
3830 for (; p < e; p++) {
3831 register const Py_UNICODE ch = *p;
3833 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3834 return PyInt_FromLong(0);
3835 else if (!cased && Py_UNICODE_ISUPPER(ch))
3836 cased = 1;
3838 return PyInt_FromLong(cased);
3841 static char istitle__doc__[] =
3842 "S.istitle() -> int\n\
3844 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3845 may only follow uncased characters and lowercase characters only cased\n\
3846 ones. Return 0 otherwise.";
3848 static PyObject*
3849 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3851 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3852 register const Py_UNICODE *e;
3853 int cased, previous_is_cased;
3855 if (!PyArg_NoArgs(args))
3856 return NULL;
3858 /* Shortcut for single character strings */
3859 if (PyUnicode_GET_SIZE(self) == 1)
3860 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3861 (Py_UNICODE_ISUPPER(*p) != 0));
3863 /* Special case for empty strings */
3864 if (PyString_GET_SIZE(self) == 0)
3865 return PyInt_FromLong(0);
3867 e = p + PyUnicode_GET_SIZE(self);
3868 cased = 0;
3869 previous_is_cased = 0;
3870 for (; p < e; p++) {
3871 register const Py_UNICODE ch = *p;
3873 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3874 if (previous_is_cased)
3875 return PyInt_FromLong(0);
3876 previous_is_cased = 1;
3877 cased = 1;
3879 else if (Py_UNICODE_ISLOWER(ch)) {
3880 if (!previous_is_cased)
3881 return PyInt_FromLong(0);
3882 previous_is_cased = 1;
3883 cased = 1;
3885 else
3886 previous_is_cased = 0;
3888 return PyInt_FromLong(cased);
3891 static char isspace__doc__[] =
3892 "S.isspace() -> int\n\
3894 Return 1 if there are only whitespace characters in S,\n\
3895 0 otherwise.";
3897 static PyObject*
3898 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3900 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3901 register const Py_UNICODE *e;
3903 if (!PyArg_NoArgs(args))
3904 return NULL;
3906 /* Shortcut for single character strings */
3907 if (PyUnicode_GET_SIZE(self) == 1 &&
3908 Py_UNICODE_ISSPACE(*p))
3909 return PyInt_FromLong(1);
3911 /* Special case for empty strings */
3912 if (PyString_GET_SIZE(self) == 0)
3913 return PyInt_FromLong(0);
3915 e = p + PyUnicode_GET_SIZE(self);
3916 for (; p < e; p++) {
3917 if (!Py_UNICODE_ISSPACE(*p))
3918 return PyInt_FromLong(0);
3920 return PyInt_FromLong(1);
3923 static char isalpha__doc__[] =
3924 "S.isalpha() -> int\n\
3926 Return 1 if all characters in S are alphabetic\n\
3927 and there is at least one character in S, 0 otherwise.";
3929 static PyObject*
3930 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3932 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3933 register const Py_UNICODE *e;
3935 if (!PyArg_NoArgs(args))
3936 return NULL;
3938 /* Shortcut for single character strings */
3939 if (PyUnicode_GET_SIZE(self) == 1 &&
3940 Py_UNICODE_ISALPHA(*p))
3941 return PyInt_FromLong(1);
3943 /* Special case for empty strings */
3944 if (PyString_GET_SIZE(self) == 0)
3945 return PyInt_FromLong(0);
3947 e = p + PyUnicode_GET_SIZE(self);
3948 for (; p < e; p++) {
3949 if (!Py_UNICODE_ISALPHA(*p))
3950 return PyInt_FromLong(0);
3952 return PyInt_FromLong(1);
3955 static char isalnum__doc__[] =
3956 "S.isalnum() -> int\n\
3958 Return 1 if all characters in S are alphanumeric\n\
3959 and there is at least one character in S, 0 otherwise.";
3961 static PyObject*
3962 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3964 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3965 register const Py_UNICODE *e;
3967 if (!PyArg_NoArgs(args))
3968 return NULL;
3970 /* Shortcut for single character strings */
3971 if (PyUnicode_GET_SIZE(self) == 1 &&
3972 Py_UNICODE_ISALNUM(*p))
3973 return PyInt_FromLong(1);
3975 /* Special case for empty strings */
3976 if (PyString_GET_SIZE(self) == 0)
3977 return PyInt_FromLong(0);
3979 e = p + PyUnicode_GET_SIZE(self);
3980 for (; p < e; p++) {
3981 if (!Py_UNICODE_ISALNUM(*p))
3982 return PyInt_FromLong(0);
3984 return PyInt_FromLong(1);
3987 static char isdecimal__doc__[] =
3988 "S.isdecimal() -> int\n\
3990 Return 1 if there are only decimal characters in S,\n\
3991 0 otherwise.";
3993 static PyObject*
3994 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3996 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3997 register const Py_UNICODE *e;
3999 if (!PyArg_NoArgs(args))
4000 return NULL;
4002 /* Shortcut for single character strings */
4003 if (PyUnicode_GET_SIZE(self) == 1 &&
4004 Py_UNICODE_ISDECIMAL(*p))
4005 return PyInt_FromLong(1);
4007 /* Special case for empty strings */
4008 if (PyString_GET_SIZE(self) == 0)
4009 return PyInt_FromLong(0);
4011 e = p + PyUnicode_GET_SIZE(self);
4012 for (; p < e; p++) {
4013 if (!Py_UNICODE_ISDECIMAL(*p))
4014 return PyInt_FromLong(0);
4016 return PyInt_FromLong(1);
4019 static char isdigit__doc__[] =
4020 "S.isdigit() -> int\n\
4022 Return 1 if there are only digit characters in S,\n\
4023 0 otherwise.";
4025 static PyObject*
4026 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
4028 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4029 register const Py_UNICODE *e;
4031 if (!PyArg_NoArgs(args))
4032 return NULL;
4034 /* Shortcut for single character strings */
4035 if (PyUnicode_GET_SIZE(self) == 1 &&
4036 Py_UNICODE_ISDIGIT(*p))
4037 return PyInt_FromLong(1);
4039 /* Special case for empty strings */
4040 if (PyString_GET_SIZE(self) == 0)
4041 return PyInt_FromLong(0);
4043 e = p + PyUnicode_GET_SIZE(self);
4044 for (; p < e; p++) {
4045 if (!Py_UNICODE_ISDIGIT(*p))
4046 return PyInt_FromLong(0);
4048 return PyInt_FromLong(1);
4051 static char isnumeric__doc__[] =
4052 "S.isnumeric() -> int\n\
4054 Return 1 if there are only numeric characters in S,\n\
4055 0 otherwise.";
4057 static PyObject*
4058 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4060 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4061 register const Py_UNICODE *e;
4063 if (!PyArg_NoArgs(args))
4064 return NULL;
4066 /* Shortcut for single character strings */
4067 if (PyUnicode_GET_SIZE(self) == 1 &&
4068 Py_UNICODE_ISNUMERIC(*p))
4069 return PyInt_FromLong(1);
4071 /* Special case for empty strings */
4072 if (PyString_GET_SIZE(self) == 0)
4073 return PyInt_FromLong(0);
4075 e = p + PyUnicode_GET_SIZE(self);
4076 for (; p < e; p++) {
4077 if (!Py_UNICODE_ISNUMERIC(*p))
4078 return PyInt_FromLong(0);
4080 return PyInt_FromLong(1);
4083 static char join__doc__[] =
4084 "S.join(sequence) -> unicode\n\
4086 Return a string which is the concatenation of the strings in the\n\
4087 sequence. The separator between elements is S.";
4089 static PyObject*
4090 unicode_join(PyUnicodeObject *self, PyObject *args)
4092 PyObject *data;
4093 if (!PyArg_ParseTuple(args, "O:join", &data))
4094 return NULL;
4096 return PyUnicode_Join((PyObject *)self, data);
4099 static int
4100 unicode_length(PyUnicodeObject *self)
4102 return self->length;
4105 static char ljust__doc__[] =
4106 "S.ljust(width) -> unicode\n\
4108 Return S left justified in a Unicode string of length width. Padding is\n\
4109 done using spaces.";
4111 static PyObject *
4112 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4114 int width;
4115 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4116 return NULL;
4118 if (self->length >= width) {
4119 Py_INCREF(self);
4120 return (PyObject*) self;
4123 return (PyObject*) pad(self, 0, width - self->length, ' ');
4126 static char lower__doc__[] =
4127 "S.lower() -> unicode\n\
4129 Return a copy of the string S converted to lowercase.";
4131 static PyObject*
4132 unicode_lower(PyUnicodeObject *self, PyObject *args)
4134 if (!PyArg_NoArgs(args))
4135 return NULL;
4136 return fixup(self, fixlower);
4139 static char lstrip__doc__[] =
4140 "S.lstrip() -> unicode\n\
4142 Return a copy of the string S with leading whitespace removed.";
4144 static PyObject *
4145 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4147 if (!PyArg_NoArgs(args))
4148 return NULL;
4149 return strip(self, 1, 0);
4152 static PyObject*
4153 unicode_repeat(PyUnicodeObject *str, int len)
4155 PyUnicodeObject *u;
4156 Py_UNICODE *p;
4157 int nchars;
4158 size_t nbytes;
4160 if (len < 0)
4161 len = 0;
4163 if (len == 1) {
4164 /* no repeat, return original string */
4165 Py_INCREF(str);
4166 return (PyObject*) str;
4169 /* ensure # of chars needed doesn't overflow int and # of bytes
4170 * needed doesn't overflow size_t
4172 nchars = len * str->length;
4173 if (len && nchars / len != str->length) {
4174 PyErr_SetString(PyExc_OverflowError,
4175 "repeated string is too long");
4176 return NULL;
4178 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4179 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4180 PyErr_SetString(PyExc_OverflowError,
4181 "repeated string is too long");
4182 return NULL;
4184 u = _PyUnicode_New(nchars);
4185 if (!u)
4186 return NULL;
4188 p = u->str;
4190 while (len-- > 0) {
4191 Py_UNICODE_COPY(p, str->str, str->length);
4192 p += str->length;
4195 return (PyObject*) u;
4198 PyObject *PyUnicode_Replace(PyObject *obj,
4199 PyObject *subobj,
4200 PyObject *replobj,
4201 int maxcount)
4203 PyObject *self;
4204 PyObject *str1;
4205 PyObject *str2;
4206 PyObject *result;
4208 self = PyUnicode_FromObject(obj);
4209 if (self == NULL)
4210 return NULL;
4211 str1 = PyUnicode_FromObject(subobj);
4212 if (str1 == NULL) {
4213 Py_DECREF(self);
4214 return NULL;
4216 str2 = PyUnicode_FromObject(replobj);
4217 if (str2 == NULL) {
4218 Py_DECREF(self);
4219 Py_DECREF(str1);
4220 return NULL;
4222 result = replace((PyUnicodeObject *)self,
4223 (PyUnicodeObject *)str1,
4224 (PyUnicodeObject *)str2,
4225 maxcount);
4226 Py_DECREF(self);
4227 Py_DECREF(str1);
4228 Py_DECREF(str2);
4229 return result;
4232 static char replace__doc__[] =
4233 "S.replace (old, new[, maxsplit]) -> unicode\n\
4235 Return a copy of S with all occurrences of substring\n\
4236 old replaced by new. If the optional argument maxsplit is\n\
4237 given, only the first maxsplit occurrences are replaced.";
4239 static PyObject*
4240 unicode_replace(PyUnicodeObject *self, PyObject *args)
4242 PyUnicodeObject *str1;
4243 PyUnicodeObject *str2;
4244 int maxcount = -1;
4245 PyObject *result;
4247 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4248 return NULL;
4249 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4250 if (str1 == NULL)
4251 return NULL;
4252 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4253 if (str2 == NULL)
4254 return NULL;
4256 result = replace(self, str1, str2, maxcount);
4258 Py_DECREF(str1);
4259 Py_DECREF(str2);
4260 return result;
4263 static
4264 PyObject *unicode_repr(PyObject *unicode)
4266 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4267 PyUnicode_GET_SIZE(unicode),
4271 static char rfind__doc__[] =
4272 "S.rfind(sub [,start [,end]]) -> int\n\
4274 Return the highest index in S where substring sub is found,\n\
4275 such that sub is contained within s[start,end]. Optional\n\
4276 arguments start and end are interpreted as in slice notation.\n\
4278 Return -1 on failure.";
4280 static PyObject *
4281 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4283 PyUnicodeObject *substring;
4284 int start = 0;
4285 int end = INT_MAX;
4286 PyObject *result;
4288 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4289 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4290 return NULL;
4291 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4292 (PyObject *)substring);
4293 if (substring == NULL)
4294 return NULL;
4296 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4298 Py_DECREF(substring);
4299 return result;
4302 static char rindex__doc__[] =
4303 "S.rindex(sub [,start [,end]]) -> int\n\
4305 Like S.rfind() but raise ValueError when the substring is not found.";
4307 static PyObject *
4308 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4310 int result;
4311 PyUnicodeObject *substring;
4312 int start = 0;
4313 int end = INT_MAX;
4315 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4316 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4317 return NULL;
4318 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4319 (PyObject *)substring);
4320 if (substring == NULL)
4321 return NULL;
4323 result = findstring(self, substring, start, end, -1);
4325 Py_DECREF(substring);
4326 if (result < 0) {
4327 PyErr_SetString(PyExc_ValueError, "substring not found");
4328 return NULL;
4330 return PyInt_FromLong(result);
4333 static char rjust__doc__[] =
4334 "S.rjust(width) -> unicode\n\
4336 Return S right justified in a Unicode string of length width. Padding is\n\
4337 done using spaces.";
4339 static PyObject *
4340 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4342 int width;
4343 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4344 return NULL;
4346 if (self->length >= width) {
4347 Py_INCREF(self);
4348 return (PyObject*) self;
4351 return (PyObject*) pad(self, width - self->length, 0, ' ');
4354 static char rstrip__doc__[] =
4355 "S.rstrip() -> unicode\n\
4357 Return a copy of the string S with trailing whitespace removed.";
4359 static PyObject *
4360 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4362 if (!PyArg_NoArgs(args))
4363 return NULL;
4364 return strip(self, 0, 1);
4367 static PyObject*
4368 unicode_slice(PyUnicodeObject *self, int start, int end)
4370 /* standard clamping */
4371 if (start < 0)
4372 start = 0;
4373 if (end < 0)
4374 end = 0;
4375 if (end > self->length)
4376 end = self->length;
4377 if (start == 0 && end == self->length) {
4378 /* full slice, return original string */
4379 Py_INCREF(self);
4380 return (PyObject*) self;
4382 if (start > end)
4383 start = end;
4384 /* copy slice */
4385 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4386 end - start);
4389 PyObject *PyUnicode_Split(PyObject *s,
4390 PyObject *sep,
4391 int maxsplit)
4393 PyObject *result;
4395 s = PyUnicode_FromObject(s);
4396 if (s == NULL)
4397 return NULL;
4398 if (sep != NULL) {
4399 sep = PyUnicode_FromObject(sep);
4400 if (sep == NULL) {
4401 Py_DECREF(s);
4402 return NULL;
4406 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4408 Py_DECREF(s);
4409 Py_XDECREF(sep);
4410 return result;
4413 static char split__doc__[] =
4414 "S.split([sep [,maxsplit]]) -> list of strings\n\
4416 Return a list of the words in S, using sep as the\n\
4417 delimiter string. If maxsplit is given, at most maxsplit\n\
4418 splits are done. If sep is not specified, any whitespace string\n\
4419 is a separator.";
4421 static PyObject*
4422 unicode_split(PyUnicodeObject *self, PyObject *args)
4424 PyObject *substring = Py_None;
4425 int maxcount = -1;
4427 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4428 return NULL;
4430 if (substring == Py_None)
4431 return split(self, NULL, maxcount);
4432 else if (PyUnicode_Check(substring))
4433 return split(self, (PyUnicodeObject *)substring, maxcount);
4434 else
4435 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4438 static char splitlines__doc__[] =
4439 "S.splitlines([keepends]]) -> list of strings\n\
4441 Return a list of the lines in S, breaking at line boundaries.\n\
4442 Line breaks are not included in the resulting list unless keepends\n\
4443 is given and true.";
4445 static PyObject*
4446 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4448 int keepends = 0;
4450 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4451 return NULL;
4453 return PyUnicode_Splitlines((PyObject *)self, keepends);
4456 static
4457 PyObject *unicode_str(PyUnicodeObject *self)
4459 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4462 static char strip__doc__[] =
4463 "S.strip() -> unicode\n\
4465 Return a copy of S with leading and trailing whitespace removed.";
4467 static PyObject *
4468 unicode_strip(PyUnicodeObject *self, PyObject *args)
4470 if (!PyArg_NoArgs(args))
4471 return NULL;
4472 return strip(self, 1, 1);
4475 static char swapcase__doc__[] =
4476 "S.swapcase() -> unicode\n\
4478 Return a copy of S with uppercase characters converted to lowercase\n\
4479 and vice versa.";
4481 static PyObject*
4482 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4484 if (!PyArg_NoArgs(args))
4485 return NULL;
4486 return fixup(self, fixswapcase);
4489 static char translate__doc__[] =
4490 "S.translate(table) -> unicode\n\
4492 Return a copy of the string S, where all characters have been mapped\n\
4493 through the given translation table, which must be a mapping of\n\
4494 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4495 are left untouched. Characters mapped to None are deleted.";
4497 static PyObject*
4498 unicode_translate(PyUnicodeObject *self, PyObject *args)
4500 PyObject *table;
4502 if (!PyArg_ParseTuple(args, "O:translate", &table))
4503 return NULL;
4504 return PyUnicode_TranslateCharmap(self->str,
4505 self->length,
4506 table,
4507 "ignore");
4510 static char upper__doc__[] =
4511 "S.upper() -> unicode\n\
4513 Return a copy of S converted to uppercase.";
4515 static PyObject*
4516 unicode_upper(PyUnicodeObject *self, PyObject *args)
4518 if (!PyArg_NoArgs(args))
4519 return NULL;
4520 return fixup(self, fixupper);
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string x with zeros on the left, to fill a field\n"
"of the specified width. The string x is never truncated.";

/* S.zfill(width): left-pad with '0' up to width characters.  If the
   original string started with '+' or '-', the sign is moved in front
   of the padding.  Currently compiled out. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;
    int width;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; propagate instead of dereferencing NULL */
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4569 static char startswith__doc__[] =
4570 "S.startswith(prefix[, start[, end]]) -> int\n\
4572 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4573 optional start, test S beginning at that position. With optional end, stop\n\
4574 comparing S at that position.";
4576 static PyObject *
4577 unicode_startswith(PyUnicodeObject *self,
4578 PyObject *args)
4580 PyUnicodeObject *substring;
4581 int start = 0;
4582 int end = INT_MAX;
4583 PyObject *result;
4585 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4586 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4587 return NULL;
4588 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4589 (PyObject *)substring);
4590 if (substring == NULL)
4591 return NULL;
4593 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4595 Py_DECREF(substring);
4596 return result;
4600 static char endswith__doc__[] =
4601 "S.endswith(suffix[, start[, end]]) -> int\n\
4603 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4604 optional start, test S beginning at that position. With optional end, stop\n\
4605 comparing S at that position.";
4607 static PyObject *
4608 unicode_endswith(PyUnicodeObject *self,
4609 PyObject *args)
4611 PyUnicodeObject *substring;
4612 int start = 0;
4613 int end = INT_MAX;
4614 PyObject *result;
4616 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4617 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4618 return NULL;
4619 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4620 (PyObject *)substring);
4621 if (substring == NULL)
4622 return NULL;
4624 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4626 Py_DECREF(substring);
4627 return result;
4631 static PyMethodDef unicode_methods[] = {
4633 /* Order is according to common usage: often used methods should
4634 appear first, since lookup is done sequentially. */
4636 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4637 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4638 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4639 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4640 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4641 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4642 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4643 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4644 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4645 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4646 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4647 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4648 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4649 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4650 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4651 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4652 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4653 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4654 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4655 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4656 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4657 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4658 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4659 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4660 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4661 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4662 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4663 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4664 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4665 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4666 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4667 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4668 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4669 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4670 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4671 #if 0
4672 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4673 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4674 #endif
4676 #if 0
4677 /* This one is just used for debugging the implementation. */
4678 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4679 #endif
4681 {NULL, NULL}
4684 static PySequenceMethods unicode_as_sequence = {
4685 (inquiry) unicode_length, /* sq_length */
4686 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4687 (intargfunc) unicode_repeat, /* sq_repeat */
4688 (intargfunc) unicode_getitem, /* sq_item */
4689 (intintargfunc) unicode_slice, /* sq_slice */
4690 0, /* sq_ass_item */
4691 0, /* sq_ass_slice */
4692 (objobjproc)PyUnicode_Contains, /*sq_contains*/
4695 static int
4696 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4697 int index,
4698 const void **ptr)
4700 if (index != 0) {
4701 PyErr_SetString(PyExc_SystemError,
4702 "accessing non-existent unicode segment");
4703 return -1;
4705 *ptr = (void *) self->str;
4706 return PyUnicode_GET_DATA_SIZE(self);
4709 static int
4710 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4711 const void **ptr)
4713 PyErr_SetString(PyExc_TypeError,
4714 "cannot use unicode as modifyable buffer");
4715 return -1;
4718 static int
4719 unicode_buffer_getsegcount(PyUnicodeObject *self,
4720 int *lenp)
4722 if (lenp)
4723 *lenp = PyUnicode_GET_DATA_SIZE(self);
4724 return 1;
4727 static int
4728 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4729 int index,
4730 const void **ptr)
4732 PyObject *str;
4734 if (index != 0) {
4735 PyErr_SetString(PyExc_SystemError,
4736 "accessing non-existent unicode segment");
4737 return -1;
4739 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4740 if (str == NULL)
4741 return -1;
4742 *ptr = (void *) PyString_AS_STRING(str);
4743 return PyString_GET_SIZE(str);
4746 /* Helpers for PyUnicode_Format() */
4748 static PyObject *
4749 getnextarg(PyObject *args, int arglen, int *p_argidx)
4751 int argidx = *p_argidx;
4752 if (argidx < arglen) {
4753 (*p_argidx)++;
4754 if (arglen < 0)
4755 return args;
4756 else
4757 return PyTuple_GetItem(args, argidx);
4759 PyErr_SetString(PyExc_TypeError,
4760 "not enough arguments for format string");
4761 return NULL;
/* Bit flags collected from the flag characters of a format specifier
   in PyUnicode_Format(). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4770 static
4771 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4773 register int i;
4774 int len;
4775 va_list va;
4776 char *charbuffer;
4777 va_start(va, format);
4779 /* First, format the string as char array, then expand to Py_UNICODE
4780 array. */
4781 charbuffer = (char *)buffer;
4782 len = vsprintf(charbuffer, format, va);
4783 for (i = len - 1; i >= 0; i--)
4784 buffer[i] = (Py_UNICODE) charbuffer[i];
4786 va_end(va);
4787 return len;
4790 static int
4791 formatfloat(Py_UNICODE *buf,
4792 size_t buflen,
4793 int flags,
4794 int prec,
4795 int type,
4796 PyObject *v)
4798 /* fmt = '%#.' + `prec` + `type`
4799 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4800 char fmt[20];
4801 double x;
4803 x = PyFloat_AsDouble(v);
4804 if (x == -1.0 && PyErr_Occurred())
4805 return -1;
4806 if (prec < 0)
4807 prec = 6;
4808 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4809 type = 'g';
4810 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4811 /* worst case length calc to ensure no buffer overrun:
4812 fmt = %#.<prec>g
4813 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4814 for any double rep.)
4815 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4816 If prec=0 the effective precision is 1 (the leading digit is
4817 always given), therefore increase by one to 10+prec. */
4818 if (buflen <= (size_t)10 + (size_t)prec) {
4819 PyErr_SetString(PyExc_OverflowError,
4820 "formatted float is too long (precision too long?)");
4821 return -1;
4823 return usprintf(buf, fmt, x);
4826 static PyObject*
4827 formatlong(PyObject *val, int flags, int prec, int type)
4829 char *buf;
4830 int i, len;
4831 PyObject *str; /* temporary string object. */
4832 PyUnicodeObject *result;
4834 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4835 if (!str)
4836 return NULL;
4837 result = _PyUnicode_New(len);
4838 for (i = 0; i < len; i++)
4839 result->str[i] = buf[i];
4840 result->str[len] = 0;
4841 Py_DECREF(str);
4842 return (PyObject*)result;
4845 static int
4846 formatint(Py_UNICODE *buf,
4847 size_t buflen,
4848 int flags,
4849 int prec,
4850 int type,
4851 PyObject *v)
4853 /* fmt = '%#.' + `prec` + 'l' + `type`
4854 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4855 + 1 + 1 = 24*/
4856 char fmt[64]; /* plenty big enough! */
4857 long x;
4858 int use_native_c_format = 1;
4860 x = PyInt_AsLong(v);
4861 if (x == -1 && PyErr_Occurred())
4862 return -1;
4863 if (prec < 0)
4864 prec = 1;
4865 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4866 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4867 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4868 PyErr_SetString(PyExc_OverflowError,
4869 "formatted integer is too long (precision too long?)");
4870 return -1;
4872 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4873 * but we want it (for consistency with other %#x conversions, and
4874 * for consistency with Python's hex() function).
4875 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4876 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4877 * So add it only if the platform doesn't already.
4879 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4880 /* Only way to know what the platform does is to try it. */
4881 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4882 if (fmt[1] != (char)type) {
4883 /* Supply our own leading 0x/0X -- needed under std C */
4884 use_native_c_format = 0;
4885 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4888 if (use_native_c_format)
4889 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4890 return usprintf(buf, fmt, x);
4893 static int
4894 formatchar(Py_UNICODE *buf,
4895 size_t buflen,
4896 PyObject *v)
4898 /* presume that the buffer is at least 2 characters long */
4899 if (PyUnicode_Check(v)) {
4900 if (PyUnicode_GET_SIZE(v) != 1)
4901 goto onError;
4902 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4905 else if (PyString_Check(v)) {
4906 if (PyString_GET_SIZE(v) != 1)
4907 goto onError;
4908 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4911 else {
4912 /* Integer input truncated to a character */
4913 long x;
4914 x = PyInt_AsLong(v);
4915 if (x == -1 && PyErr_Occurred())
4916 goto onError;
4917 buf[0] = (char) x;
4919 buf[1] = '\0';
4920 return 1;
4922 onError:
4923 PyErr_SetString(PyExc_TypeError,
4924 "%c requires int or char");
4925 return -1;
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the element count of the scratch buffer into which
   floats, ints and chars are formatted.  XXX It is a magic number.
   Every formatting routine bounds-checks against it so the buffer
   cannot overflow; allocating a buffer of the right size per format
   might be the better solution, but the current one is sufficient for
   now.
*/
#define FORMATBUFLEN ((size_t)120)
4938 PyObject *PyUnicode_Format(PyObject *format,
4939 PyObject *args)
4941 Py_UNICODE *fmt, *res;
4942 int fmtcnt, rescnt, reslen, arglen, argidx;
4943 int args_owned = 0;
4944 PyUnicodeObject *result = NULL;
4945 PyObject *dict = NULL;
4946 PyObject *uformat;
4948 if (format == NULL || args == NULL) {
4949 PyErr_BadInternalCall();
4950 return NULL;
4952 uformat = PyUnicode_FromObject(format);
4953 if (uformat == NULL)
4954 return NULL;
4955 fmt = PyUnicode_AS_UNICODE(uformat);
4956 fmtcnt = PyUnicode_GET_SIZE(uformat);
4958 reslen = rescnt = fmtcnt + 100;
4959 result = _PyUnicode_New(reslen);
4960 if (result == NULL)
4961 goto onError;
4962 res = PyUnicode_AS_UNICODE(result);
4964 if (PyTuple_Check(args)) {
4965 arglen = PyTuple_Size(args);
4966 argidx = 0;
4968 else {
4969 arglen = -1;
4970 argidx = -2;
4972 if (args->ob_type->tp_as_mapping)
4973 dict = args;
4975 while (--fmtcnt >= 0) {
4976 if (*fmt != '%') {
4977 if (--rescnt < 0) {
4978 rescnt = fmtcnt + 100;
4979 reslen += rescnt;
4980 if (_PyUnicode_Resize(&result, reslen) < 0)
4981 return NULL;
4982 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4983 --rescnt;
4985 *res++ = *fmt++;
4987 else {
4988 /* Got a format specifier */
4989 int flags = 0;
4990 int width = -1;
4991 int prec = -1;
4992 Py_UNICODE c = '\0';
4993 Py_UNICODE fill;
4994 PyObject *v = NULL;
4995 PyObject *temp = NULL;
4996 Py_UNICODE *pbuf;
4997 Py_UNICODE sign;
4998 int len;
4999 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5001 fmt++;
5002 if (*fmt == '(') {
5003 Py_UNICODE *keystart;
5004 int keylen;
5005 PyObject *key;
5006 int pcount = 1;
5008 if (dict == NULL) {
5009 PyErr_SetString(PyExc_TypeError,
5010 "format requires a mapping");
5011 goto onError;
5013 ++fmt;
5014 --fmtcnt;
5015 keystart = fmt;
5016 /* Skip over balanced parentheses */
5017 while (pcount > 0 && --fmtcnt >= 0) {
5018 if (*fmt == ')')
5019 --pcount;
5020 else if (*fmt == '(')
5021 ++pcount;
5022 fmt++;
5024 keylen = fmt - keystart - 1;
5025 if (fmtcnt < 0 || pcount > 0) {
5026 PyErr_SetString(PyExc_ValueError,
5027 "incomplete format key");
5028 goto onError;
5030 /* keys are converted to strings using UTF-8 and
5031 then looked up since Python uses strings to hold
5032 variables names etc. in its namespaces and we
5033 wouldn't want to break common idioms. */
5034 key = PyUnicode_EncodeUTF8(keystart,
5035 keylen,
5036 NULL);
5037 if (key == NULL)
5038 goto onError;
5039 if (args_owned) {
5040 Py_DECREF(args);
5041 args_owned = 0;
5043 args = PyObject_GetItem(dict, key);
5044 Py_DECREF(key);
5045 if (args == NULL) {
5046 goto onError;
5048 args_owned = 1;
5049 arglen = -1;
5050 argidx = -2;
5052 while (--fmtcnt >= 0) {
5053 switch (c = *fmt++) {
5054 case '-': flags |= F_LJUST; continue;
5055 case '+': flags |= F_SIGN; continue;
5056 case ' ': flags |= F_BLANK; continue;
5057 case '#': flags |= F_ALT; continue;
5058 case '0': flags |= F_ZERO; continue;
5060 break;
5062 if (c == '*') {
5063 v = getnextarg(args, arglen, &argidx);
5064 if (v == NULL)
5065 goto onError;
5066 if (!PyInt_Check(v)) {
5067 PyErr_SetString(PyExc_TypeError,
5068 "* wants int");
5069 goto onError;
5071 width = PyInt_AsLong(v);
5072 if (width < 0) {
5073 flags |= F_LJUST;
5074 width = -width;
5076 if (--fmtcnt >= 0)
5077 c = *fmt++;
5079 else if (c >= '0' && c <= '9') {
5080 width = c - '0';
5081 while (--fmtcnt >= 0) {
5082 c = *fmt++;
5083 if (c < '0' || c > '9')
5084 break;
5085 if ((width*10) / 10 != width) {
5086 PyErr_SetString(PyExc_ValueError,
5087 "width too big");
5088 goto onError;
5090 width = width*10 + (c - '0');
5093 if (c == '.') {
5094 prec = 0;
5095 if (--fmtcnt >= 0)
5096 c = *fmt++;
5097 if (c == '*') {
5098 v = getnextarg(args, arglen, &argidx);
5099 if (v == NULL)
5100 goto onError;
5101 if (!PyInt_Check(v)) {
5102 PyErr_SetString(PyExc_TypeError,
5103 "* wants int");
5104 goto onError;
5106 prec = PyInt_AsLong(v);
5107 if (prec < 0)
5108 prec = 0;
5109 if (--fmtcnt >= 0)
5110 c = *fmt++;
5112 else if (c >= '0' && c <= '9') {
5113 prec = c - '0';
5114 while (--fmtcnt >= 0) {
5115 c = Py_CHARMASK(*fmt++);
5116 if (c < '0' || c > '9')
5117 break;
5118 if ((prec*10) / 10 != prec) {
5119 PyErr_SetString(PyExc_ValueError,
5120 "prec too big");
5121 goto onError;
5123 prec = prec*10 + (c - '0');
5126 } /* prec */
5127 if (fmtcnt >= 0) {
5128 if (c == 'h' || c == 'l' || c == 'L') {
5129 if (--fmtcnt >= 0)
5130 c = *fmt++;
5133 if (fmtcnt < 0) {
5134 PyErr_SetString(PyExc_ValueError,
5135 "incomplete format");
5136 goto onError;
5138 if (c != '%') {
5139 v = getnextarg(args, arglen, &argidx);
5140 if (v == NULL)
5141 goto onError;
5143 sign = 0;
5144 fill = ' ';
5145 switch (c) {
5147 case '%':
5148 pbuf = formatbuf;
5149 /* presume that buffer length is at least 1 */
5150 pbuf[0] = '%';
5151 len = 1;
5152 break;
5154 case 's':
5155 case 'r':
5156 if (PyUnicode_Check(v) && c == 's') {
5157 temp = v;
5158 Py_INCREF(temp);
5160 else {
5161 PyObject *unicode;
5162 if (c == 's')
5163 temp = PyObject_Str(v);
5164 else
5165 temp = PyObject_Repr(v);
5166 if (temp == NULL)
5167 goto onError;
5168 if (!PyString_Check(temp)) {
5169 /* XXX Note: this should never happen, since
5170 PyObject_Repr() and PyObject_Str() assure
5171 this */
5172 Py_DECREF(temp);
5173 PyErr_SetString(PyExc_TypeError,
5174 "%s argument has non-string str()");
5175 goto onError;
5177 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5178 PyString_GET_SIZE(temp),
5179 NULL,
5180 "strict");
5181 Py_DECREF(temp);
5182 temp = unicode;
5183 if (temp == NULL)
5184 goto onError;
5186 pbuf = PyUnicode_AS_UNICODE(temp);
5187 len = PyUnicode_GET_SIZE(temp);
5188 if (prec >= 0 && len > prec)
5189 len = prec;
5190 break;
5192 case 'i':
5193 case 'd':
5194 case 'u':
5195 case 'o':
5196 case 'x':
5197 case 'X':
5198 if (c == 'i')
5199 c = 'd';
5200 if (PyLong_Check(v)) {
5201 temp = formatlong(v, flags, prec, c);
5202 if (!temp)
5203 goto onError;
5204 pbuf = PyUnicode_AS_UNICODE(temp);
5205 len = PyUnicode_GET_SIZE(temp);
5206 /* unbounded ints can always produce
5207 a sign character! */
5208 sign = 1;
5210 else {
5211 pbuf = formatbuf;
5212 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5213 flags, prec, c, v);
5214 if (len < 0)
5215 goto onError;
5216 /* only d conversion is signed */
5217 sign = c == 'd';
5219 if (flags & F_ZERO)
5220 fill = '0';
5221 break;
5223 case 'e':
5224 case 'E':
5225 case 'f':
5226 case 'g':
5227 case 'G':
5228 pbuf = formatbuf;
5229 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5230 flags, prec, c, v);
5231 if (len < 0)
5232 goto onError;
5233 sign = 1;
5234 if (flags & F_ZERO)
5235 fill = '0';
5236 break;
5238 case 'c':
5239 pbuf = formatbuf;
5240 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5241 if (len < 0)
5242 goto onError;
5243 break;
5245 default:
5246 PyErr_Format(PyExc_ValueError,
5247 "unsupported format character '%c' (0x%x) "
5248 "at index %i",
5249 (31<=c && c<=126) ? c : '?',
5250 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5251 goto onError;
5253 if (sign) {
5254 if (*pbuf == '-' || *pbuf == '+') {
5255 sign = *pbuf++;
5256 len--;
5258 else if (flags & F_SIGN)
5259 sign = '+';
5260 else if (flags & F_BLANK)
5261 sign = ' ';
5262 else
5263 sign = 0;
5265 if (width < len)
5266 width = len;
5267 if (rescnt < width + (sign != 0)) {
5268 reslen -= rescnt;
5269 rescnt = width + fmtcnt + 100;
5270 reslen += rescnt;
5271 if (_PyUnicode_Resize(&result, reslen) < 0)
5272 return NULL;
5273 res = PyUnicode_AS_UNICODE(result)
5274 + reslen - rescnt;
5276 if (sign) {
5277 if (fill != ' ')
5278 *res++ = sign;
5279 rescnt--;
5280 if (width > len)
5281 width--;
5283 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5284 assert(pbuf[0] == '0');
5285 assert(pbuf[1] == c);
5286 if (fill != ' ') {
5287 *res++ = *pbuf++;
5288 *res++ = *pbuf++;
5290 rescnt -= 2;
5291 width -= 2;
5292 if (width < 0)
5293 width = 0;
5294 len -= 2;
5296 if (width > len && !(flags & F_LJUST)) {
5297 do {
5298 --rescnt;
5299 *res++ = fill;
5300 } while (--width > len);
5302 if (fill == ' ') {
5303 if (sign)
5304 *res++ = sign;
5305 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5306 assert(pbuf[0] == '0');
5307 assert(pbuf[1] == c);
5308 *res++ = *pbuf++;
5309 *res++ = *pbuf++;
5312 Py_UNICODE_COPY(res, pbuf, len);
5313 res += len;
5314 rescnt -= len;
5315 while (--width >= len) {
5316 --rescnt;
5317 *res++ = ' ';
5319 if (dict && (argidx < arglen) && c != '%') {
5320 PyErr_SetString(PyExc_TypeError,
5321 "not all arguments converted");
5322 goto onError;
5324 Py_XDECREF(temp);
5325 } /* '%' */
5326 } /* until end */
5327 if (argidx < arglen && !dict) {
5328 PyErr_SetString(PyExc_TypeError,
5329 "not all arguments converted");
5330 goto onError;
5333 if (args_owned) {
5334 Py_DECREF(args);
5336 Py_DECREF(uformat);
5337 if (_PyUnicode_Resize(&result, reslen - rescnt))
5338 goto onError;
5339 return (PyObject *)result;
5341 onError:
5342 Py_XDECREF(result);
5343 Py_DECREF(uformat);
5344 if (args_owned) {
5345 Py_DECREF(args);
5347 return NULL;
5350 static PyBufferProcs unicode_as_buffer = {
5351 (getreadbufferproc) unicode_buffer_getreadbuf,
5352 (getwritebufferproc) unicode_buffer_getwritebuf,
5353 (getsegcountproc) unicode_buffer_getsegcount,
5354 (getcharbufferproc) unicode_buffer_getcharbuf,
5357 static PyObject *
5358 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5360 PyObject *x = NULL;
5361 static char *kwlist[] = {"string", "encoding", "errors", 0};
5362 char *encoding = NULL;
5363 char *errors = NULL;
5365 assert(type == &PyUnicode_Type);
5366 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5367 kwlist, &x, &encoding, &errors))
5368 return NULL;
5369 if (x == NULL)
5370 return (PyObject *)_PyUnicode_New(0);
5371 return PyUnicode_FromEncodedObject(x, encoding, errors);
/* Docstring of the unicode type (tp_doc). */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n"
"\n"
"Create a new Unicode object from the given encoded string.\n"
"encoding defaults to the current default string encoding and \n"
"errors, defining the error handling, to 'strict'.";
5381 PyTypeObject PyUnicode_Type = {
5382 PyObject_HEAD_INIT(&PyType_Type)
5383 0, /* ob_size */
5384 "unicode", /* tp_name */
5385 sizeof(PyUnicodeObject), /* tp_size */
5386 0, /* tp_itemsize */
5387 /* Slots */
5388 (destructor)_PyUnicode_Free, /* tp_dealloc */
5389 0, /* tp_print */
5390 0, /* tp_getattr */
5391 0, /* tp_setattr */
5392 (cmpfunc) unicode_compare, /* tp_compare */
5393 (reprfunc) unicode_repr, /* tp_repr */
5394 0, /* tp_as_number */
5395 &unicode_as_sequence, /* tp_as_sequence */
5396 0, /* tp_as_mapping */
5397 (hashfunc) unicode_hash, /* tp_hash*/
5398 0, /* tp_call*/
5399 (reprfunc) unicode_str, /* tp_str */
5400 PyObject_GenericGetAttr, /* tp_getattro */
5401 0, /* tp_setattro */
5402 &unicode_as_buffer, /* tp_as_buffer */
5403 Py_TPFLAGS_DEFAULT, /* tp_flags */
5404 unicode_doc, /* tp_doc */
5405 0, /* tp_traverse */
5406 0, /* tp_clear */
5407 0, /* tp_richcompare */
5408 0, /* tp_weaklistoffset */
5409 0, /* tp_iter */
5410 0, /* tp_iternext */
5411 unicode_methods, /* tp_methods */
5412 0, /* tp_members */
5413 0, /* tp_getset */
5414 0, /* tp_base */
5415 0, /* tp_dict */
5416 0, /* tp_descr_get */
5417 0, /* tp_descr_set */
5418 0, /* tp_dictoffset */
5419 0, /* tp_init */
5420 0, /* tp_alloc */
5421 unicode_new, /* tp_new */
5424 /* Initialize the Unicode implementation */
5426 void _PyUnicode_Init(void)
5428 int i;
5430 /* Init the implementation */
5431 unicode_freelist = NULL;
5432 unicode_freelist_size = 0;
5433 unicode_empty = _PyUnicode_New(0);
5434 strcpy(unicode_default_encoding, "ascii");
5435 for (i = 0; i < 256; i++)
5436 unicode_latin1[i] = NULL;
5439 /* Finalize the Unicode implementation */
5441 void
5442 _PyUnicode_Fini(void)
5444 PyUnicodeObject *u;
5445 int i;
5447 Py_XDECREF(unicode_empty);
5448 unicode_empty = NULL;
5450 for (i = 0; i < 256; i++) {
5451 if (unicode_latin1[i]) {
5452 Py_DECREF(unicode_latin1[i]);
5453 unicode_latin1[i] = NULL;
5457 for (u = unicode_freelist; u != NULL;) {
5458 PyUnicodeObject *v = u;
5459 u = *(PyUnicodeObject **)u;
5460 if (v->str)
5461 PyMem_DEL(v->str);
5462 Py_XDECREF(v->defenc);
5463 PyObject_DEL(v);
5465 unicode_freelist = NULL;
5466 unicode_freelist_size = 0;