3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
95 static PyUnicodeObject
*unicode_latin1
[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding
[100];
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 unicode
->str
[0] < 256 &&
136 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
137 PyErr_SetString(PyExc_SystemError
,
138 "can't resize shared unicode objects");
142 /* We allocate one more Py_UNICODE element (not just one byte) to make
143 sure the string is Ux0000 terminated -- XXX is this needed ? */
144 oldstr
= unicode
->str
;
145 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
147 unicode
->str
= oldstr
;
151 unicode
->str
[length
] = 0;
152 unicode
->length
= length
;
155 /* Reset the object caches */
156 if (unicode
->defenc
) {
157 Py_DECREF(unicode
->defenc
);
158 unicode
->defenc
= NULL
;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
174 PyUnicodeObject
*_PyUnicode_New(int length
)
176 register PyUnicodeObject
*unicode
;
178 /* Optimization for empty strings */
179 if (length
== 0 && unicode_empty
!= NULL
) {
180 Py_INCREF(unicode_empty
);
181 return unicode_empty
;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist
) {
186 unicode
= unicode_freelist
;
187 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
188 unicode_freelist_size
--;
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode
->length
< length
) &&
193 unicode_resize(unicode
, length
)) {
194 PyMem_DEL(unicode
->str
);
199 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
201 PyObject_INIT(unicode
, &PyUnicode_Type
);
204 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
207 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
214 unicode
->str
[length
] = 0;
215 unicode
->length
= length
;
217 unicode
->defenc
= NULL
;
221 _Py_ForgetReference((PyObject
*)unicode
);
222 PyObject_DEL(unicode
);
227 void _PyUnicode_Free(register PyUnicodeObject
*unicode
)
229 if (unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
230 /* Keep-Alive optimization */
231 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
232 PyMem_DEL(unicode
->str
);
236 if (unicode
->defenc
) {
237 Py_DECREF(unicode
->defenc
);
238 unicode
->defenc
= NULL
;
240 /* Add to free list */
241 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
242 unicode_freelist
= unicode
;
243 unicode_freelist_size
++;
246 PyMem_DEL(unicode
->str
);
247 Py_XDECREF(unicode
->defenc
);
248 PyObject_DEL(unicode
);
252 int PyUnicode_Resize(PyObject
**unicode
,
255 register PyUnicodeObject
*v
;
257 /* Argument checks */
258 if (unicode
== NULL
) {
259 PyErr_BadInternalCall();
262 v
= (PyUnicodeObject
*)*unicode
;
263 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
264 PyErr_BadInternalCall();
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v
->length
!= length
&&
272 (v
== unicode_empty
|| v
->length
== 1)) {
273 PyUnicodeObject
*w
= _PyUnicode_New(length
);
276 Py_UNICODE_COPY(w
->str
, v
->str
,
277 length
< v
->length
? length
: v
->length
);
278 *unicode
= (PyObject
*)w
;
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v
, length
);
287 /* Internal API for use in unicodeobject.c only ! */
288 #define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
294 PyUnicodeObject
*unicode
;
296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
300 /* Optimization for empty strings */
301 if (size
== 0 && unicode_empty
!= NULL
) {
302 Py_INCREF(unicode_empty
);
303 return (PyObject
*)unicode_empty
;
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size
== 1 && *u
< 256) {
309 unicode
= unicode_latin1
[*u
];
311 unicode
= _PyUnicode_New(1);
314 unicode
->str
[0] = *u
;
315 unicode_latin1
[*u
] = unicode
;
318 return (PyObject
*)unicode
;
322 unicode
= _PyUnicode_New(size
);
326 /* Copy the Unicode data into the new object */
328 Py_UNICODE_COPY(unicode
->str
, u
, size
);
330 return (PyObject
*)unicode
;
335 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
338 PyUnicodeObject
*unicode
;
341 PyErr_BadInternalCall();
345 unicode
= _PyUnicode_New(size
);
349 /* Copy the wchar_t data into the new object */
350 #ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
354 register Py_UNICODE
*u
;
356 u
= PyUnicode_AS_UNICODE(unicode
);
357 for (i
= size
; i
>= 0; i
--)
362 return (PyObject
*)unicode
;
365 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
369 if (unicode
== NULL
) {
370 PyErr_BadInternalCall();
373 if (size
> PyUnicode_GET_SIZE(unicode
))
374 size
= PyUnicode_GET_SIZE(unicode
);
375 #ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
379 register Py_UNICODE
*u
;
381 u
= PyUnicode_AS_UNICODE(unicode
);
382 for (i
= size
; i
>= 0; i
--)
392 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
394 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
397 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
398 const char *encoding
,
407 PyErr_BadInternalCall();
412 if (PyInstance_Check(obj
)) {
414 func
= PyObject_GetAttrString(obj
, "__str__");
416 PyErr_SetString(PyExc_TypeError
,
417 "coercing to Unicode: instance doesn't define __str__");
420 obj
= PyEval_CallObject(func
, NULL
);
426 if (PyUnicode_Check(obj
)) {
430 PyErr_SetString(PyExc_TypeError
,
431 "decoding Unicode is not supported");
436 else if (PyString_Check(obj
)) {
437 s
= PyString_AS_STRING(obj
);
438 len
= PyString_GET_SIZE(obj
);
440 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError
))
444 PyErr_Format(PyExc_TypeError
,
445 "coercing to Unicode: need string or buffer, "
447 obj
->ob_type
->tp_name
);
451 /* Convert to Unicode */
453 Py_INCREF(unicode_empty
);
454 v
= (PyObject
*)unicode_empty
;
457 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
472 PyObject
*PyUnicode_Decode(const char *s
,
474 const char *encoding
,
477 PyObject
*buffer
= NULL
, *unicode
;
479 if (encoding
== NULL
)
480 encoding
= PyUnicode_GetDefaultEncoding();
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding
, "utf-8") == 0)
484 return PyUnicode_DecodeUTF8(s
, size
, errors
);
485 else if (strcmp(encoding
, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s
, size
, errors
);
487 else if (strcmp(encoding
, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s
, size
, errors
);
490 /* Decode via the codec registry */
491 buffer
= PyBuffer_FromMemory((void *)s
, size
);
494 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
497 if (!PyUnicode_Check(unicode
)) {
498 PyErr_Format(PyExc_TypeError
,
499 "decoder did not return an unicode object (type=%.400s)",
500 unicode
->ob_type
->tp_name
);
512 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
514 const char *encoding
,
517 PyObject
*v
, *unicode
;
519 unicode
= PyUnicode_FromUnicode(s
, size
);
522 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
527 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
528 const char *encoding
,
533 if (!PyUnicode_Check(unicode
)) {
538 if (encoding
== NULL
)
539 encoding
= PyUnicode_GetDefaultEncoding();
541 /* Shortcuts for common default encodings */
542 if (errors
== NULL
) {
543 if (strcmp(encoding
, "utf-8") == 0)
544 return PyUnicode_AsUTF8String(unicode
);
545 else if (strcmp(encoding
, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode
);
547 else if (strcmp(encoding
, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode
);
551 /* Encode via the codec registry */
552 v
= PyCodec_Encode(unicode
, encoding
, errors
);
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v
)) {
557 PyErr_Format(PyExc_TypeError
,
558 "encoder did not return a string object (type=%.400s)",
559 v
->ob_type
->tp_name
);
569 /* Return a Python string holding the default encoded value of the
572 The resulting string is cached in the Unicode object for subsequent
573 usage by this function. The cached version is needed to implement
574 the character buffer interface and will live (at least) as long as
575 the Unicode object itself.
577 The refcount of the string is *not* incremented.
579 *** Exported for internal use by the interpreter only !!! ***
583 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
586 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
590 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
591 if (v
&& errors
== NULL
)
592 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
596 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
598 if (!PyUnicode_Check(unicode
)) {
602 return PyUnicode_AS_UNICODE(unicode
);
608 int PyUnicode_GetSize(PyObject
*unicode
)
610 if (!PyUnicode_Check(unicode
)) {
614 return PyUnicode_GET_SIZE(unicode
);
620 const char *PyUnicode_GetDefaultEncoding(void)
622 return unicode_default_encoding
;
625 int PyUnicode_SetDefaultEncoding(const char *encoding
)
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v
= _PyCodec_Lookup(encoding
);
635 strncpy(unicode_default_encoding
,
637 sizeof(unicode_default_encoding
));
644 /* --- UTF-8 Codec -------------------------------------------------------- */
647 char utf8_code_length
[256] = {
648 /* Map UTF-8 encoded prefix byte to sequence length. zero means
649 illegal prefix. see RFC 2279 for details */
650 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
651 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
652 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
653 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
654 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
655 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
656 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
657 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
658 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
660 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
662 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
663 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
664 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
665 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
669 int utf8_decoding_error(const char **source
,
674 if ((errors
== NULL
) ||
675 (strcmp(errors
,"strict") == 0)) {
676 PyErr_Format(PyExc_UnicodeError
,
677 "UTF-8 decoding error: %.400s",
681 else if (strcmp(errors
,"ignore") == 0) {
685 else if (strcmp(errors
,"replace") == 0) {
687 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
692 PyErr_Format(PyExc_ValueError
,
693 "UTF-8 decoding error; unknown error handling code: %.400s",
699 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
705 PyUnicodeObject
*unicode
;
707 const char *errmsg
= "";
709 /* Note: size will always be longer than the resulting Unicode
711 unicode
= _PyUnicode_New(size
);
715 return (PyObject
*)unicode
;
717 /* Unpack UTF-8 encoded data */
722 Py_UCS4 ch
= (unsigned char)*s
;
725 *p
++ = (Py_UNICODE
)ch
;
730 n
= utf8_code_length
[ch
];
733 errmsg
= "unexpected end of data";
740 errmsg
= "unexpected code byte";
744 errmsg
= "internal error";
748 if ((s
[1] & 0xc0) != 0x80) {
749 errmsg
= "invalid data";
752 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
754 errmsg
= "illegal encoding";
758 *p
++ = (Py_UNICODE
)ch
;
762 if ((s
[1] & 0xc0) != 0x80 ||
763 (s
[2] & 0xc0) != 0x80) {
764 errmsg
= "invalid data";
767 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
768 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
769 errmsg
= "illegal encoding";
773 *p
++ = (Py_UNICODE
)ch
;
777 if ((s
[1] & 0xc0) != 0x80 ||
778 (s
[2] & 0xc0) != 0x80 ||
779 (s
[3] & 0xc0) != 0x80) {
780 errmsg
= "invalid data";
783 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
784 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
785 /* validate and convert to UTF-16 */
786 if ((ch
< 0x10000) /* minimum value allowed for 4
788 || (ch
> 0x10ffff)) /* maximum value allowed for
791 errmsg
= "illegal encoding";
794 #ifdef Py_UNICODE_WIDE
795 *p
++ = (Py_UNICODE
)ch
;
797 /* compute and append the two surrogates: */
799 /* translate from 10000..10FFFF to 0..FFFF */
802 /* high surrogate = top 10 bits added to D800 */
803 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
805 /* low surrogate = bottom 10 bits added to DC00 */
806 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
811 /* Other sizes are only needed for UCS-4 */
812 errmsg
= "unsupported Unicode code range";
819 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
824 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
827 return (PyObject
*)unicode
;
834 /* Not used anymore, now that the encoder supports UTF-16
838 int utf8_encoding_error(const Py_UNICODE
**source
,
843 if ((errors
== NULL
) ||
844 (strcmp(errors
,"strict") == 0)) {
845 PyErr_Format(PyExc_UnicodeError
,
846 "UTF-8 encoding error: %.400s",
850 else if (strcmp(errors
,"ignore") == 0) {
853 else if (strcmp(errors
,"replace") == 0) {
859 PyErr_Format(PyExc_ValueError
,
860 "UTF-8 encoding error; "
861 "unknown error handling code: %.400s",
868 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
876 unsigned int cbAllocated
= 3 * size
;
877 unsigned int cbWritten
= 0;
880 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
886 p
= q
= PyString_AS_STRING(v
);
893 else if (ch
< 0x0800) {
894 *p
++ = 0xc0 | (ch
>> 6);
895 *p
++ = 0x80 | (ch
& 0x3f);
898 else if (ch
< 0x10000) {
899 /* Check for high surrogate */
900 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
903 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
905 if (cbWritten
>= (cbAllocated
- 4)) {
906 /* Provide enough room for some more
909 if (_PyString_Resize(&v
, cbAllocated
))
913 /* combine the two values */
914 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
916 *p
++ = (char)((ch
>> 18) | 0xf0);
917 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
924 *p
++ = (char)(0xe0 | (ch
>> 12));
927 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
928 *p
++ = (char)(0x80 | (ch
& 0x3f));
930 *p
++ = 0xf0 | (ch
>>18);
931 *p
++ = 0x80 | ((ch
>>12) & 0x3f);
932 *p
++ = 0x80 | ((ch
>>6) & 0x3f);
933 *p
++ = 0x80 | (ch
& 0x3f);
938 if (_PyString_Resize(&v
, p
- q
))
947 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
949 if (!PyUnicode_Check(unicode
)) {
953 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
954 PyUnicode_GET_SIZE(unicode
),
958 /* --- UTF-16 Codec ------------------------------------------------------- */
961 int utf16_decoding_error(const Py_UCS2
**source
,
966 if ((errors
== NULL
) ||
967 (strcmp(errors
,"strict") == 0)) {
968 PyErr_Format(PyExc_UnicodeError
,
969 "UTF-16 decoding error: %.400s",
973 else if (strcmp(errors
,"ignore") == 0) {
976 else if (strcmp(errors
,"replace") == 0) {
978 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
984 PyErr_Format(PyExc_ValueError
,
985 "UTF-16 decoding error; "
986 "unknown error handling code: %.400s",
992 PyObject
*PyUnicode_DecodeUTF16(const char *s
,
997 PyUnicodeObject
*unicode
;
999 const Py_UCS2
*q
, *e
;
1001 const char *errmsg
= "";
1003 /* size should be an even number */
1004 if (size
% sizeof(Py_UCS2
) != 0) {
1005 if (utf16_decoding_error(NULL
, NULL
, errors
, "truncated data"))
1007 /* The remaining input chars are ignored if we fall through
1011 /* Note: size will always be longer than the resulting Unicode
1013 unicode
= _PyUnicode_New(size
);
1017 return (PyObject
*)unicode
;
1019 /* Unpack UTF-16 encoded data */
1022 e
= q
+ (size
/ sizeof(Py_UCS2
));
1027 /* Check for BOM marks (U+FEFF) in the input and adjust current
1028 byte order setting accordingly. In native mode, the leading BOM
1029 mark is skipped, in all other modes, it is copied to the output
1030 stream as-is (giving a ZWNBSP character). */
1032 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1036 } else if (*q
== 0xFFFE) {
1044 } else if (*q
== 0xFFFE) {
1052 register Py_UCS2 ch
= *q
++;
1054 /* Swap input bytes if needed. (This assumes that the Py_UCS2 code
1055 unit being swapped is exactly 16 bits wide, i.e. sizeof(Py_UCS2) == 2 !) */
1056 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1058 ch
= (ch
>> 8) | (ch
<< 8);
1061 ch
= (ch
>> 8) | (ch
<< 8);
1063 if (ch
< 0xD800 || ch
> 0xDFFF) {
1068 /* UTF-16 code pair: */
1070 errmsg
= "unexpected end of data";
1073 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1075 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1077 ch2
= (ch2
>> 8) | (ch2
<< 8);
1080 ch2
= (ch2
>> 8) | (ch2
<< 8);
1082 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1083 #ifndef Py_UNICODE_WIDE
1084 /* This is valid data (a UTF-16 surrogate pair), but
1085 we are not able to store this information since our
1086 Py_UNICODE type only has 16 bits... this might
1087 change someday, even though it's unlikely. */
1088 errmsg
= "code pairs are not supported";
1091 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1097 errmsg
= "illegal UTF-16 surrogate";
1102 errmsg
= "illegal encoding";
1103 /* Fall through to report the error */
1106 if (utf16_decoding_error(&q
, &p
, errors
, errmsg
))
1114 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1117 return (PyObject
*)unicode
;
1126 PyObject
*PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1134 int i
, pairs
, doswap
= 1;
1136 for (i
= pairs
= 0; i
< size
; i
++)
1137 if (s
[i
] >= 0x10000)
1139 v
= PyString_FromStringAndSize(NULL
,
1140 sizeof(Py_UCS2
) * (size
+ pairs
+ (byteorder
== 0)));
1144 q
= PyString_AS_STRING(v
);
1150 if (byteorder
== 0 ||
1151 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1158 while (size
-- > 0) {
1159 Py_UNICODE ch
= *s
++;
1161 if (ch
>= 0x10000) {
1162 ch2
= 0xDC00|((ch
-0x10000) & 0x3FF);
1163 ch
= 0xD800|((ch
-0x10000)>>10);
1166 *p
++ = (ch
>> 8) | (ch
<< 8);
1168 *p
++ = (ch2
>> 8) | (ch2
<< 8);
1178 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1180 if (!PyUnicode_Check(unicode
)) {
1181 PyErr_BadArgument();
1184 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1185 PyUnicode_GET_SIZE(unicode
),
1190 /* --- Unicode Escape Codec ----------------------------------------------- */
1193 int unicodeescape_decoding_error(const char **source
,
1196 const char *details
)
1198 if ((errors
== NULL
) ||
1199 (strcmp(errors
,"strict") == 0)) {
1200 PyErr_Format(PyExc_UnicodeError
,
1201 "Unicode-Escape decoding error: %.400s",
1205 else if (strcmp(errors
,"ignore") == 0) {
1208 else if (strcmp(errors
,"replace") == 0) {
1209 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1213 PyErr_Format(PyExc_ValueError
,
1214 "Unicode-Escape decoding error; "
1215 "unknown error handling code: %.400s",
1221 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1223 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1228 Py_UNICODE
*p
, *buf
;
1231 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1233 /* Escaped strings are always at least as long as the resulting
1234 Unicode string, so we start with size here and then reduce the
1235 length after conversion to the true value. */
1236 v
= _PyUnicode_New(size
);
1240 return (PyObject
*)v
;
1242 p
= buf
= PyUnicode_AS_UNICODE(v
);
1250 /* Non-escape characters are interpreted as Unicode ordinals */
1252 *p
++ = (unsigned char) *s
++;
1262 case '\\': *p
++ = '\\'; break;
1263 case '\'': *p
++ = '\''; break;
1264 case '\"': *p
++ = '\"'; break;
1265 case 'b': *p
++ = '\b'; break;
1266 case 'f': *p
++ = '\014'; break; /* FF */
1267 case 't': *p
++ = '\t'; break;
1268 case 'n': *p
++ = '\n'; break;
1269 case 'r': *p
++ = '\r'; break;
1270 case 'v': *p
++ = '\013'; break; /* VT */
1271 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1273 /* \OOO (octal) escapes */
1274 case '0': case '1': case '2': case '3':
1275 case '4': case '5': case '6': case '7':
1277 if ('0' <= *s
&& *s
<= '7') {
1278 x
= (x
<<3) + *s
++ - '0';
1279 if ('0' <= *s
&& *s
<= '7')
1280 x
= (x
<<3) + *s
++ - '0';
1289 message
= "truncated \\xXX escape";
1295 message
= "truncated \\uXXXX escape";
1301 message
= "truncated \\UXXXXXXXX escape";
1304 for (i
= 0; i
< digits
; i
++) {
1305 c
= (unsigned char) s
[i
];
1307 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1313 chr
= (chr
<<4) & ~0xF;
1314 if (c
>= '0' && c
<= '9')
1316 else if (c
>= 'a' && c
<= 'f')
1317 chr
+= 10 + c
- 'a';
1319 chr
+= 10 + c
- 'A';
1323 /* when we get here, chr is a 32-bit unicode character */
1325 /* UCS-2 character */
1326 *p
++ = (Py_UNICODE
) chr
;
1327 else if (chr
<= 0x10ffff) {
1328 /* UCS-4 character. Either store directly, or as surrogate pair. */
1329 #ifdef Py_UNICODE_WIDE
1333 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1334 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1337 if (unicodeescape_decoding_error(
1339 "illegal Unicode character")
1342 *p
++ = x
; /* store replacement character */
1348 message
= "malformed \\N character escape";
1349 if (ucnhash_CAPI
== NULL
) {
1350 /* load the unicode data module */
1352 m
= PyImport_ImportModule("unicodedata");
1355 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1359 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1361 if (ucnhash_CAPI
== NULL
)
1365 const char *start
= s
+1;
1366 /* look for the closing brace */
1367 while (*s
!= '}' && s
< end
)
1369 if (s
> start
&& s
< end
&& *s
== '}') {
1370 /* found a name. look it up in the unicode database */
1371 message
= "unknown Unicode character name";
1373 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1377 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1384 *p
++ = (unsigned char)s
[-1];
1388 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1390 return (PyObject
*)v
;
1395 "\\N escapes not supported (can't load unicodedata module)"
1404 /* Return a Unicode-Escape string version of the Unicode object.
1406 If quotes is true, the string is enclosed in u"" or u'' quotes as
1411 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1416 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1424 static const char *hexdigit
= "0123456789abcdef";
1426 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1430 p
= q
= PyString_AS_STRING(repr
);
1434 *p
++ = (findchar(s
, size
, '\'') &&
1435 !findchar(s
, size
, '"')) ? '"' : '\'';
1437 while (size
-- > 0) {
1438 Py_UNICODE ch
= *s
++;
1440 if (quotes
&& (ch
== (Py_UNICODE
) q
[1] || ch
== '\\')) {
1444 /* Map 21-bit characters to '\U00xxxxxx' */
1445 else if (ch
>= 0x10000) {
1448 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
1449 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
1450 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
1451 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
1452 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1453 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1454 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1455 *p
++ = hexdigit
[ch
& 15];
1457 /* Map 16-bit characters to '\uxxxx' */
1458 else if (ch
>= 256) {
1461 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1462 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1463 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1464 *p
++ = hexdigit
[ch
& 15];
1466 /* Map special whitespace to '\t', '\n', '\r' */
1467 else if (ch
== '\t') {
1471 else if (ch
== '\n') {
1475 else if (ch
== '\r') {
1479 /* Map non-printable US ASCII to '\xhh' */
1480 else if (ch
< ' ' || ch
>= 128) {
1483 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1484 *p
++ = hexdigit
[ch
& 15];
1486 /* Copy everything else as-is */
1494 if (_PyString_Resize(&repr
, p
- q
))
1504 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1507 return unicodeescape_string(s
, size
, 0);
1510 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1512 if (!PyUnicode_Check(unicode
)) {
1513 PyErr_BadArgument();
1516 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1517 PyUnicode_GET_SIZE(unicode
));
1520 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1522 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1527 Py_UNICODE
*p
, *buf
;
1531 /* Escaped strings are always at least as long as the resulting
1532 Unicode string, so we start with size here and then reduce the
1533 length after conversion to the true value. */
1534 v
= _PyUnicode_New(size
);
1538 return (PyObject
*)v
;
1539 p
= buf
= PyUnicode_AS_UNICODE(v
);
1546 /* Non-escape characters are interpreted as Unicode ordinals */
1548 *p
++ = (unsigned char)*s
++;
1552 /* \u-escapes are only interpreted iff the number of leading
1553 backslashes is odd */
1558 *p
++ = (unsigned char)*s
++;
1560 if (((s
- bs
) & 1) == 0 ||
1568 /* \uXXXX with 4 hex digits */
1569 for (x
= 0, i
= 0; i
< 4; i
++) {
1570 c
= (unsigned char)s
[i
];
1572 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1573 "truncated \\uXXXX"))
1579 if (c
>= '0' && c
<= '9')
1581 else if (c
>= 'a' && c
<= 'f')
1589 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1591 return (PyObject
*)v
;
1598 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1605 static const char *hexdigit
= "0123456789abcdef";
1607 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1613 p
= q
= PyString_AS_STRING(repr
);
1614 while (size
-- > 0) {
1615 Py_UNICODE ch
= *s
++;
1616 /* Map 16-bit characters to '\uxxxx' */
1620 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1621 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1622 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1623 *p
++ = hexdigit
[ch
& 15];
1625 /* Copy everything else as-is */
1630 if (_PyString_Resize(&repr
, p
- q
))
1640 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
1642 if (!PyUnicode_Check(unicode
)) {
1643 PyErr_BadArgument();
1646 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1647 PyUnicode_GET_SIZE(unicode
));
1650 /* --- Latin-1 Codec ------------------------------------------------------ */
1652 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
1659 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1660 if (size
== 1 && *(unsigned char*)s
< 256) {
1661 Py_UNICODE r
= *(unsigned char*)s
;
1662 return PyUnicode_FromUnicode(&r
, 1);
1665 v
= _PyUnicode_New(size
);
1669 return (PyObject
*)v
;
1670 p
= PyUnicode_AS_UNICODE(v
);
1672 *p
++ = (unsigned char)*s
++;
1673 return (PyObject
*)v
;
1681 int latin1_encoding_error(const Py_UNICODE
**source
,
1684 const char *details
)
1686 if ((errors
== NULL
) ||
1687 (strcmp(errors
,"strict") == 0)) {
1688 PyErr_Format(PyExc_UnicodeError
,
1689 "Latin-1 encoding error: %.400s",
1693 else if (strcmp(errors
,"ignore") == 0) {
1696 else if (strcmp(errors
,"replace") == 0) {
1702 PyErr_Format(PyExc_ValueError
,
1703 "Latin-1 encoding error; "
1704 "unknown error handling code: %.400s",
1710 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
1717 repr
= PyString_FromStringAndSize(NULL
, size
);
1723 s
= PyString_AS_STRING(repr
);
1725 while (size
-- > 0) {
1726 Py_UNICODE ch
= *p
++;
1728 if (latin1_encoding_error(&p
, &s
, errors
,
1729 "ordinal not in range(256)"))
1735 /* Resize if error handling skipped some characters */
1736 if (s
- start
< PyString_GET_SIZE(repr
))
1737 if (_PyString_Resize(&repr
, s
- start
))
1746 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
1748 if (!PyUnicode_Check(unicode
)) {
1749 PyErr_BadArgument();
1752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
1753 PyUnicode_GET_SIZE(unicode
),
1757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1760 int ascii_decoding_error(const char **source
,
1763 const char *details
)
1765 if ((errors
== NULL
) ||
1766 (strcmp(errors
,"strict") == 0)) {
1767 PyErr_Format(PyExc_UnicodeError
,
1768 "ASCII decoding error: %.400s",
1772 else if (strcmp(errors
,"ignore") == 0) {
1775 else if (strcmp(errors
,"replace") == 0) {
1776 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1781 PyErr_Format(PyExc_ValueError
,
1782 "ASCII decoding error; "
1783 "unknown error handling code: %.400s",
1789 PyObject
*PyUnicode_DecodeASCII(const char *s
,
1796 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1797 if (size
== 1 && *(unsigned char*)s
< 128) {
1798 Py_UNICODE r
= *(unsigned char*)s
;
1799 return PyUnicode_FromUnicode(&r
, 1);
1802 v
= _PyUnicode_New(size
);
1806 return (PyObject
*)v
;
1807 p
= PyUnicode_AS_UNICODE(v
);
1808 while (size
-- > 0) {
1809 register unsigned char c
;
1811 c
= (unsigned char)*s
++;
1814 else if (ascii_decoding_error(&s
, &p
, errors
,
1815 "ordinal not in range(128)"))
1818 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
1819 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1821 return (PyObject
*)v
;
1829 int ascii_encoding_error(const Py_UNICODE
**source
,
1832 const char *details
)
1834 if ((errors
== NULL
) ||
1835 (strcmp(errors
,"strict") == 0)) {
1836 PyErr_Format(PyExc_UnicodeError
,
1837 "ASCII encoding error: %.400s",
1841 else if (strcmp(errors
,"ignore") == 0) {
1844 else if (strcmp(errors
,"replace") == 0) {
1850 PyErr_Format(PyExc_ValueError
,
1851 "ASCII encoding error; "
1852 "unknown error handling code: %.400s",
1858 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
1865 repr
= PyString_FromStringAndSize(NULL
, size
);
1871 s
= PyString_AS_STRING(repr
);
1873 while (size
-- > 0) {
1874 Py_UNICODE ch
= *p
++;
1876 if (ascii_encoding_error(&p
, &s
, errors
,
1877 "ordinal not in range(128)"))
1883 /* Resize if error handling skipped some characters */
1884 if (s
- start
< PyString_GET_SIZE(repr
))
1885 if (_PyString_Resize(&repr
, s
- start
))
1894 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
1896 if (!PyUnicode_Check(unicode
)) {
1897 PyErr_BadArgument();
1900 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
1901 PyUnicode_GET_SIZE(unicode
),
1905 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
1907 /* --- MBCS codecs for Windows -------------------------------------------- */
1909 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
1916 /* First get the size of the result */
1917 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
1918 if (size
> 0 && usize
==0)
1919 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1921 v
= _PyUnicode_New(usize
);
1925 return (PyObject
*)v
;
1926 p
= PyUnicode_AS_UNICODE(v
);
1927 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
1929 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1932 return (PyObject
*)v
;
1935 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
1943 /* If there are no characters, bail now! */
1945 return PyString_FromString("");
1947 /* First get the size of the result */
1948 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
1950 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1952 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
1958 /* Do the conversion */
1959 s
= PyString_AS_STRING(repr
);
1960 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
1962 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1967 #endif /* MS_WIN32 */
1969 /* --- Character Mapping Codec -------------------------------------------- */
1972 int charmap_decoding_error(const char **source
,
1975 const char *details
)
1977 if ((errors
== NULL
) ||
1978 (strcmp(errors
,"strict") == 0)) {
1979 PyErr_Format(PyExc_UnicodeError
,
1980 "charmap decoding error: %.400s",
1984 else if (strcmp(errors
,"ignore") == 0) {
1987 else if (strcmp(errors
,"replace") == 0) {
1988 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1993 PyErr_Format(PyExc_ValueError
,
1994 "charmap decoding error; "
1995 "unknown error handling code: %.400s",
2001 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2010 /* Default to Latin-1 */
2011 if (mapping
== NULL
)
2012 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2014 v
= _PyUnicode_New(size
);
2018 return (PyObject
*)v
;
2019 p
= PyUnicode_AS_UNICODE(v
);
2020 while (size
-- > 0) {
2021 unsigned char ch
= *s
++;
2024 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2025 w
= PyInt_FromLong((long)ch
);
2028 x
= PyObject_GetItem(mapping
, w
);
2031 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2032 /* No mapping found means: mapping is undefined. */
2041 if (PyInt_Check(x
)) {
2042 long value
= PyInt_AS_LONG(x
);
2043 if (value
< 0 || value
> 65535) {
2044 PyErr_SetString(PyExc_TypeError
,
2045 "character mapping must be in range(65536)");
2049 *p
++ = (Py_UNICODE
)value
;
2051 else if (x
== Py_None
) {
2052 /* undefined mapping */
2053 if (charmap_decoding_error(&s
, &p
, errors
,
2054 "character maps to <undefined>")) {
2059 else if (PyUnicode_Check(x
)) {
2060 int targetsize
= PyUnicode_GET_SIZE(x
);
2062 if (targetsize
== 1)
2064 *p
++ = *PyUnicode_AS_UNICODE(x
);
2066 else if (targetsize
> 1) {
2068 if (targetsize
> extrachars
) {
2070 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2071 int needed
= (targetsize
- extrachars
) + \
2073 extrachars
+= needed
;
2074 if (_PyUnicode_Resize(&v
,
2075 PyUnicode_GET_SIZE(v
) + needed
)) {
2079 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2082 PyUnicode_AS_UNICODE(x
),
2085 extrachars
-= targetsize
;
2087 /* 1-0 mapping: skip the character */
2090 /* wrong return value */
2091 PyErr_SetString(PyExc_TypeError
,
2092 "character mapping must return integer, None or unicode");
2098 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2099 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2101 return (PyObject
*)v
;
2109 int charmap_encoding_error(const Py_UNICODE
**source
,
2112 const char *details
)
2114 if ((errors
== NULL
) ||
2115 (strcmp(errors
,"strict") == 0)) {
2116 PyErr_Format(PyExc_UnicodeError
,
2117 "charmap encoding error: %.400s",
2121 else if (strcmp(errors
,"ignore") == 0) {
2124 else if (strcmp(errors
,"replace") == 0) {
2130 PyErr_Format(PyExc_ValueError
,
2131 "charmap encoding error; "
2132 "unknown error handling code: %.400s",
2138 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2147 /* Default to Latin-1 */
2148 if (mapping
== NULL
)
2149 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2151 v
= PyString_FromStringAndSize(NULL
, size
);
2156 s
= PyString_AS_STRING(v
);
2157 while (size
-- > 0) {
2158 Py_UNICODE ch
= *p
++;
2161 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2162 w
= PyInt_FromLong((long)ch
);
2165 x
= PyObject_GetItem(mapping
, w
);
2168 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2169 /* No mapping found means: mapping is undefined. */
2178 if (PyInt_Check(x
)) {
2179 long value
= PyInt_AS_LONG(x
);
2180 if (value
< 0 || value
> 255) {
2181 PyErr_SetString(PyExc_TypeError
,
2182 "character mapping must be in range(256)");
2188 else if (x
== Py_None
) {
2189 /* undefined mapping */
2190 if (charmap_encoding_error(&p
, &s
, errors
,
2191 "character maps to <undefined>")) {
2196 else if (PyString_Check(x
)) {
2197 int targetsize
= PyString_GET_SIZE(x
);
2199 if (targetsize
== 1)
2201 *s
++ = *PyString_AS_STRING(x
);
2203 else if (targetsize
> 1) {
2205 if (targetsize
> extrachars
) {
2207 int oldpos
= (int)(s
- PyString_AS_STRING(v
));
2208 int needed
= (targetsize
- extrachars
) + \
2210 extrachars
+= needed
;
2211 if (_PyString_Resize(&v
, PyString_GET_SIZE(v
) + needed
)) {
2215 s
= PyString_AS_STRING(v
) + oldpos
;
2217 memcpy(s
, PyString_AS_STRING(x
), targetsize
);
2219 extrachars
-= targetsize
;
2221 /* 1-0 mapping: skip the character */
2224 /* wrong return value */
2225 PyErr_SetString(PyExc_TypeError
,
2226 "character mapping must return integer, None or unicode");
2232 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2233 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2242 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2245 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2246 PyErr_BadArgument();
2249 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2250 PyUnicode_GET_SIZE(unicode
),
2256 int translate_error(const Py_UNICODE
**source
,
2259 const char *details
)
2261 if ((errors
== NULL
) ||
2262 (strcmp(errors
,"strict") == 0)) {
2263 PyErr_Format(PyExc_UnicodeError
,
2264 "translate error: %.400s",
2268 else if (strcmp(errors
,"ignore") == 0) {
2271 else if (strcmp(errors
,"replace") == 0) {
2277 PyErr_Format(PyExc_ValueError
,
2279 "unknown error handling code: %.400s",
2285 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2293 if (mapping
== NULL
) {
2294 PyErr_BadArgument();
2298 /* Output will never be longer than input */
2299 v
= _PyUnicode_New(size
);
2304 p
= PyUnicode_AS_UNICODE(v
);
2305 while (size
-- > 0) {
2306 Py_UNICODE ch
= *s
++;
2310 w
= PyInt_FromLong(ch
);
2313 x
= PyObject_GetItem(mapping
, w
);
2316 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2317 /* No mapping found: default to 1-1 mapping */
2327 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2328 else if (x
== Py_None
) {
2329 /* undefined mapping */
2330 if (translate_error(&s
, &p
, errors
,
2331 "character maps to <undefined>")) {
2336 else if (PyUnicode_Check(x
)) {
2337 if (PyUnicode_GET_SIZE(x
) != 1) {
2339 PyErr_SetString(PyExc_NotImplementedError
,
2340 "1-n mappings are currently not implemented");
2344 *p
++ = *PyUnicode_AS_UNICODE(x
);
2347 /* wrong return value */
2348 PyErr_SetString(PyExc_TypeError
,
2349 "translate mapping must return integer, None or unicode");
2355 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2356 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2360 return (PyObject
*)v
;
2367 PyObject
*PyUnicode_Translate(PyObject
*str
,
2373 str
= PyUnicode_FromObject(str
);
2376 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2377 PyUnicode_GET_SIZE(str
),
2388 /* --- Decimal Encoder ---------------------------------------------------- */
2390 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2395 Py_UNICODE
*p
, *end
;
2397 if (output
== NULL
) {
2398 PyErr_BadArgument();
2405 register Py_UNICODE ch
= *p
++;
2408 if (Py_UNICODE_ISSPACE(ch
)) {
2412 decimal
= Py_UNICODE_TODECIMAL(ch
);
2414 *output
++ = '0' + decimal
;
2417 if (0 < ch
&& ch
< 256) {
2418 *output
++ = (char)ch
;
2421 /* All other characters are considered invalid */
2422 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2423 PyErr_SetString(PyExc_ValueError
,
2424 "invalid decimal Unicode string");
2427 else if (strcmp(errors
, "ignore") == 0)
2429 else if (strcmp(errors
, "replace") == 0) {
2434 /* 0-terminate the output string */
2442 /* --- Helpers ------------------------------------------------------------ */
2445 int count(PyUnicodeObject
*self
,
2448 PyUnicodeObject
*substring
)
2453 start
+= self
->length
;
2456 if (end
> self
->length
)
2459 end
+= self
->length
;
2463 if (substring
->length
== 0)
2464 return (end
- start
+ 1);
2466 end
-= substring
->length
;
2468 while (start
<= end
)
2469 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2471 start
+= substring
->length
;
2478 int PyUnicode_Count(PyObject
*str
,
2485 str
= PyUnicode_FromObject(str
);
2488 substr
= PyUnicode_FromObject(substr
);
2489 if (substr
== NULL
) {
2494 result
= count((PyUnicodeObject
*)str
,
2496 (PyUnicodeObject
*)substr
);
2504 int findstring(PyUnicodeObject
*self
,
2505 PyUnicodeObject
*substring
,
2511 start
+= self
->length
;
2515 if (substring
->length
== 0)
2518 if (end
> self
->length
)
2521 end
+= self
->length
;
2525 end
-= substring
->length
;
2527 if (direction
< 0) {
2528 for (; end
>= start
; end
--)
2529 if (Py_UNICODE_MATCH(self
, end
, substring
))
2532 for (; start
<= end
; start
++)
2533 if (Py_UNICODE_MATCH(self
, start
, substring
))
2540 int PyUnicode_Find(PyObject
*str
,
2548 str
= PyUnicode_FromObject(str
);
2551 substr
= PyUnicode_FromObject(substr
);
2552 if (substr
== NULL
) {
2557 result
= findstring((PyUnicodeObject
*)str
,
2558 (PyUnicodeObject
*)substr
,
2559 start
, end
, direction
);
2566 int tailmatch(PyUnicodeObject
*self
,
2567 PyUnicodeObject
*substring
,
2573 start
+= self
->length
;
2577 if (substring
->length
== 0)
2580 if (end
> self
->length
)
2583 end
+= self
->length
;
2587 end
-= substring
->length
;
2591 if (direction
> 0) {
2592 if (Py_UNICODE_MATCH(self
, end
, substring
))
2595 if (Py_UNICODE_MATCH(self
, start
, substring
))
2602 int PyUnicode_Tailmatch(PyObject
*str
,
2610 str
= PyUnicode_FromObject(str
);
2613 substr
= PyUnicode_FromObject(substr
);
2614 if (substr
== NULL
) {
2619 result
= tailmatch((PyUnicodeObject
*)str
,
2620 (PyUnicodeObject
*)substr
,
2621 start
, end
, direction
);
2628 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2632 /* like wcschr, but doesn't stop at NULL characters */
2634 while (size
-- > 0) {
2643 /* Apply fixfct filter to the Unicode object self and return a
2644 reference to the modified object */
2647 PyObject
*fixup(PyUnicodeObject
*self
,
2648 int (*fixfct
)(PyUnicodeObject
*s
))
2653 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
2657 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
2660 /* fixfct should return TRUE if it modified the buffer. If
2661 FALSE, return a reference to the original buffer instead
2662 (to save space, not time) */
2665 return (PyObject
*) self
;
2667 return (PyObject
*) u
;
2671 int fixupper(PyUnicodeObject
*self
)
2673 int len
= self
->length
;
2674 Py_UNICODE
*s
= self
->str
;
2678 register Py_UNICODE ch
;
2680 ch
= Py_UNICODE_TOUPPER(*s
);
2692 int fixlower(PyUnicodeObject
*self
)
2694 int len
= self
->length
;
2695 Py_UNICODE
*s
= self
->str
;
2699 register Py_UNICODE ch
;
2701 ch
= Py_UNICODE_TOLOWER(*s
);
2713 int fixswapcase(PyUnicodeObject
*self
)
2715 int len
= self
->length
;
2716 Py_UNICODE
*s
= self
->str
;
2720 if (Py_UNICODE_ISUPPER(*s
)) {
2721 *s
= Py_UNICODE_TOLOWER(*s
);
2723 } else if (Py_UNICODE_ISLOWER(*s
)) {
2724 *s
= Py_UNICODE_TOUPPER(*s
);
2734 int fixcapitalize(PyUnicodeObject
*self
)
2736 int len
= self
->length
;
2737 Py_UNICODE
*s
= self
->str
;
2742 if (Py_UNICODE_ISLOWER(*s
)) {
2743 *s
= Py_UNICODE_TOUPPER(*s
);
2748 if (Py_UNICODE_ISUPPER(*s
)) {
2749 *s
= Py_UNICODE_TOLOWER(*s
);
2758 int fixtitle(PyUnicodeObject
*self
)
2760 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
2761 register Py_UNICODE
*e
;
2762 int previous_is_cased
;
2764 /* Shortcut for single character strings */
2765 if (PyUnicode_GET_SIZE(self
) == 1) {
2766 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
2775 e
= p
+ PyUnicode_GET_SIZE(self
);
2776 previous_is_cased
= 0;
2777 for (; p
< e
; p
++) {
2778 register const Py_UNICODE ch
= *p
;
2780 if (previous_is_cased
)
2781 *p
= Py_UNICODE_TOLOWER(ch
);
2783 *p
= Py_UNICODE_TOTITLE(ch
);
2785 if (Py_UNICODE_ISLOWER(ch
) ||
2786 Py_UNICODE_ISUPPER(ch
) ||
2787 Py_UNICODE_ISTITLE(ch
))
2788 previous_is_cased
= 1;
2790 previous_is_cased
= 0;
2795 PyObject
*PyUnicode_Join(PyObject
*separator
,
2800 PyUnicodeObject
*res
= NULL
;
2807 it
= PyObject_GetIter(seq
);
2811 if (separator
== NULL
) {
2812 Py_UNICODE blank
= ' ';
2817 separator
= PyUnicode_FromObject(separator
);
2818 if (separator
== NULL
)
2820 sep
= PyUnicode_AS_UNICODE(separator
);
2821 seplen
= PyUnicode_GET_SIZE(separator
);
2824 res
= _PyUnicode_New(sz
);
2827 p
= PyUnicode_AS_UNICODE(res
);
2830 for (i
= 0; ; ++i
) {
2832 PyObject
*item
= PyIter_Next(it
);
2834 if (PyErr_Occurred())
2838 if (!PyUnicode_Check(item
)) {
2840 v
= PyUnicode_FromObject(item
);
2846 itemlen
= PyUnicode_GET_SIZE(item
);
2847 while (reslen
+ itemlen
+ seplen
>= sz
) {
2848 if (_PyUnicode_Resize(&res
, sz
*2))
2851 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
2854 Py_UNICODE_COPY(p
, sep
, seplen
);
2858 Py_UNICODE_COPY(p
, PyUnicode_AS_UNICODE(item
), itemlen
);
2863 if (_PyUnicode_Resize(&res
, reslen
))
2866 Py_XDECREF(separator
);
2868 return (PyObject
*)res
;
2871 Py_XDECREF(separator
);
2878 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
2890 if (left
== 0 && right
== 0) {
2895 u
= _PyUnicode_New(left
+ self
->length
+ right
);
2898 Py_UNICODE_FILL(u
->str
, fill
, left
);
2899 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
2901 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
2907 #define SPLIT_APPEND(data, left, right) \
2908 str = PyUnicode_FromUnicode(data + left, right - left); \
2911 if (PyList_Append(list, str)) { \
2919 PyObject
*split_whitespace(PyUnicodeObject
*self
,
2925 int len
= self
->length
;
2928 for (i
= j
= 0; i
< len
; ) {
2930 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2933 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
2936 if (maxcount
-- <= 0)
2938 SPLIT_APPEND(self
->str
, j
, i
);
2939 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2945 SPLIT_APPEND(self
->str
, j
, len
);
2954 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
2964 string
= PyUnicode_FromObject(string
);
2967 data
= PyUnicode_AS_UNICODE(string
);
2968 len
= PyUnicode_GET_SIZE(string
);
2970 list
= PyList_New(0);
2974 for (i
= j
= 0; i
< len
; ) {
2977 /* Find a line and append it */
2978 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
2981 /* Skip the line break reading CRLF as one line break */
2984 if (data
[i
] == '\r' && i
+ 1 < len
&&
2992 SPLIT_APPEND(data
, j
, eol
);
2996 SPLIT_APPEND(data
, j
, len
);
3009 PyObject
*split_char(PyUnicodeObject
*self
,
3016 int len
= self
->length
;
3019 for (i
= j
= 0; i
< len
; ) {
3020 if (self
->str
[i
] == ch
) {
3021 if (maxcount
-- <= 0)
3023 SPLIT_APPEND(self
->str
, j
, i
);
3029 SPLIT_APPEND(self
->str
, j
, len
);
3039 PyObject
*split_substring(PyUnicodeObject
*self
,
3041 PyUnicodeObject
*substring
,
3046 int len
= self
->length
;
3047 int sublen
= substring
->length
;
3050 for (i
= j
= 0; i
<= len
- sublen
; ) {
3051 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
3052 if (maxcount
-- <= 0)
3054 SPLIT_APPEND(self
->str
, j
, i
);
3060 SPLIT_APPEND(self
->str
, j
, len
);
3072 PyObject
*split(PyUnicodeObject
*self
,
3073 PyUnicodeObject
*substring
,
3081 list
= PyList_New(0);
3085 if (substring
== NULL
)
3086 return split_whitespace(self
,list
,maxcount
);
3088 else if (substring
->length
== 1)
3089 return split_char(self
,list
,substring
->str
[0],maxcount
);
3091 else if (substring
->length
== 0) {
3093 PyErr_SetString(PyExc_ValueError
, "empty separator");
3097 return split_substring(self
,list
,substring
,maxcount
);
3101 PyObject
*strip(PyUnicodeObject
*self
,
3105 Py_UNICODE
*p
= self
->str
;
3107 int end
= self
->length
;
3110 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
3114 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
3117 if (start
== 0 && end
== self
->length
) {
3118 /* couldn't strip anything off, return original string */
3120 return (PyObject
*) self
;
3123 return (PyObject
*) PyUnicode_FromUnicode(
3130 PyObject
*replace(PyUnicodeObject
*self
,
3131 PyUnicodeObject
*str1
,
3132 PyUnicodeObject
*str2
,
3140 if (str1
->length
== 1 && str2
->length
== 1) {
3143 /* replace characters */
3144 if (!findchar(self
->str
, self
->length
, str1
->str
[0])) {
3145 /* nothing to replace, return original string */
3149 Py_UNICODE u1
= str1
->str
[0];
3150 Py_UNICODE u2
= str2
->str
[0];
3152 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3157 Py_UNICODE_COPY(u
->str
, self
->str
,
3159 for (i
= 0; i
< u
->length
; i
++)
3160 if (u
->str
[i
] == u1
) {
3172 /* replace strings */
3173 n
= count(self
, 0, self
->length
, str1
);
3177 /* nothing to replace, return original string */
3182 self
->length
+ n
* (str2
->length
- str1
->length
));
3186 while (i
<= self
->length
- str1
->length
)
3187 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3188 /* replace string segment */
3189 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3193 /* copy remaining part */
3194 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3198 *p
++ = self
->str
[i
++];
3203 return (PyObject
*) u
;
3206 /* --- Unicode Object Methods --------------------------------------------- */
3208 static char title__doc__
[] =
3209 "S.title() -> unicode\n\
3211 Return a titlecased version of S, i.e. words start with title case\n\
3212 characters, all remaining cased characters have lower case.";
3215 unicode_title(PyUnicodeObject
*self
, PyObject
*args
)
3217 if (!PyArg_NoArgs(args
))
3219 return fixup(self
, fixtitle
);
3222 static char capitalize__doc__
[] =
3223 "S.capitalize() -> unicode\n\
3225 Return a capitalized version of S, i.e. make the first character\n\
3229 unicode_capitalize(PyUnicodeObject
*self
, PyObject
*args
)
3231 if (!PyArg_NoArgs(args
))
3233 return fixup(self
, fixcapitalize
);
3237 static char capwords__doc__
[] =
3238 "S.capwords() -> unicode\n\
3240 Apply .capitalize() to all words in S and return the result with\n\
3241 normalized whitespace (all whitespace strings are replaced by ' ').";
3244 unicode_capwords(PyUnicodeObject
*self
, PyObject
*args
)
3250 if (!PyArg_NoArgs(args
))
3253 /* Split into words */
3254 list
= split(self
, NULL
, -1);
3258 /* Capitalize each word */
3259 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3260 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3264 Py_DECREF(PyList_GET_ITEM(list
, i
));
3265 PyList_SET_ITEM(list
, i
, item
);
3268 /* Join the words to form a new string */
3269 item
= PyUnicode_Join(NULL
, list
);
3273 return (PyObject
*)item
;
3277 static char center__doc__
[] =
3278 "S.center(width) -> unicode\n\
3280 Return S centered in a Unicode string of length width. Padding is done\n\
3284 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3289 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3292 if (self
->length
>= width
) {
3294 return (PyObject
*) self
;
3297 marg
= width
- self
->length
;
3298 left
= marg
/ 2 + (marg
& width
& 1);
3300 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3305 /* This code should go into some future Unicode collation support
3306 module. The basic comparison should compare ordinals on a naive
3307 basis (this is what Java does and thus JPython too). */
3309 /* speedy UTF-16 code point order comparison */
3311 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3313 static short utf16Fixup
[32] =
3315 0, 0, 0, 0, 0, 0, 0, 0,
3316 0, 0, 0, 0, 0, 0, 0, 0,
3317 0, 0, 0, 0, 0, 0, 0, 0,
3318 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3322 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3326 Py_UNICODE
*s1
= str1
->str
;
3327 Py_UNICODE
*s2
= str2
->str
;
3329 len1
= str1
->length
;
3330 len2
= str2
->length
;
3332 while (len1
> 0 && len2
> 0) {
3338 if (c1
> (1<<11) * 26)
3339 c1
+= utf16Fixup
[c1
>>11];
3340 if (c2
> (1<<11) * 26)
3341 c2
+= utf16Fixup
[c2
>>11];
3342 /* now c1 and c2 are in UTF-32-compatible order */
3345 return (c1
< c2
) ? -1 : 1;
3350 return (len1
< len2
) ? -1 : (len1
!= len2
);
3356 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3358 register int len1
, len2
;
3360 Py_UNICODE
*s1
= str1
->str
;
3361 Py_UNICODE
*s2
= str2
->str
;
3363 len1
= str1
->length
;
3364 len2
= str2
->length
;
3366 while (len1
> 0 && len2
> 0) {
3373 return (c1
< c2
) ? -1 : 1;
3378 return (len1
< len2
) ? -1 : (len1
!= len2
);
3383 int PyUnicode_Compare(PyObject
*left
,
3386 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3389 /* Coerce the two arguments */
3390 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3393 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3397 /* Shortcut for empty or interned objects */
3404 result
= unicode_compare(u
, v
);
3416 int PyUnicode_Contains(PyObject
*container
,
3419 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3421 register const Py_UNICODE
*p
, *e
;
3422 register Py_UNICODE ch
;
3424 /* Coerce the two arguments */
3425 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3427 PyErr_SetString(PyExc_TypeError
,
3428 "'in <string>' requires character as left operand");
3431 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3438 if (PyUnicode_GET_SIZE(v
) != 1) {
3439 PyErr_SetString(PyExc_TypeError
,
3440 "'in <string>' requires character as left operand");
3443 ch
= *PyUnicode_AS_UNICODE(v
);
3444 p
= PyUnicode_AS_UNICODE(u
);
3445 e
= p
+ PyUnicode_GET_SIZE(u
);
3464 /* Concat to string or Unicode object giving a new Unicode object. */
3466 PyObject
*PyUnicode_Concat(PyObject
*left
,
3469 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3471 /* Coerce the two arguments */
3472 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3475 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3480 if (v
== unicode_empty
) {
3482 return (PyObject
*)u
;
3484 if (u
== unicode_empty
) {
3486 return (PyObject
*)v
;
3489 /* Concat the two Unicode strings */
3490 w
= _PyUnicode_New(u
->length
+ v
->length
);
3493 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3494 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3498 return (PyObject
*)w
;
3506 static char count__doc__
[] =
3507 "S.count(sub[, start[, end]]) -> int\n\
3509 Return the number of occurrences of substring sub in Unicode string\n\
3510 S[start:end]. Optional arguments start and end are\n\
3511 interpreted as in slice notation.";
3514 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3516 PyUnicodeObject
*substring
;
3521 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3522 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3525 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3526 (PyObject
*)substring
);
3527 if (substring
== NULL
)
3531 start
+= self
->length
;
3534 if (end
> self
->length
)
3537 end
+= self
->length
;
3541 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3543 Py_DECREF(substring
);
3547 static char encode__doc__
[] =
3548 "S.encode([encoding[,errors]]) -> string\n\
3550 Return an encoded string version of S. Default encoding is the current\n\
3551 default string encoding. errors may be given to set a different error\n\
3552 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3553 a ValueError. Other possible values are 'ignore' and 'replace'.";
3556 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3558 char *encoding
= NULL
;
3559 char *errors
= NULL
;
3560 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3562 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3565 static char expandtabs__doc__
[] =
3566 "S.expandtabs([tabsize]) -> unicode\n\
3568 Return a copy of S where all tab characters are expanded using spaces.\n\
3569 If tabsize is not given, a tab size of 8 characters is assumed.";
3572 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3581 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3584 /* First pass: determine size of output string */
3586 e
= self
->str
+ self
->length
;
3587 for (p
= self
->str
; p
< e
; p
++)
3590 j
+= tabsize
- (j
% tabsize
);
3594 if (*p
== '\n' || *p
== '\r') {
3600 /* Second pass: create output string and fill it */
3601 u
= _PyUnicode_New(i
+ j
);
3608 for (p
= self
->str
; p
< e
; p
++)
3611 i
= tabsize
- (j
% tabsize
);
3620 if (*p
== '\n' || *p
== '\r')
3624 return (PyObject
*) u
;
3627 static char find__doc__
[] =
3628 "S.find(sub [,start [,end]]) -> int\n\
3630 Return the lowest index in S where substring sub is found,\n\
3631 such that sub is contained within s[start,end]. Optional\n\
3632 arguments start and end are interpreted as in slice notation.\n\
3634 Return -1 on failure.";
3637 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
3639 PyUnicodeObject
*substring
;
3644 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
3645 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3647 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3648 (PyObject
*)substring
);
3649 if (substring
== NULL
)
3652 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
3654 Py_DECREF(substring
);
3659 unicode_getitem(PyUnicodeObject
*self
, int index
)
3661 if (index
< 0 || index
>= self
->length
) {
3662 PyErr_SetString(PyExc_IndexError
, "string index out of range");
3666 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
3670 unicode_hash(PyUnicodeObject
*self
)
3672 /* Since Unicode objects compare equal to their ASCII string
3673 counterparts, they should use the individual character values
3674 as basis for their hash value. This is needed to assure that
3675 strings and Unicode objects behave in the same way as
3679 register Py_UNICODE
*p
;
3682 if (self
->hash
!= -1)
3684 len
= PyUnicode_GET_SIZE(self
);
3685 p
= PyUnicode_AS_UNICODE(self
);
3688 x
= (1000003*x
) ^ *p
++;
3689 x
^= PyUnicode_GET_SIZE(self
);
3696 static char index__doc__
[] =
3697 "S.index(sub [,start [,end]]) -> int\n\
3699 Like S.find() but raise ValueError when the substring is not found.";
3702 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
3705 PyUnicodeObject
*substring
;
3709 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
3710 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3713 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3714 (PyObject
*)substring
);
3715 if (substring
== NULL
)
3718 result
= findstring(self
, substring
, start
, end
, 1);
3720 Py_DECREF(substring
);
3722 PyErr_SetString(PyExc_ValueError
, "substring not found");
3725 return PyInt_FromLong(result
);
3728 static char islower__doc__
[] =
3729 "S.islower() -> int\n\
3731 Return 1 if all cased characters in S are lowercase and there is\n\
3732 at least one cased character in S, 0 otherwise.";
3735 unicode_islower(PyUnicodeObject
*self
, PyObject
*args
)
3737 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3738 register const Py_UNICODE
*e
;
3741 if (!PyArg_NoArgs(args
))
3744 /* Shortcut for single character strings */
3745 if (PyUnicode_GET_SIZE(self
) == 1)
3746 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
3748 /* Special case for empty strings */
3749 if (PyString_GET_SIZE(self
) == 0)
3750 return PyInt_FromLong(0);
3752 e
= p
+ PyUnicode_GET_SIZE(self
);
3754 for (; p
< e
; p
++) {
3755 register const Py_UNICODE ch
= *p
;
3757 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
3758 return PyInt_FromLong(0);
3759 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
3762 return PyInt_FromLong(cased
);
3765 static char isupper__doc__
[] =
3766 "S.isupper() -> int\n\
3768 Return 1 if all cased characters in S are uppercase and there is\n\
3769 at least one cased character in S, 0 otherwise.";
3772 unicode_isupper(PyUnicodeObject
*self
, PyObject
*args
)
3774 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3775 register const Py_UNICODE
*e
;
3778 if (!PyArg_NoArgs(args
))
3781 /* Shortcut for single character strings */
3782 if (PyUnicode_GET_SIZE(self
) == 1)
3783 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
3785 /* Special case for empty strings */
3786 if (PyString_GET_SIZE(self
) == 0)
3787 return PyInt_FromLong(0);
3789 e
= p
+ PyUnicode_GET_SIZE(self
);
3791 for (; p
< e
; p
++) {
3792 register const Py_UNICODE ch
= *p
;
3794 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
3795 return PyInt_FromLong(0);
3796 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
3799 return PyInt_FromLong(cased
);
3802 static char istitle__doc__
[] =
3803 "S.istitle() -> int\n\
3805 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3806 may only follow uncased characters and lowercase characters only cased\n\
3807 ones. Return 0 otherwise.";
3810 unicode_istitle(PyUnicodeObject
*self
, PyObject
*args
)
3812 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3813 register const Py_UNICODE
*e
;
3814 int cased
, previous_is_cased
;
3816 if (!PyArg_NoArgs(args
))
3819 /* Shortcut for single character strings */
3820 if (PyUnicode_GET_SIZE(self
) == 1)
3821 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
3822 (Py_UNICODE_ISUPPER(*p
) != 0));
3824 /* Special case for empty strings */
3825 if (PyString_GET_SIZE(self
) == 0)
3826 return PyInt_FromLong(0);
3828 e
= p
+ PyUnicode_GET_SIZE(self
);
3830 previous_is_cased
= 0;
3831 for (; p
< e
; p
++) {
3832 register const Py_UNICODE ch
= *p
;
3834 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
3835 if (previous_is_cased
)
3836 return PyInt_FromLong(0);
3837 previous_is_cased
= 1;
3840 else if (Py_UNICODE_ISLOWER(ch
)) {
3841 if (!previous_is_cased
)
3842 return PyInt_FromLong(0);
3843 previous_is_cased
= 1;
3847 previous_is_cased
= 0;
3849 return PyInt_FromLong(cased
);
3852 static char isspace__doc__
[] =
3853 "S.isspace() -> int\n\
3855 Return 1 if there are only whitespace characters in S,\n\
3859 unicode_isspace(PyUnicodeObject
*self
, PyObject
*args
)
3861 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3862 register const Py_UNICODE
*e
;
3864 if (!PyArg_NoArgs(args
))
3867 /* Shortcut for single character strings */
3868 if (PyUnicode_GET_SIZE(self
) == 1 &&
3869 Py_UNICODE_ISSPACE(*p
))
3870 return PyInt_FromLong(1);
3872 /* Special case for empty strings */
3873 if (PyString_GET_SIZE(self
) == 0)
3874 return PyInt_FromLong(0);
3876 e
= p
+ PyUnicode_GET_SIZE(self
);
3877 for (; p
< e
; p
++) {
3878 if (!Py_UNICODE_ISSPACE(*p
))
3879 return PyInt_FromLong(0);
3881 return PyInt_FromLong(1);
/* Help text registered for the "isalpha" method. */
static char isalpha__doc__[] =
    "S.isalpha() -> int\n"
    "\n"
    "Return 1 if all characters in S are alphabetic\n"
    "and there is at least one character in S, 0 otherwise.";
3891 unicode_isalpha(PyUnicodeObject
*self
, PyObject
*args
)
3893 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3894 register const Py_UNICODE
*e
;
3896 if (!PyArg_NoArgs(args
))
3899 /* Shortcut for single character strings */
3900 if (PyUnicode_GET_SIZE(self
) == 1 &&
3901 Py_UNICODE_ISALPHA(*p
))
3902 return PyInt_FromLong(1);
3904 /* Special case for empty strings */
3905 if (PyString_GET_SIZE(self
) == 0)
3906 return PyInt_FromLong(0);
3908 e
= p
+ PyUnicode_GET_SIZE(self
);
3909 for (; p
< e
; p
++) {
3910 if (!Py_UNICODE_ISALPHA(*p
))
3911 return PyInt_FromLong(0);
3913 return PyInt_FromLong(1);
/* Help text registered for the "isalnum" method. */
static char isalnum__doc__[] =
    "S.isalnum() -> int\n"
    "\n"
    "Return 1 if all characters in S are alphanumeric\n"
    "and there is at least one character in S, 0 otherwise.";
3923 unicode_isalnum(PyUnicodeObject
*self
, PyObject
*args
)
3925 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3926 register const Py_UNICODE
*e
;
3928 if (!PyArg_NoArgs(args
))
3931 /* Shortcut for single character strings */
3932 if (PyUnicode_GET_SIZE(self
) == 1 &&
3933 Py_UNICODE_ISALNUM(*p
))
3934 return PyInt_FromLong(1);
3936 /* Special case for empty strings */
3937 if (PyString_GET_SIZE(self
) == 0)
3938 return PyInt_FromLong(0);
3940 e
= p
+ PyUnicode_GET_SIZE(self
);
3941 for (; p
< e
; p
++) {
3942 if (!Py_UNICODE_ISALNUM(*p
))
3943 return PyInt_FromLong(0);
3945 return PyInt_FromLong(1);
3948 static char isdecimal__doc__
[] =
3949 "S.isdecimal() -> int\n\
3951 Return 1 if there are only decimal characters in S,\n\
3955 unicode_isdecimal(PyUnicodeObject
*self
, PyObject
*args
)
3957 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3958 register const Py_UNICODE
*e
;
3960 if (!PyArg_NoArgs(args
))
3963 /* Shortcut for single character strings */
3964 if (PyUnicode_GET_SIZE(self
) == 1 &&
3965 Py_UNICODE_ISDECIMAL(*p
))
3966 return PyInt_FromLong(1);
3968 /* Special case for empty strings */
3969 if (PyString_GET_SIZE(self
) == 0)
3970 return PyInt_FromLong(0);
3972 e
= p
+ PyUnicode_GET_SIZE(self
);
3973 for (; p
< e
; p
++) {
3974 if (!Py_UNICODE_ISDECIMAL(*p
))
3975 return PyInt_FromLong(0);
3977 return PyInt_FromLong(1);
3980 static char isdigit__doc__
[] =
3981 "S.isdigit() -> int\n\
3983 Return 1 if there are only digit characters in S,\n\
3987 unicode_isdigit(PyUnicodeObject
*self
, PyObject
*args
)
3989 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3990 register const Py_UNICODE
*e
;
3992 if (!PyArg_NoArgs(args
))
3995 /* Shortcut for single character strings */
3996 if (PyUnicode_GET_SIZE(self
) == 1 &&
3997 Py_UNICODE_ISDIGIT(*p
))
3998 return PyInt_FromLong(1);
4000 /* Special case for empty strings */
4001 if (PyString_GET_SIZE(self
) == 0)
4002 return PyInt_FromLong(0);
4004 e
= p
+ PyUnicode_GET_SIZE(self
);
4005 for (; p
< e
; p
++) {
4006 if (!Py_UNICODE_ISDIGIT(*p
))
4007 return PyInt_FromLong(0);
4009 return PyInt_FromLong(1);
4012 static char isnumeric__doc__
[] =
4013 "S.isnumeric() -> int\n\
4015 Return 1 if there are only numeric characters in S,\n\
4019 unicode_isnumeric(PyUnicodeObject
*self
, PyObject
*args
)
4021 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4022 register const Py_UNICODE
*e
;
4024 if (!PyArg_NoArgs(args
))
4027 /* Shortcut for single character strings */
4028 if (PyUnicode_GET_SIZE(self
) == 1 &&
4029 Py_UNICODE_ISNUMERIC(*p
))
4030 return PyInt_FromLong(1);
4032 /* Special case for empty strings */
4033 if (PyString_GET_SIZE(self
) == 0)
4034 return PyInt_FromLong(0);
4036 e
= p
+ PyUnicode_GET_SIZE(self
);
4037 for (; p
< e
; p
++) {
4038 if (!Py_UNICODE_ISNUMERIC(*p
))
4039 return PyInt_FromLong(0);
4041 return PyInt_FromLong(1);
/* Help text registered for the "join" method. */
static char join__doc__[] =
    "S.join(sequence) -> unicode\n"
    "\n"
    "Return a string which is the concatenation of the strings in the\n"
    "sequence. The separator between elements is S.";
4051 unicode_join(PyUnicodeObject
*self
, PyObject
*args
)
4054 if (!PyArg_ParseTuple(args
, "O:join", &data
))
4057 return PyUnicode_Join((PyObject
*)self
, data
);
4061 unicode_length(PyUnicodeObject
*self
)
4063 return self
->length
;
/* Help text registered for the "ljust" method. */
static char ljust__doc__[] =
    "S.ljust(width) -> unicode\n"
    "\n"
    "Return S left justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4073 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
4076 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
4079 if (self
->length
>= width
) {
4081 return (PyObject
*) self
;
4084 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
/* Help text registered for the "lower" method. */
static char lower__doc__[] =
    "S.lower() -> unicode\n"
    "\n"
    "Return a copy of the string S converted to lowercase.";
4093 unicode_lower(PyUnicodeObject
*self
, PyObject
*args
)
4095 if (!PyArg_NoArgs(args
))
4097 return fixup(self
, fixlower
);
/* Help text registered for the "lstrip" method. */
static char lstrip__doc__[] =
    "S.lstrip() -> unicode\n"
    "\n"
    "Return a copy of the string S with leading whitespace removed.";
4106 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
4108 if (!PyArg_NoArgs(args
))
4110 return strip(self
, 1, 0);
4114 unicode_repeat(PyUnicodeObject
*str
, int len
)
4125 /* no repeat, return original string */
4127 return (PyObject
*) str
;
4130 /* ensure # of chars needed doesn't overflow int and # of bytes
4131 * needed doesn't overflow size_t
4133 nchars
= len
* str
->length
;
4134 if (len
&& nchars
/ len
!= str
->length
) {
4135 PyErr_SetString(PyExc_OverflowError
,
4136 "repeated string is too long");
4139 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
4140 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
4141 PyErr_SetString(PyExc_OverflowError
,
4142 "repeated string is too long");
4145 u
= _PyUnicode_New(nchars
);
4152 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4156 return (PyObject
*) u
;
4159 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4169 self
= PyUnicode_FromObject(obj
);
4172 str1
= PyUnicode_FromObject(subobj
);
4177 str2
= PyUnicode_FromObject(replobj
);
4183 result
= replace((PyUnicodeObject
*)self
,
4184 (PyUnicodeObject
*)str1
,
4185 (PyUnicodeObject
*)str2
,
/* Help text registered for the "replace" method.  The optional third
   argument limits the number of replacements, so it is documented as a
   count rather than as "maxsplit". */
static char replace__doc__[] =
"S.replace(old, new[, count]) -> unicode\n\
\n\
Return a copy of S with all occurrences of substring\n\
old replaced by new. If the optional argument count is\n\
given, only the first count occurrences are replaced.";
4201 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4203 PyUnicodeObject
*str1
;
4204 PyUnicodeObject
*str2
;
4208 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4210 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4213 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4217 result
= replace(self
, str1
, str2
, maxcount
);
4225 PyObject
*unicode_repr(PyObject
*unicode
)
4227 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4228 PyUnicode_GET_SIZE(unicode
),
/* Help text registered for the "rfind" method. */
static char rfind__doc__[] =
    "S.rfind(sub [,start [,end]]) -> int\n"
    "\n"
    "Return the highest index in S where substring sub is found,\n"
    "such that sub is contained within s[start,end]. Optional\n"
    "arguments start and end are interpreted as in slice notation.\n"
    "\n"
    "Return -1 on failure.";
4242 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4244 PyUnicodeObject
*substring
;
4249 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4250 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4252 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4253 (PyObject
*)substring
);
4254 if (substring
== NULL
)
4257 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4259 Py_DECREF(substring
);
/* Help text registered for the "rindex" method. */
static char rindex__doc__[] =
    "S.rindex(sub [,start [,end]]) -> int\n"
    "\n"
    "Like S.rfind() but raise ValueError when the substring is not found.";
4269 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4272 PyUnicodeObject
*substring
;
4276 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4277 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4279 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4280 (PyObject
*)substring
);
4281 if (substring
== NULL
)
4284 result
= findstring(self
, substring
, start
, end
, -1);
4286 Py_DECREF(substring
);
4288 PyErr_SetString(PyExc_ValueError
, "substring not found");
4291 return PyInt_FromLong(result
);
/* Help text registered for the "rjust" method. */
static char rjust__doc__[] =
    "S.rjust(width) -> unicode\n"
    "\n"
    "Return S right justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4301 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4304 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4307 if (self
->length
>= width
) {
4309 return (PyObject
*) self
;
4312 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
/* Help text registered for the "rstrip" method. */
static char rstrip__doc__[] =
    "S.rstrip() -> unicode\n"
    "\n"
    "Return a copy of the string S with trailing whitespace removed.";
4321 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
4323 if (!PyArg_NoArgs(args
))
4325 return strip(self
, 0, 1);
4329 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4331 /* standard clamping */
4336 if (end
> self
->length
)
4338 if (start
== 0 && end
== self
->length
) {
4339 /* full slice, return original string */
4341 return (PyObject
*) self
;
4346 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4350 PyObject
*PyUnicode_Split(PyObject
*s
,
4356 s
= PyUnicode_FromObject(s
);
4360 sep
= PyUnicode_FromObject(sep
);
4367 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4374 static char split__doc__
[] =
4375 "S.split([sep [,maxsplit]]) -> list of strings\n\
4377 Return a list of the words in S, using sep as the\n\
4378 delimiter string. If maxsplit is given, at most maxsplit\n\
4379 splits are done. If sep is not specified, any whitespace string\n\
4383 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4385 PyObject
*substring
= Py_None
;
4388 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4391 if (substring
== Py_None
)
4392 return split(self
, NULL
, maxcount
);
4393 else if (PyUnicode_Check(substring
))
4394 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4396 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
/* Help text registered for the "splitlines" method.  keepends is a single
   optional argument, so the signature carries one pair of brackets. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
4407 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4411 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4414 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4418 PyObject
*unicode_str(PyUnicodeObject
*self
)
4420 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
/* Help text registered for the "strip" method. */
static char strip__doc__[] =
    "S.strip() -> unicode\n"
    "\n"
    "Return a copy of S with leading and trailing whitespace removed.";
4429 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
4431 if (!PyArg_NoArgs(args
))
4433 return strip(self
, 1, 1);
4436 static char swapcase__doc__
[] =
4437 "S.swapcase() -> unicode\n\
4439 Return a copy of S with uppercase characters converted to lowercase\n\
4443 unicode_swapcase(PyUnicodeObject
*self
, PyObject
*args
)
4445 if (!PyArg_NoArgs(args
))
4447 return fixup(self
, fixswapcase
);
/* Help text registered for the "translate" method. */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4459 unicode_translate(PyUnicodeObject
*self
, PyObject
*args
)
4463 if (!PyArg_ParseTuple(args
, "O:translate", &table
))
4465 return PyUnicode_TranslateCharmap(self
->str
,
/* Help text registered for the "upper" method. */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4477 unicode_upper(PyUnicodeObject
*self
, PyObject
*args
)
4479 if (!PyArg_NoArgs(args
))
4481 return fixup(self
, fixupper
);
/* Help text registered for the "zfill" method. */
static char zfill__doc__[] =
    "S.zfill(width) -> unicode\n"
    "\n"
    "Pad a numeric string x with zeros on the left, to fill a field\n"
    "of the specified width. The string x is never truncated.";
4492 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4498 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4501 if (self
->length
>= width
) {
4503 return (PyObject
*) self
;
4506 fill
= width
- self
->length
;
4508 u
= pad(self
, fill
, 0, '0');
4510 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4511 /* move sign to beginning of string */
4512 u
->str
[0] = u
->str
[fill
];
4516 return (PyObject
*) u
;
4522 unicode_freelistsize(PyUnicodeObject
*self
, PyObject
*args
)
4524 if (!PyArg_NoArgs(args
))
4526 return PyInt_FromLong(unicode_freelist_size
);
/* Help text registered for the "startswith" method. */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4538 unicode_startswith(PyUnicodeObject
*self
,
4541 PyUnicodeObject
*substring
;
4546 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4547 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4549 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4550 (PyObject
*)substring
);
4551 if (substring
== NULL
)
4554 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4556 Py_DECREF(substring
);
/* Help text registered for the "endswith" method. */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4569 unicode_endswith(PyUnicodeObject
*self
,
4572 PyUnicodeObject
*substring
;
4577 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4578 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4580 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4581 (PyObject
*)substring
);
4582 if (substring
== NULL
)
4585 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4587 Py_DECREF(substring
);
4592 static PyMethodDef unicode_methods
[] = {
4594 /* Order is according to common usage: often used methods should
4595 appear first, since lookup is done sequentially. */
4597 {"encode", (PyCFunction
) unicode_encode
, 1, encode__doc__
},
4598 {"replace", (PyCFunction
) unicode_replace
, 1, replace__doc__
},
4599 {"split", (PyCFunction
) unicode_split
, 1, split__doc__
},
4600 {"join", (PyCFunction
) unicode_join
, 1, join__doc__
},
4601 {"capitalize", (PyCFunction
) unicode_capitalize
, 0, capitalize__doc__
},
4602 {"title", (PyCFunction
) unicode_title
, 0, title__doc__
},
4603 {"center", (PyCFunction
) unicode_center
, 1, center__doc__
},
4604 {"count", (PyCFunction
) unicode_count
, 1, count__doc__
},
4605 {"expandtabs", (PyCFunction
) unicode_expandtabs
, 1, expandtabs__doc__
},
4606 {"find", (PyCFunction
) unicode_find
, 1, find__doc__
},
4607 {"index", (PyCFunction
) unicode_index
, 1, index__doc__
},
4608 {"ljust", (PyCFunction
) unicode_ljust
, 1, ljust__doc__
},
4609 {"lower", (PyCFunction
) unicode_lower
, 0, lower__doc__
},
4610 {"lstrip", (PyCFunction
) unicode_lstrip
, 0, lstrip__doc__
},
4611 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4612 {"rfind", (PyCFunction
) unicode_rfind
, 1, rfind__doc__
},
4613 {"rindex", (PyCFunction
) unicode_rindex
, 1, rindex__doc__
},
4614 {"rjust", (PyCFunction
) unicode_rjust
, 1, rjust__doc__
},
4615 {"rstrip", (PyCFunction
) unicode_rstrip
, 0, rstrip__doc__
},
4616 {"splitlines", (PyCFunction
) unicode_splitlines
, 1, splitlines__doc__
},
4617 {"strip", (PyCFunction
) unicode_strip
, 0, strip__doc__
},
4618 {"swapcase", (PyCFunction
) unicode_swapcase
, 0, swapcase__doc__
},
4619 {"translate", (PyCFunction
) unicode_translate
, 1, translate__doc__
},
4620 {"upper", (PyCFunction
) unicode_upper
, 0, upper__doc__
},
4621 {"startswith", (PyCFunction
) unicode_startswith
, 1, startswith__doc__
},
4622 {"endswith", (PyCFunction
) unicode_endswith
, 1, endswith__doc__
},
4623 {"islower", (PyCFunction
) unicode_islower
, 0, islower__doc__
},
4624 {"isupper", (PyCFunction
) unicode_isupper
, 0, isupper__doc__
},
4625 {"istitle", (PyCFunction
) unicode_istitle
, 0, istitle__doc__
},
4626 {"isspace", (PyCFunction
) unicode_isspace
, 0, isspace__doc__
},
4627 {"isdecimal", (PyCFunction
) unicode_isdecimal
, 0, isdecimal__doc__
},
4628 {"isdigit", (PyCFunction
) unicode_isdigit
, 0, isdigit__doc__
},
4629 {"isnumeric", (PyCFunction
) unicode_isnumeric
, 0, isnumeric__doc__
},
4630 {"isalpha", (PyCFunction
) unicode_isalpha
, 0, isalpha__doc__
},
4631 {"isalnum", (PyCFunction
) unicode_isalnum
, 0, isalnum__doc__
},
4633 {"zfill", (PyCFunction
) unicode_zfill
, 1, zfill__doc__
},
4634 {"capwords", (PyCFunction
) unicode_capwords
, 0, capwords__doc__
},
4638 /* This one is just used for debugging the implementation. */
4639 {"freelistsize", (PyCFunction
) unicode_freelistsize
, 0},
4646 unicode_getattr(PyUnicodeObject
*self
, char *name
)
4648 return Py_FindMethod(unicode_methods
, (PyObject
*) self
, name
);
4651 static PySequenceMethods unicode_as_sequence
= {
4652 (inquiry
) unicode_length
, /* sq_length */
4653 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4654 (intargfunc
) unicode_repeat
, /* sq_repeat */
4655 (intargfunc
) unicode_getitem
, /* sq_item */
4656 (intintargfunc
) unicode_slice
, /* sq_slice */
4657 0, /* sq_ass_item */
4658 0, /* sq_ass_slice */
4659 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4663 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4668 PyErr_SetString(PyExc_SystemError
,
4669 "accessing non-existent unicode segment");
4672 *ptr
= (void *) self
->str
;
4673 return PyUnicode_GET_DATA_SIZE(self
);
4677 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4680 PyErr_SetString(PyExc_TypeError
,
4681 "cannot use unicode as modifyable buffer");
4686 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
4690 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
4695 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
4702 PyErr_SetString(PyExc_SystemError
,
4703 "accessing non-existent unicode segment");
4706 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
4709 *ptr
= (void *) PyString_AS_STRING(str
);
4710 return PyString_GET_SIZE(str
);
4713 /* Helpers for PyUnicode_Format() */
4716 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
4718 int argidx
= *p_argidx
;
4719 if (argidx
< arglen
) {
4724 return PyTuple_GetItem(args
, argidx
);
4726 PyErr_SetString(PyExc_TypeError
,
4727 "not enough arguments for format string");
4731 #define F_LJUST (1<<0)
4732 #define F_SIGN (1<<1)
4733 #define F_BLANK (1<<2)
4734 #define F_ALT (1<<3)
4735 #define F_ZERO (1<<4)
4738 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
4744 va_start(va
, format
);
4746 /* First, format the string as char array, then expand to Py_UNICODE
4748 charbuffer
= (char *)buffer
;
4749 len
= vsprintf(charbuffer
, format
, va
);
4750 for (i
= len
- 1; i
>= 0; i
--)
4751 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
4758 formatfloat(Py_UNICODE
*buf
,
4765 /* fmt = '%#.' + `prec` + `type`
4766 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4770 x
= PyFloat_AsDouble(v
);
4771 if (x
== -1.0 && PyErr_Occurred())
4775 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
4777 sprintf(fmt
, "%%%s.%d%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4778 /* worst case length calc to ensure no buffer overrun:
4780 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4781 for any double rep.)
4782 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4783 If prec=0 the effective precision is 1 (the leading digit is
4784 always given), therefore increase by one to 10+prec. */
4785 if (buflen
<= (size_t)10 + (size_t)prec
) {
4786 PyErr_SetString(PyExc_OverflowError
,
4787 "formatted float is too long (precision too long?)");
4790 return usprintf(buf
, fmt
, x
);
4794 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
4798 PyObject
*str
; /* temporary string object. */
4799 PyUnicodeObject
*result
;
4801 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
4804 result
= _PyUnicode_New(len
);
4805 for (i
= 0; i
< len
; i
++)
4806 result
->str
[i
] = buf
[i
];
4807 result
->str
[len
] = 0;
4809 return (PyObject
*)result
;
4813 formatint(Py_UNICODE
*buf
,
4820 /* fmt = '%#.' + `prec` + 'l' + `type`
4821 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4823 char fmt
[64]; /* plenty big enough! */
4825 int use_native_c_format
= 1;
4827 x
= PyInt_AsLong(v
);
4828 if (x
== -1 && PyErr_Occurred())
4832 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4833 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4834 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
4835 PyErr_SetString(PyExc_OverflowError
,
4836 "formatted integer is too long (precision too long?)");
4839 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4840 * but we want it (for consistency with other %#x conversions, and
4841 * for consistency with Python's hex() function).
4842 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4843 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4844 * So add it only if the platform doesn't already.
4846 if (x
== 0 && (flags
& F_ALT
) && (type
== 'x' || type
== 'X')) {
4847 /* Only way to know what the platform does is to try it. */
4848 sprintf(fmt
, type
== 'x' ? "%#x" : "%#X", 0);
4849 if (fmt
[1] != (char)type
) {
4850 /* Supply our own leading 0x/0X -- needed under std C */
4851 use_native_c_format
= 0;
4852 sprintf(fmt
, "0%c%%#.%dl%c", type
, prec
, type
);
4855 if (use_native_c_format
)
4856 sprintf(fmt
, "%%%s.%dl%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4857 return usprintf(buf
, fmt
, x
);
4861 formatchar(Py_UNICODE
*buf
,
4865 /* presume that the buffer is at least 2 characters long */
4866 if (PyUnicode_Check(v
)) {
4867 if (PyUnicode_GET_SIZE(v
) != 1)
4869 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
4872 else if (PyString_Check(v
)) {
4873 if (PyString_GET_SIZE(v
) != 1)
4875 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
4879 /* Integer input truncated to a character */
4881 x
= PyInt_AsLong(v
);
4882 if (x
== -1 && PyErr_Occurred())
4890 PyErr_SetString(PyExc_TypeError
,
4891 "%c requires int or char");
4895 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4897 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4898 chars are formatted. XXX This is a magic number. Each formatting
4899 routine does bounds checking to ensure no overflow, but a better
4900 solution may be to malloc a buffer of appropriate size for each
4901 format. For now, the current solution is sufficient.
4903 #define FORMATBUFLEN (size_t)120
4905 PyObject
*PyUnicode_Format(PyObject
*format
,
4908 Py_UNICODE
*fmt
, *res
;
4909 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
4911 PyUnicodeObject
*result
= NULL
;
4912 PyObject
*dict
= NULL
;
4915 if (format
== NULL
|| args
== NULL
) {
4916 PyErr_BadInternalCall();
4919 uformat
= PyUnicode_FromObject(format
);
4920 if (uformat
== NULL
)
4922 fmt
= PyUnicode_AS_UNICODE(uformat
);
4923 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
4925 reslen
= rescnt
= fmtcnt
+ 100;
4926 result
= _PyUnicode_New(reslen
);
4929 res
= PyUnicode_AS_UNICODE(result
);
4931 if (PyTuple_Check(args
)) {
4932 arglen
= PyTuple_Size(args
);
4939 if (args
->ob_type
->tp_as_mapping
)
4942 while (--fmtcnt
>= 0) {
4945 rescnt
= fmtcnt
+ 100;
4947 if (_PyUnicode_Resize(&result
, reslen
) < 0)
4949 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
4955 /* Got a format specifier */
4959 Py_UNICODE c
= '\0';
4962 PyObject
*temp
= NULL
;
4966 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
4970 Py_UNICODE
*keystart
;
4976 PyErr_SetString(PyExc_TypeError
,
4977 "format requires a mapping");
4983 /* Skip over balanced parentheses */
4984 while (pcount
> 0 && --fmtcnt
>= 0) {
4987 else if (*fmt
== '(')
4991 keylen
= fmt
- keystart
- 1;
4992 if (fmtcnt
< 0 || pcount
> 0) {
4993 PyErr_SetString(PyExc_ValueError
,
4994 "incomplete format key");
4997 /* keys are converted to strings using UTF-8 and
4998 then looked up since Python uses strings to hold
4999 variables names etc. in its namespaces and we
5000 wouldn't want to break common idioms. */
5001 key
= PyUnicode_EncodeUTF8(keystart
,
5010 args
= PyObject_GetItem(dict
, key
);
5019 while (--fmtcnt
>= 0) {
5020 switch (c
= *fmt
++) {
5021 case '-': flags
|= F_LJUST
; continue;
5022 case '+': flags
|= F_SIGN
; continue;
5023 case ' ': flags
|= F_BLANK
; continue;
5024 case '#': flags
|= F_ALT
; continue;
5025 case '0': flags
|= F_ZERO
; continue;
5030 v
= getnextarg(args
, arglen
, &argidx
);
5033 if (!PyInt_Check(v
)) {
5034 PyErr_SetString(PyExc_TypeError
,
5038 width
= PyInt_AsLong(v
);
5046 else if (c
>= '0' && c
<= '9') {
5048 while (--fmtcnt
>= 0) {
5050 if (c
< '0' || c
> '9')
5052 if ((width
*10) / 10 != width
) {
5053 PyErr_SetString(PyExc_ValueError
,
5057 width
= width
*10 + (c
- '0');
5065 v
= getnextarg(args
, arglen
, &argidx
);
5068 if (!PyInt_Check(v
)) {
5069 PyErr_SetString(PyExc_TypeError
,
5073 prec
= PyInt_AsLong(v
);
5079 else if (c
>= '0' && c
<= '9') {
5081 while (--fmtcnt
>= 0) {
5082 c
= Py_CHARMASK(*fmt
++);
5083 if (c
< '0' || c
> '9')
5085 if ((prec
*10) / 10 != prec
) {
5086 PyErr_SetString(PyExc_ValueError
,
5090 prec
= prec
*10 + (c
- '0');
5095 if (c
== 'h' || c
== 'l' || c
== 'L') {
5101 PyErr_SetString(PyExc_ValueError
,
5102 "incomplete format");
5106 v
= getnextarg(args
, arglen
, &argidx
);
5116 /* presume that buffer length is at least 1 */
5123 if (PyUnicode_Check(v
) && c
== 's') {
5130 temp
= PyObject_Str(v
);
5132 temp
= PyObject_Repr(v
);
5135 if (!PyString_Check(temp
)) {
5136 /* XXX Note: this should never happen, since
5137 PyObject_Repr() and PyObject_Str() assure
5140 PyErr_SetString(PyExc_TypeError
,
5141 "%s argument has non-string str()");
5144 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
5145 PyString_GET_SIZE(temp
),
5153 pbuf
= PyUnicode_AS_UNICODE(temp
);
5154 len
= PyUnicode_GET_SIZE(temp
);
5155 if (prec
>= 0 && len
> prec
)
5167 if (PyLong_Check(v
)) {
5168 temp
= formatlong(v
, flags
, prec
, c
);
5171 pbuf
= PyUnicode_AS_UNICODE(temp
);
5172 len
= PyUnicode_GET_SIZE(temp
);
5173 /* unbounded ints can always produce
5174 a sign character! */
5179 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5183 /* only d conversion is signed */
5196 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5207 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5213 PyErr_Format(PyExc_ValueError
,
5214 "unsupported format character '%c' (0x%x) "
5216 (31<=c
&& c
<=126) ? c
: '?',
5217 c
, fmt
-1 - PyUnicode_AS_UNICODE(uformat
));
5221 if (*pbuf
== '-' || *pbuf
== '+') {
5225 else if (flags
& F_SIGN
)
5227 else if (flags
& F_BLANK
)
5234 if (rescnt
< width
+ (sign
!= 0)) {
5236 rescnt
= width
+ fmtcnt
+ 100;
5238 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5240 res
= PyUnicode_AS_UNICODE(result
)
5250 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5251 assert(pbuf
[0] == '0');
5252 assert(pbuf
[1] == c
);
5263 if (width
> len
&& !(flags
& F_LJUST
)) {
5267 } while (--width
> len
);
5272 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5273 assert(pbuf
[0] == '0');
5274 assert(pbuf
[1] == c
);
5279 Py_UNICODE_COPY(res
, pbuf
, len
);
5282 while (--width
>= len
) {
5286 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5287 PyErr_SetString(PyExc_TypeError
,
5288 "not all arguments converted");
5294 if (argidx
< arglen
&& !dict
) {
5295 PyErr_SetString(PyExc_TypeError
,
5296 "not all arguments converted");
5304 if (_PyUnicode_Resize(&result
, reslen
- rescnt
))
5306 return (PyObject
*)result
;
5317 static PyBufferProcs unicode_as_buffer
= {
5318 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5319 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5320 (getsegcountproc
) unicode_buffer_getsegcount
,
5321 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5324 PyTypeObject PyUnicode_Type
= {
5325 PyObject_HEAD_INIT(&PyType_Type
)
5327 "unicode", /* tp_name */
5328 sizeof(PyUnicodeObject
), /* tp_size */
5329 0, /* tp_itemsize */
5331 (destructor
)_PyUnicode_Free
, /* tp_dealloc */
5333 (getattrfunc
)unicode_getattr
, /* tp_getattr */
5335 (cmpfunc
) unicode_compare
, /* tp_compare */
5336 (reprfunc
) unicode_repr
, /* tp_repr */
5337 0, /* tp_as_number */
5338 &unicode_as_sequence
, /* tp_as_sequence */
5339 0, /* tp_as_mapping */
5340 (hashfunc
) unicode_hash
, /* tp_hash*/
5342 (reprfunc
) unicode_str
, /* tp_str */
5343 (getattrofunc
) NULL
, /* tp_getattro */
5344 (setattrofunc
) NULL
, /* tp_setattro */
5345 &unicode_as_buffer
, /* tp_as_buffer */
5346 Py_TPFLAGS_DEFAULT
, /* tp_flags */
5349 /* Initialize the Unicode implementation */
5351 void _PyUnicode_Init(void)
5355 /* Init the implementation */
5356 unicode_freelist
= NULL
;
5357 unicode_freelist_size
= 0;
5358 unicode_empty
= _PyUnicode_New(0);
5359 strcpy(unicode_default_encoding
, "ascii");
5360 for (i
= 0; i
< 256; i
++)
5361 unicode_latin1
[i
] = NULL
;
5364 /* Finalize the Unicode implementation */
5367 _PyUnicode_Fini(void)
5372 Py_XDECREF(unicode_empty
);
5373 unicode_empty
= NULL
;
5375 for (i
= 0; i
< 256; i
++) {
5376 if (unicode_latin1
[i
]) {
5377 Py_DECREF(unicode_latin1
[i
]);
5378 unicode_latin1
[i
] = NULL
;
5382 for (u
= unicode_freelist
; u
!= NULL
;) {
5383 PyUnicodeObject
*v
= u
;
5384 u
= *(PyUnicodeObject
**)u
;
5387 Py_XDECREF(v
->defenc
);
5390 unicode_freelist
= NULL
;
5391 unicode_freelist_size
= 0;