3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
11 --------------------------------------------------------------------
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
29 * Written by Fredrik Lundh, January 1999.
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
67 #include "unicodeobject.h"
74 /* Limit for the Unicode object free list */
76 #define MAX_UNICODE_FREELIST_SIZE 1024
78 /* Limit for the Unicode object free list stay alive optimization.
80 The implementation will keep allocated Unicode memory intact for
81 all objects on the free list having a size less than this
82 limit. This reduces malloc() overhead for small Unicode objects.
84 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
85 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
86 malloc()-overhead) bytes of unused garbage.
88 Setting the limit to 0 effectively turns the feature off.
90 Note: This is an experimental feature ! If you get core dumps when
91 using Unicode objects, turn this feature off.
95 #define KEEPALIVE_SIZE_LIMIT 9
97 /* Endianness switches; defaults to little endian */
99 #ifdef WORDS_BIGENDIAN
100 # define BYTEORDER_IS_BIG_ENDIAN
102 # define BYTEORDER_IS_LITTLE_ENDIAN
105 /* --- Globals ------------------------------------------------------------
107 The globals are initialized by the _PyUnicode_Init() API and should
108 not be used before calling that API.
112 /* The empty Unicode object */
113 static PyUnicodeObject
*unicode_empty
;
115 /* Free list for Unicode objects */
116 static PyUnicodeObject
*unicode_freelist
;
117 static int unicode_freelist_size
;
119 /* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
127 static char unicode_default_encoding
[100];
129 /* --- Unicode Object ----------------------------------------------------- */
132 int _PyUnicode_Resize(register PyUnicodeObject
*unicode
,
137 /* Shortcut if there's nothing much to do. */
138 if (unicode
->length
== length
)
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode
== unicode_empty
) {
143 PyErr_SetString(PyExc_SystemError
,
144 "can't resize empty unicode object");
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr
= unicode
->str
;
151 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
153 unicode
->str
= oldstr
;
157 unicode
->str
[length
] = 0;
158 unicode
->length
= length
;
161 /* Reset the object caches */
162 if (unicode
->defenc
) {
163 Py_DECREF(unicode
->defenc
);
164 unicode
->defenc
= NULL
;
171 int PyUnicode_Resize(PyObject
**unicode
,
176 if (unicode
== NULL
) {
177 PyErr_BadInternalCall();
180 v
= (PyUnicodeObject
*)*unicode
;
181 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
182 PyErr_BadInternalCall();
185 return _PyUnicode_Resize(v
, length
);
188 /* We allocate one extra Py_UNICODE element (not just one byte) so that
189 the string is always U+0000 terminated -- XXX is this needed ?
191 XXX This allocator could be further enhanced by ensuring that the
192 free list never shrinks below one entry.
197 PyUnicodeObject
*_PyUnicode_New(int length
)
199 register PyUnicodeObject
*unicode
;
201 /* Optimization for empty strings */
202 if (length
== 0 && unicode_empty
!= NULL
) {
203 Py_INCREF(unicode_empty
);
204 return unicode_empty
;
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist
) {
209 unicode
= unicode_freelist
;
210 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
211 unicode_freelist_size
--;
213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode
->length
< length
) &&
216 _PyUnicode_Resize(unicode
, length
)) {
217 PyMem_DEL(unicode
->str
);
222 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
224 PyObject_INIT(unicode
, &PyUnicode_Type
);
227 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
230 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
237 unicode
->str
[length
] = 0;
238 unicode
->length
= length
;
240 unicode
->defenc
= NULL
;
244 _Py_ForgetReference((PyObject
*)unicode
);
245 PyObject_DEL(unicode
);
250 void _PyUnicode_Free(register PyUnicodeObject
*unicode
)
252 if (unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
253 /* Keep-Alive optimization */
254 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
255 PyMem_DEL(unicode
->str
);
259 if (unicode
->defenc
) {
260 Py_DECREF(unicode
->defenc
);
261 unicode
->defenc
= NULL
;
263 /* Add to free list */
264 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
265 unicode_freelist
= unicode
;
266 unicode_freelist_size
++;
269 PyMem_DEL(unicode
->str
);
270 Py_XDECREF(unicode
->defenc
);
271 PyObject_DEL(unicode
);
275 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
278 PyUnicodeObject
*unicode
;
280 unicode
= _PyUnicode_New(size
);
284 /* Copy the Unicode data into the new object */
286 memcpy(unicode
->str
, u
, size
* sizeof(Py_UNICODE
));
288 return (PyObject
*)unicode
;
293 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
296 PyUnicodeObject
*unicode
;
299 PyErr_BadInternalCall();
303 unicode
= _PyUnicode_New(size
);
307 /* Copy the wchar_t data into the new object */
308 #ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
312 register Py_UNICODE
*u
;
314 u
= PyUnicode_AS_UNICODE(unicode
);
315 for (i
= size
; i
>= 0; i
--)
320 return (PyObject
*)unicode
;
323 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
327 if (unicode
== NULL
) {
328 PyErr_BadInternalCall();
331 if (size
> PyUnicode_GET_SIZE(unicode
))
332 size
= PyUnicode_GET_SIZE(unicode
);
333 #ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
337 register Py_UNICODE
*u
;
339 u
= PyUnicode_AS_UNICODE(unicode
);
340 for (i
= size
; i
>= 0; i
--)
350 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
352 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
355 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
356 const char *encoding
,
365 PyErr_BadInternalCall();
370 if (PyInstance_Check(obj
)) {
372 func
= PyObject_GetAttrString(obj
, "__str__");
374 PyErr_SetString(PyExc_TypeError
,
375 "coercing to Unicode: instance doesn't define __str__");
378 obj
= PyEval_CallObject(func
, NULL
);
384 if (PyUnicode_Check(obj
)) {
388 PyErr_SetString(PyExc_TypeError
,
389 "decoding Unicode is not supported");
394 else if (PyString_Check(obj
)) {
395 s
= PyString_AS_STRING(obj
);
396 len
= PyString_GET_SIZE(obj
);
398 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError
))
402 PyErr_Format(PyExc_TypeError
,
403 "coercing to Unicode: need string or buffer, "
405 obj
->ob_type
->tp_name
);
409 /* Convert to Unicode */
411 Py_INCREF(unicode_empty
);
412 v
= (PyObject
*)unicode_empty
;
415 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
429 PyObject
*PyUnicode_Decode(const char *s
,
431 const char *encoding
,
434 PyObject
*buffer
= NULL
, *unicode
;
436 if (encoding
== NULL
)
437 encoding
= PyUnicode_GetDefaultEncoding();
439 /* Shortcuts for common default encodings */
440 if (strcmp(encoding
, "utf-8") == 0)
441 return PyUnicode_DecodeUTF8(s
, size
, errors
);
442 else if (strcmp(encoding
, "latin-1") == 0)
443 return PyUnicode_DecodeLatin1(s
, size
, errors
);
444 else if (strcmp(encoding
, "ascii") == 0)
445 return PyUnicode_DecodeASCII(s
, size
, errors
);
447 /* Decode via the codec registry */
448 buffer
= PyBuffer_FromMemory((void *)s
, size
);
451 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
454 if (!PyUnicode_Check(unicode
)) {
455 PyErr_Format(PyExc_TypeError
,
456 "decoder did not return an unicode object (type=%.400s)",
457 unicode
->ob_type
->tp_name
);
469 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
471 const char *encoding
,
474 PyObject
*v
, *unicode
;
476 unicode
= PyUnicode_FromUnicode(s
, size
);
479 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
484 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
485 const char *encoding
,
490 if (!PyUnicode_Check(unicode
)) {
495 if (encoding
== NULL
)
496 encoding
= PyUnicode_GetDefaultEncoding();
498 /* Shortcuts for common default encodings */
499 if (errors
== NULL
) {
500 if (strcmp(encoding
, "utf-8") == 0)
501 return PyUnicode_AsUTF8String(unicode
);
502 else if (strcmp(encoding
, "latin-1") == 0)
503 return PyUnicode_AsLatin1String(unicode
);
504 else if (strcmp(encoding
, "ascii") == 0)
505 return PyUnicode_AsASCIIString(unicode
);
508 /* Encode via the codec registry */
509 v
= PyCodec_Encode(unicode
, encoding
, errors
);
512 /* XXX Should we really enforce this ? */
513 if (!PyString_Check(v
)) {
514 PyErr_Format(PyExc_TypeError
,
515 "encoder did not return a string object (type=%.400s)",
516 v
->ob_type
->tp_name
);
526 /* Return a Python string holding the default encoded value of the
529 The resulting string is cached in the Unicode object for subsequent
530 usage by this function. The cached version is needed to implement
531 the character buffer interface and will live (at least) as long as
532 the Unicode object itself.
534 The refcount of the string is *not* incremented.
536 *** Exported for internal use by the interpreter only !!! ***
540 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
543 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
547 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
548 if (v
&& errors
== NULL
)
549 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
553 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
555 if (!PyUnicode_Check(unicode
)) {
559 return PyUnicode_AS_UNICODE(unicode
);
565 int PyUnicode_GetSize(PyObject
*unicode
)
567 if (!PyUnicode_Check(unicode
)) {
571 return PyUnicode_GET_SIZE(unicode
);
577 const char *PyUnicode_GetDefaultEncoding(void)
579 return unicode_default_encoding
;
582 int PyUnicode_SetDefaultEncoding(const char *encoding
)
586 /* Make sure the encoding is valid. As side effect, this also
587 loads the encoding into the codec registry cache. */
588 v
= _PyCodec_Lookup(encoding
);
592 strncpy(unicode_default_encoding
,
594 sizeof(unicode_default_encoding
));
601 /* --- UTF-8 Codec -------------------------------------------------------- */
604 char utf8_code_length
[256] = {
605 /* Map UTF-8 encoded prefix byte to sequence length. zero means
606 illegal prefix. see RFC 2279 for details */
607 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
608 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
609 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
610 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
611 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
612 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
613 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
614 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
615 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
616 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
617 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
618 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
619 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
620 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
621 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
622 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
626 int utf8_decoding_error(const char **source
,
631 if ((errors
== NULL
) ||
632 (strcmp(errors
,"strict") == 0)) {
633 PyErr_Format(PyExc_UnicodeError
,
634 "UTF-8 decoding error: %.400s",
638 else if (strcmp(errors
,"ignore") == 0) {
642 else if (strcmp(errors
,"replace") == 0) {
644 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
649 PyErr_Format(PyExc_ValueError
,
650 "UTF-8 decoding error; unknown error handling code: %.400s",
656 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
662 PyUnicodeObject
*unicode
;
664 const char *errmsg
= "";
666 /* Note: size will always be longer than the resulting Unicode
668 unicode
= _PyUnicode_New(size
);
672 return (PyObject
*)unicode
;
674 /* Unpack UTF-8 encoded data */
679 Py_UCS4 ch
= (unsigned char)*s
;
682 *p
++ = (Py_UNICODE
)ch
;
687 n
= utf8_code_length
[ch
];
690 errmsg
= "unexpected end of data";
697 errmsg
= "unexpected code byte";
702 errmsg
= "internal error";
707 if ((s
[1] & 0xc0) != 0x80) {
708 errmsg
= "invalid data";
711 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
713 errmsg
= "illegal encoding";
717 *p
++ = (Py_UNICODE
)ch
;
721 if ((s
[1] & 0xc0) != 0x80 ||
722 (s
[2] & 0xc0) != 0x80) {
723 errmsg
= "invalid data";
726 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
727 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
728 errmsg
= "illegal encoding";
732 *p
++ = (Py_UNICODE
)ch
;
736 if ((s
[1] & 0xc0) != 0x80 ||
737 (s
[2] & 0xc0) != 0x80 ||
738 (s
[3] & 0xc0) != 0x80) {
739 errmsg
= "invalid data";
742 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
743 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
744 /* validate and convert to UTF-16 */
745 if ((ch
< 0x10000) || /* minimum value allowed for 4
747 (ch
> 0x10ffff)) { /* maximum value allowed for
749 errmsg
= "illegal encoding";
752 /* compute and append the two surrogates: */
754 /* translate from 10000..10FFFF to 0..FFFF */
757 /* high surrogate = top 10 bits added to D800 */
758 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
760 /* low surrogate = bottom 10 bits added to DC00 */
761 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& ~0xFC00));
765 /* Other sizes are only needed for UCS-4 */
766 errmsg
= "unsupported Unicode code range";
774 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
779 if (_PyUnicode_Resize(unicode
, p
- unicode
->str
))
782 return (PyObject
*)unicode
;
789 /* Not used anymore, now that the encoder supports UTF-16
793 int utf8_encoding_error(const Py_UNICODE
**source
,
798 if ((errors
== NULL
) ||
799 (strcmp(errors
,"strict") == 0)) {
800 PyErr_Format(PyExc_UnicodeError
,
801 "UTF-8 encoding error: %.400s",
805 else if (strcmp(errors
,"ignore") == 0) {
808 else if (strcmp(errors
,"replace") == 0) {
814 PyErr_Format(PyExc_ValueError
,
815 "UTF-8 encoding error; "
816 "unknown error handling code: %.400s",
823 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
831 unsigned int cbAllocated
= 3 * size
;
832 unsigned int cbWritten
= 0;
835 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
841 p
= q
= PyString_AS_STRING(v
);
848 else if (ch
< 0x0800) {
849 *p
++ = 0xc0 | (ch
>> 6);
850 *p
++ = 0x80 | (ch
& 0x3f);
854 /* Check for high surrogate */
855 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
858 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
860 if (cbWritten
>= (cbAllocated
- 4)) {
861 /* Provide enough room for some more
864 if (_PyString_Resize(&v
, cbAllocated
))
868 /* combine the two values */
869 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
871 *p
++ = (char)((ch
>> 18) | 0xf0);
872 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
879 *p
++ = (char)(0xe0 | (ch
>> 12));
882 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
883 *p
++ = (char)(0x80 | (ch
& 0x3f));
887 if (_PyString_Resize(&v
, p
- q
))
896 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
898 if (!PyUnicode_Check(unicode
)) {
902 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
903 PyUnicode_GET_SIZE(unicode
),
907 /* --- UTF-16 Codec ------------------------------------------------------- */
910 int utf16_decoding_error(const Py_UNICODE
**source
,
915 if ((errors
== NULL
) ||
916 (strcmp(errors
,"strict") == 0)) {
917 PyErr_Format(PyExc_UnicodeError
,
918 "UTF-16 decoding error: %.400s",
922 else if (strcmp(errors
,"ignore") == 0) {
925 else if (strcmp(errors
,"replace") == 0) {
927 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
933 PyErr_Format(PyExc_ValueError
,
934 "UTF-16 decoding error; "
935 "unknown error handling code: %.400s",
941 PyObject
*PyUnicode_DecodeUTF16(const char *s
,
946 PyUnicodeObject
*unicode
;
948 const Py_UNICODE
*q
, *e
;
950 const char *errmsg
= "";
952 /* size should be an even number */
953 if (size
% sizeof(Py_UNICODE
) != 0) {
954 if (utf16_decoding_error(NULL
, NULL
, errors
, "truncated data"))
956 /* The remaining input chars are ignored if we fall through
960 /* Note: size will always be longer than the resulting Unicode
962 unicode
= _PyUnicode_New(size
);
966 return (PyObject
*)unicode
;
968 /* Unpack UTF-16 encoded data */
971 e
= q
+ (size
/ sizeof(Py_UNICODE
));
977 register Py_UNICODE ch
= *q
++;
979 /* Check for BOM marks (U+FEFF) in the input and adjust
980 current byte order setting accordingly. Swap input
981 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
983 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
987 } else if (ch
== 0xFFFE) {
992 ch
= (ch
>> 8) | (ch
<< 8);
997 } else if (ch
== 0xFFFE) {
1002 ch
= (ch
>> 8) | (ch
<< 8);
1004 if (ch
< 0xD800 || ch
> 0xDFFF) {
1009 /* UTF-16 code pair: */
1011 errmsg
= "unexpected end of data";
1014 if (0xDC00 <= *q
&& *q
<= 0xDFFF) {
1016 if (0xD800 <= *q
&& *q
<= 0xDBFF) {
1017 /* This is valid data (a UTF-16 surrogate pair), but
1018 we are not able to store this information since our
1019 Py_UNICODE type only has 16 bits... this might
1020 change someday, even though it's unlikely. */
1021 errmsg
= "code pairs are not supported";
1027 errmsg
= "illegal encoding";
1028 /* Fall through to report the error */
1031 if (utf16_decoding_error(&q
, &p
, errors
, errmsg
))
1039 if (_PyUnicode_Resize(unicode
, p
- unicode
->str
))
1042 return (PyObject
*)unicode
;
1051 PyObject
*PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1060 /* We don't create UTF-16 pairs... */
1061 v
= PyString_FromStringAndSize(NULL
,
1062 sizeof(Py_UNICODE
) * (size
+ (byteorder
== 0)));
1066 q
= PyString_AS_STRING(v
);
1067 p
= (Py_UNICODE
*)q
;
1072 if (byteorder
== 0 ||
1073 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1079 memcpy(p
, s
, size
* sizeof(Py_UNICODE
));
1081 while (size
-- > 0) {
1082 Py_UNICODE ch
= *s
++;
1083 *p
++ = (ch
>> 8) | (ch
<< 8);
1088 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1090 if (!PyUnicode_Check(unicode
)) {
1091 PyErr_BadArgument();
1094 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1095 PyUnicode_GET_SIZE(unicode
),
1100 /* --- Unicode Escape Codec ----------------------------------------------- */
1103 int unicodeescape_decoding_error(const char **source
,
1106 const char *details
)
1108 if ((errors
== NULL
) ||
1109 (strcmp(errors
,"strict") == 0)) {
1110 PyErr_Format(PyExc_UnicodeError
,
1111 "Unicode-Escape decoding error: %.400s",
1115 else if (strcmp(errors
,"ignore") == 0) {
1118 else if (strcmp(errors
,"replace") == 0) {
1119 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1123 PyErr_Format(PyExc_ValueError
,
1124 "Unicode-Escape decoding error; "
1125 "unknown error handling code: %.400s",
1131 static _Py_UCNHashAPI
*pucnHash
= NULL
;
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   s1, s2 -- strings to compare
   count  -- maximum number of characters to look at

   Returns 0 if the compared prefixes are equal ignoring case (or if
   count is 0), otherwise the difference between the first pair of
   lower-cased characters that differ.  Comparison stops at a NUL
   terminator so that neither string is read past its end. */
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        /* tolower() is only defined for values representable as
           unsigned char (or EOF), so convert before calling it. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1153 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1158 Py_UNICODE
*p
= NULL
, *buf
= NULL
;
1162 /* Escaped strings will always be longer than the resulting
1163 Unicode string, so we start with size here and then reduce the
1164 length after conversion to the true value. */
1165 v
= _PyUnicode_New(size
);
1169 return (PyObject
*)v
;
1170 p
= buf
= PyUnicode_AS_UNICODE(v
);
1177 /* Non-escape characters are interpreted as Unicode ordinals */
1179 *p
++ = (unsigned char)*s
++;
1189 case '\\': *p
++ = '\\'; break;
1190 case '\'': *p
++ = '\''; break;
1191 case '\"': *p
++ = '\"'; break;
1192 case 'b': *p
++ = '\b'; break;
1193 case 'f': *p
++ = '\014'; break; /* FF */
1194 case 't': *p
++ = '\t'; break;
1195 case 'n': *p
++ = '\n'; break;
1196 case 'r': *p
++ = '\r'; break;
1197 case 'v': *p
++ = '\013'; break; /* VT */
1198 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1200 /* \OOO (octal) escapes */
1201 case '0': case '1': case '2': case '3':
1202 case '4': case '5': case '6': case '7':
1204 if ('0' <= *s
&& *s
<= '7') {
1205 x
= (x
<<3) + *s
++ - '0';
1206 if ('0' <= *s
&& *s
<= '7')
1207 x
= (x
<<3) + *s
++ - '0';
1212 /* \xXX with two hex digits */
1214 for (x
= 0, i
= 0; i
< 2; i
++) {
1215 c
= (unsigned char)s
[i
];
1217 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1224 if (c
>= '0' && c
<= '9')
1226 else if (c
>= 'a' && c
<= 'f')
1235 /* \uXXXX with 4 hex digits */
1237 for (x
= 0, i
= 0; i
< 4; i
++) {
1238 c
= (unsigned char)s
[i
];
1240 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1241 "truncated \\uXXXX"))
1247 if (c
>= '0' && c
<= '9')
1249 else if (c
>= 'a' && c
<= 'f')
1258 /* \UXXXXXXXX with 8 hex digits */
1260 for (chr
= 0, i
= 0; i
< 8; i
++) {
1261 c
= (unsigned char)s
[i
];
1263 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1264 "truncated \\uXXXX"))
1269 chr
= (chr
<<4) & ~0xF;
1270 if (c
>= '0' && c
<= '9')
1272 else if (c
>= 'a' && c
<= 'f')
1273 chr
+= 10 + c
- 'a';
1275 chr
+= 10 + c
- 'A';
1281 /* Ok, we need to deal with Unicode Character Names now,
1282 * make sure we've imported the hash table data...
1284 if (pucnHash
== NULL
) {
1285 PyObject
*mod
= 0, *v
= 0;
1286 mod
= PyImport_ImportModule("ucnhash");
1289 v
= PyObject_GetAttrString(mod
,"ucnhashAPI");
1293 pucnHash
= PyCObject_AsVoidPtr(v
);
1295 if (pucnHash
== NULL
)
1300 const char *start
= s
+ 1;
1301 const char *endBrace
= start
;
1304 /* look for either the closing brace, or we
1305 * exceed the maximum length of the unicode character names
1307 while (*endBrace
!= '}' &&
1308 (unsigned int)(endBrace
- start
) <=
1314 if (endBrace
!= end
&& *endBrace
== '}') {
1315 j
= pucnHash
->hash(start
, endBrace
- start
);
1316 if (j
> pucnHash
->cKeys
||
1319 ((_Py_UnicodeCharacterName
*)
1320 (pucnHash
->getValue(j
)))->pszUCN
,
1321 (int)(endBrace
- start
)) != 0)
1323 if (unicodeescape_decoding_error(
1325 "Invalid Unicode Character Name"))
1329 goto ucnFallthrough
;
1331 chr
= ((_Py_UnicodeCharacterName
*)
1332 (pucnHash
->getValue(j
)))->value
;
1336 if (unicodeescape_decoding_error(
1338 "Unicode name missing closing brace"))
1340 goto ucnFallthrough
;
1344 if (unicodeescape_decoding_error(
1346 "Missing opening brace for Unicode Character Name escape"))
1349 /* fall through on purpose */
1352 *p
++ = (unsigned char)s
[-1];
1355 /* when we get here, chr is a 32-bit unicode character */
1357 /* UCS-2 character */
1358 *p
++ = (Py_UNICODE
) chr
;
1359 else if (chr
<= 0x10ffff) {
1360 /* UCS-4 character. store as two surrogate characters */
1362 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1363 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& ~0xFC00);
1365 if (unicodeescape_decoding_error(
1367 "Illegal Unicode character")
1373 if (_PyUnicode_Resize(v
, (int)(p
- buf
)))
1375 return (PyObject
*)v
;
1382 /* Return a Unicode-Escape string version of the Unicode object.
1384 If quotes is true, the string is enclosed in u"" or u'' quotes as
1389 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1394 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1402 static const char *hexdigit
= "0123456789ABCDEF";
1404 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1408 p
= q
= PyString_AS_STRING(repr
);
1412 *p
++ = (findchar(s
, size
, '\'') &&
1413 !findchar(s
, size
, '"')) ? '"' : '\'';
1415 while (size
-- > 0) {
1416 Py_UNICODE ch
= *s
++;
1418 if (quotes
&& (ch
== q
[1] || ch
== '\\')) {
1422 /* Map 16-bit characters to '\uxxxx' */
1423 else if (ch
>= 256) {
1426 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1427 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1428 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1429 *p
++ = hexdigit
[ch
& 15];
1431 /* Map non-printable US ASCII to '\ooo' */
1432 else if (ch
< ' ' || ch
>= 128) {
1434 *p
++ = hexdigit
[(ch
>> 6) & 7];
1435 *p
++ = hexdigit
[(ch
>> 3) & 7];
1436 *p
++ = hexdigit
[ch
& 7];
1438 /* Copy everything else as-is */
1446 if (_PyString_Resize(&repr
, p
- q
))
1456 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1459 return unicodeescape_string(s
, size
, 0);
1462 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1464 if (!PyUnicode_Check(unicode
)) {
1465 PyErr_BadArgument();
1468 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1469 PyUnicode_GET_SIZE(unicode
));
1472 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1474 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1479 Py_UNICODE
*p
, *buf
;
1483 /* Escaped strings will always be longer than the resulting
1484 Unicode string, so we start with size here and then reduce the
1485 length after conversion to the true value. */
1486 v
= _PyUnicode_New(size
);
1490 return (PyObject
*)v
;
1491 p
= buf
= PyUnicode_AS_UNICODE(v
);
1498 /* Non-escape characters are interpreted as Unicode ordinals */
1500 *p
++ = (unsigned char)*s
++;
1504 /* \u-escapes are only interpreted iff the number of leading
1505 backslashes is odd */
1510 *p
++ = (unsigned char)*s
++;
1512 if (((s
- bs
) & 1) == 0 ||
1520 /* \uXXXX with 4 hex digits */
1521 for (x
= 0, i
= 0; i
< 4; i
++) {
1522 c
= (unsigned char)s
[i
];
1524 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1525 "truncated \\uXXXX"))
1531 if (c
>= '0' && c
<= '9')
1533 else if (c
>= 'a' && c
<= 'f')
1541 if (_PyUnicode_Resize(v
, (int)(p
- buf
)))
1543 return (PyObject
*)v
;
1550 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1557 static const char *hexdigit
= "0123456789ABCDEF";
1559 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1565 p
= q
= PyString_AS_STRING(repr
);
1566 while (size
-- > 0) {
1567 Py_UNICODE ch
= *s
++;
1568 /* Map 16-bit characters to '\uxxxx' */
1572 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1573 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1574 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1575 *p
++ = hexdigit
[ch
& 15];
1577 /* Copy everything else as-is */
1582 if (_PyString_Resize(&repr
, p
- q
))
1592 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
1594 if (!PyUnicode_Check(unicode
)) {
1595 PyErr_BadArgument();
1598 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1599 PyUnicode_GET_SIZE(unicode
));
1602 /* --- Latin-1 Codec ------------------------------------------------------ */
1604 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
1611 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612 v
= _PyUnicode_New(size
);
1616 return (PyObject
*)v
;
1617 p
= PyUnicode_AS_UNICODE(v
);
1619 *p
++ = (unsigned char)*s
++;
1620 return (PyObject
*)v
;
1628 int latin1_encoding_error(const Py_UNICODE
**source
,
1631 const char *details
)
1633 if ((errors
== NULL
) ||
1634 (strcmp(errors
,"strict") == 0)) {
1635 PyErr_Format(PyExc_UnicodeError
,
1636 "Latin-1 encoding error: %.400s",
1640 else if (strcmp(errors
,"ignore") == 0) {
1643 else if (strcmp(errors
,"replace") == 0) {
1649 PyErr_Format(PyExc_ValueError
,
1650 "Latin-1 encoding error; "
1651 "unknown error handling code: %.400s",
1657 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
1664 repr
= PyString_FromStringAndSize(NULL
, size
);
1670 s
= PyString_AS_STRING(repr
);
1672 while (size
-- > 0) {
1673 Py_UNICODE ch
= *p
++;
1675 if (latin1_encoding_error(&p
, &s
, errors
,
1676 "ordinal not in range(256)"))
1682 /* Resize if error handling skipped some characters */
1683 if (s
- start
< PyString_GET_SIZE(repr
))
1684 if (_PyString_Resize(&repr
, s
- start
))
1693 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
1695 if (!PyUnicode_Check(unicode
)) {
1696 PyErr_BadArgument();
1699 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
1700 PyUnicode_GET_SIZE(unicode
),
1704 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1707 int ascii_decoding_error(const char **source
,
1710 const char *details
)
1712 if ((errors
== NULL
) ||
1713 (strcmp(errors
,"strict") == 0)) {
1714 PyErr_Format(PyExc_UnicodeError
,
1715 "ASCII decoding error: %.400s",
1719 else if (strcmp(errors
,"ignore") == 0) {
1722 else if (strcmp(errors
,"replace") == 0) {
1723 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1728 PyErr_Format(PyExc_ValueError
,
1729 "ASCII decoding error; "
1730 "unknown error handling code: %.400s",
1736 PyObject
*PyUnicode_DecodeASCII(const char *s
,
1743 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744 v
= _PyUnicode_New(size
);
1748 return (PyObject
*)v
;
1749 p
= PyUnicode_AS_UNICODE(v
);
1750 while (size
-- > 0) {
1751 register unsigned char c
;
1753 c
= (unsigned char)*s
++;
1756 else if (ascii_decoding_error(&s
, &p
, errors
,
1757 "ordinal not in range(128)"))
1760 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
1761 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1763 return (PyObject
*)v
;
1771 int ascii_encoding_error(const Py_UNICODE
**source
,
1774 const char *details
)
1776 if ((errors
== NULL
) ||
1777 (strcmp(errors
,"strict") == 0)) {
1778 PyErr_Format(PyExc_UnicodeError
,
1779 "ASCII encoding error: %.400s",
1783 else if (strcmp(errors
,"ignore") == 0) {
1786 else if (strcmp(errors
,"replace") == 0) {
1792 PyErr_Format(PyExc_ValueError
,
1793 "ASCII encoding error; "
1794 "unknown error handling code: %.400s",
1800 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
1807 repr
= PyString_FromStringAndSize(NULL
, size
);
1813 s
= PyString_AS_STRING(repr
);
1815 while (size
-- > 0) {
1816 Py_UNICODE ch
= *p
++;
1818 if (ascii_encoding_error(&p
, &s
, errors
,
1819 "ordinal not in range(128)"))
1825 /* Resize if error handling skipped some characters */
1826 if (s
- start
< PyString_GET_SIZE(repr
))
1827 if (_PyString_Resize(&repr
, s
- start
))
1836 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
1838 if (!PyUnicode_Check(unicode
)) {
1839 PyErr_BadArgument();
1842 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
1843 PyUnicode_GET_SIZE(unicode
),
1849 /* --- MBCS codecs for Windows -------------------------------------------- */
1851 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
1858 /* First get the size of the result */
1859 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
1860 if (size
> 0 && usize
==0)
1861 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1863 v
= _PyUnicode_New(usize
);
1867 return (PyObject
*)v
;
1868 p
= PyUnicode_AS_UNICODE(v
);
1869 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1874 return (PyObject
*)v
;
1877 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
1885 /* If there are no characters, bail now! */
1887 return PyString_FromString("");
1889 /* First get the size of the result */
1890 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
1892 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1894 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
1900 /* Do the conversion */
1901 s
= PyString_AS_STRING(repr
);
1902 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
1904 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1909 #endif /* MS_WIN32 */
1911 /* --- Character Mapping Codec -------------------------------------------- */
1914 int charmap_decoding_error(const char **source
,
1917 const char *details
)
1919 if ((errors
== NULL
) ||
1920 (strcmp(errors
,"strict") == 0)) {
1921 PyErr_Format(PyExc_UnicodeError
,
1922 "charmap decoding error: %.400s",
1926 else if (strcmp(errors
,"ignore") == 0) {
1929 else if (strcmp(errors
,"replace") == 0) {
1930 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1935 PyErr_Format(PyExc_ValueError
,
1936 "charmap decoding error; "
1937 "unknown error handling code: %.400s",
1943 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
1951 /* Default to Latin-1 */
1952 if (mapping
== NULL
)
1953 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1955 v
= _PyUnicode_New(size
);
1959 return (PyObject
*)v
;
1960 p
= PyUnicode_AS_UNICODE(v
);
1961 while (size
-- > 0) {
1962 unsigned char ch
= *s
++;
1965 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1966 w
= PyInt_FromLong((long)ch
);
1969 x
= PyObject_GetItem(mapping
, w
);
1972 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
1973 /* No mapping found: default to Latin-1 mapping */
1975 *p
++ = (Py_UNICODE
)ch
;
1982 if (PyInt_Check(x
)) {
1983 long value
= PyInt_AS_LONG(x
);
1984 if (value
< 0 || value
> 65535) {
1985 PyErr_SetString(PyExc_TypeError
,
1986 "character mapping must be in range(65536)");
1990 *p
++ = (Py_UNICODE
)value
;
1992 else if (x
== Py_None
) {
1993 /* undefined mapping */
1994 if (charmap_decoding_error(&s
, &p
, errors
,
1995 "character maps to <undefined>")) {
2000 else if (PyUnicode_Check(x
)) {
2001 if (PyUnicode_GET_SIZE(x
) != 1) {
2003 PyErr_SetString(PyExc_NotImplementedError
,
2004 "1-n mappings are currently not implemented");
2008 *p
++ = *PyUnicode_AS_UNICODE(x
);
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError
,
2013 "character mapping must return integer, None or unicode");
2019 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2020 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2022 return (PyObject
*)v
;
2030 int charmap_encoding_error(const Py_UNICODE
**source
,
2033 const char *details
)
2035 if ((errors
== NULL
) ||
2036 (strcmp(errors
,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError
,
2038 "charmap encoding error: %.400s",
2042 else if (strcmp(errors
,"ignore") == 0) {
2045 else if (strcmp(errors
,"replace") == 0) {
2051 PyErr_Format(PyExc_ValueError
,
2052 "charmap encoding error; "
2053 "unknown error handling code: %.400s",
2059 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2067 /* Default to Latin-1 */
2068 if (mapping
== NULL
)
2069 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2071 v
= PyString_FromStringAndSize(NULL
, size
);
2076 s
= PyString_AS_STRING(v
);
2077 while (size
-- > 0) {
2078 Py_UNICODE ch
= *p
++;
2081 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2082 w
= PyInt_FromLong((long)ch
);
2085 x
= PyObject_GetItem(mapping
, w
);
2088 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2089 /* No mapping found: default to Latin-1 mapping if possible */
2095 else if (!charmap_encoding_error(&p
, &s
, errors
,
2096 "missing character mapping"))
2103 if (PyInt_Check(x
)) {
2104 long value
= PyInt_AS_LONG(x
);
2105 if (value
< 0 || value
> 255) {
2106 PyErr_SetString(PyExc_TypeError
,
2107 "character mapping must be in range(256)");
2113 else if (x
== Py_None
) {
2114 /* undefined mapping */
2115 if (charmap_encoding_error(&p
, &s
, errors
,
2116 "character maps to <undefined>")) {
2121 else if (PyString_Check(x
)) {
2122 if (PyString_GET_SIZE(x
) != 1) {
2124 PyErr_SetString(PyExc_NotImplementedError
,
2125 "1-n mappings are currently not implemented");
2129 *s
++ = *PyString_AS_STRING(x
);
2132 /* wrong return value */
2133 PyErr_SetString(PyExc_TypeError
,
2134 "character mapping must return integer, None or unicode");
2140 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2141 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2150 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2153 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2154 PyErr_BadArgument();
2157 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2158 PyUnicode_GET_SIZE(unicode
),
2164 int translate_error(const Py_UNICODE
**source
,
2167 const char *details
)
2169 if ((errors
== NULL
) ||
2170 (strcmp(errors
,"strict") == 0)) {
2171 PyErr_Format(PyExc_UnicodeError
,
2172 "translate error: %.400s",
2176 else if (strcmp(errors
,"ignore") == 0) {
2179 else if (strcmp(errors
,"replace") == 0) {
2185 PyErr_Format(PyExc_ValueError
,
2187 "unknown error handling code: %.400s",
2193 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2201 if (mapping
== NULL
) {
2202 PyErr_BadArgument();
2206 /* Output will never be longer than input */
2207 v
= _PyUnicode_New(size
);
2212 p
= PyUnicode_AS_UNICODE(v
);
2213 while (size
-- > 0) {
2214 Py_UNICODE ch
= *s
++;
2218 w
= PyInt_FromLong(ch
);
2221 x
= PyObject_GetItem(mapping
, w
);
2224 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2225 /* No mapping found: default to 1-1 mapping */
2235 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2236 else if (x
== Py_None
) {
2237 /* undefined mapping */
2238 if (translate_error(&s
, &p
, errors
,
2239 "character maps to <undefined>")) {
2244 else if (PyUnicode_Check(x
)) {
2245 if (PyUnicode_GET_SIZE(x
) != 1) {
2247 PyErr_SetString(PyExc_NotImplementedError
,
2248 "1-n mappings are currently not implemented");
2252 *p
++ = *PyUnicode_AS_UNICODE(x
);
2255 /* wrong return value */
2256 PyErr_SetString(PyExc_TypeError
,
2257 "translate mapping must return integer, None or unicode");
2263 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2264 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2268 return (PyObject
*)v
;
2275 PyObject
*PyUnicode_Translate(PyObject
*str
,
2281 str
= PyUnicode_FromObject(str
);
2284 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2285 PyUnicode_GET_SIZE(str
),
2296 /* --- Decimal Encoder ---------------------------------------------------- */
2298 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2303 Py_UNICODE
*p
, *end
;
2305 if (output
== NULL
) {
2306 PyErr_BadArgument();
2313 register Py_UNICODE ch
= *p
++;
2316 if (Py_UNICODE_ISSPACE(ch
)) {
2320 decimal
= Py_UNICODE_TODECIMAL(ch
);
2322 *output
++ = '0' + decimal
;
2325 if (0 < ch
&& ch
< 256) {
2326 *output
++ = (char)ch
;
2329 /* All other characters are considered invalid */
2330 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2331 PyErr_SetString(PyExc_ValueError
,
2332 "invalid decimal Unicode string");
2335 else if (strcmp(errors
, "ignore") == 0)
2337 else if (strcmp(errors
, "replace") == 0) {
2342 /* 0-terminate the output string */
2350 /* --- Helpers ------------------------------------------------------------ */
2353 int count(PyUnicodeObject
*self
,
2356 PyUnicodeObject
*substring
)
2360 if (substring
->length
== 0)
2361 return (end
- start
+ 1);
2363 end
-= substring
->length
;
2365 while (start
<= end
)
2366 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2368 start
+= substring
->length
;
2375 int PyUnicode_Count(PyObject
*str
,
2382 str
= PyUnicode_FromObject(str
);
2385 substr
= PyUnicode_FromObject(substr
);
2386 if (substr
== NULL
) {
2391 result
= count((PyUnicodeObject
*)str
,
2393 (PyUnicodeObject
*)substr
);
2401 int findstring(PyUnicodeObject
*self
,
2402 PyUnicodeObject
*substring
,
2408 start
+= self
->length
;
2412 if (substring
->length
== 0)
2415 if (end
> self
->length
)
2418 end
+= self
->length
;
2422 end
-= substring
->length
;
2424 if (direction
< 0) {
2425 for (; end
>= start
; end
--)
2426 if (Py_UNICODE_MATCH(self
, end
, substring
))
2429 for (; start
<= end
; start
++)
2430 if (Py_UNICODE_MATCH(self
, start
, substring
))
2437 int PyUnicode_Find(PyObject
*str
,
2445 str
= PyUnicode_FromObject(str
);
2448 substr
= PyUnicode_FromObject(substr
);
2449 if (substr
== NULL
) {
2454 result
= findstring((PyUnicodeObject
*)str
,
2455 (PyUnicodeObject
*)substr
,
2456 start
, end
, direction
);
2463 int tailmatch(PyUnicodeObject
*self
,
2464 PyUnicodeObject
*substring
,
2470 start
+= self
->length
;
2474 if (substring
->length
== 0)
2477 if (end
> self
->length
)
2480 end
+= self
->length
;
2484 end
-= substring
->length
;
2488 if (direction
> 0) {
2489 if (Py_UNICODE_MATCH(self
, end
, substring
))
2492 if (Py_UNICODE_MATCH(self
, start
, substring
))
2499 int PyUnicode_Tailmatch(PyObject
*str
,
2507 str
= PyUnicode_FromObject(str
);
2510 substr
= PyUnicode_FromObject(substr
);
2511 if (substr
== NULL
) {
2516 result
= tailmatch((PyUnicodeObject
*)str
,
2517 (PyUnicodeObject
*)substr
,
2518 start
, end
, direction
);
2525 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2529 /* like wcschr, but doesn't stop at NULL characters */
2531 while (size
-- > 0) {
2540 /* Apply fixfct filter to the Unicode object self and return a
2541 reference to the modified object */
2544 PyObject
*fixup(PyUnicodeObject
*self
,
2545 int (*fixfct
)(PyUnicodeObject
*s
))
2550 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(self
->str
,
2555 /* fixfct should return TRUE if it modified the buffer. If
2556 FALSE, return a reference to the original buffer instead
2557 (to save space, not time) */
2560 return (PyObject
*) self
;
2562 return (PyObject
*) u
;
2566 int fixupper(PyUnicodeObject
*self
)
2568 int len
= self
->length
;
2569 Py_UNICODE
*s
= self
->str
;
2573 register Py_UNICODE ch
;
2575 ch
= Py_UNICODE_TOUPPER(*s
);
2587 int fixlower(PyUnicodeObject
*self
)
2589 int len
= self
->length
;
2590 Py_UNICODE
*s
= self
->str
;
2594 register Py_UNICODE ch
;
2596 ch
= Py_UNICODE_TOLOWER(*s
);
2608 int fixswapcase(PyUnicodeObject
*self
)
2610 int len
= self
->length
;
2611 Py_UNICODE
*s
= self
->str
;
2615 if (Py_UNICODE_ISUPPER(*s
)) {
2616 *s
= Py_UNICODE_TOLOWER(*s
);
2618 } else if (Py_UNICODE_ISLOWER(*s
)) {
2619 *s
= Py_UNICODE_TOUPPER(*s
);
2629 int fixcapitalize(PyUnicodeObject
*self
)
2631 if (self
->length
> 0 && Py_UNICODE_ISLOWER(self
->str
[0])) {
2632 self
->str
[0] = Py_UNICODE_TOUPPER(self
->str
[0]);
2639 int fixtitle(PyUnicodeObject
*self
)
2641 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
2642 register Py_UNICODE
*e
;
2643 int previous_is_cased
;
2645 /* Shortcut for single character strings */
2646 if (PyUnicode_GET_SIZE(self
) == 1) {
2647 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
2656 e
= p
+ PyUnicode_GET_SIZE(self
);
2657 previous_is_cased
= 0;
2658 for (; p
< e
; p
++) {
2659 register const Py_UNICODE ch
= *p
;
2661 if (previous_is_cased
)
2662 *p
= Py_UNICODE_TOLOWER(ch
);
2664 *p
= Py_UNICODE_TOTITLE(ch
);
2666 if (Py_UNICODE_ISLOWER(ch
) ||
2667 Py_UNICODE_ISUPPER(ch
) ||
2668 Py_UNICODE_ISTITLE(ch
))
2669 previous_is_cased
= 1;
2671 previous_is_cased
= 0;
2676 PyObject
*PyUnicode_Join(PyObject
*separator
,
2681 PyUnicodeObject
*res
= NULL
;
2688 seqlen
= PySequence_Size(seq
);
2689 if (seqlen
< 0 && PyErr_Occurred())
2692 if (separator
== NULL
) {
2693 Py_UNICODE blank
= ' ';
2698 separator
= PyUnicode_FromObject(separator
);
2699 if (separator
== NULL
)
2701 sep
= PyUnicode_AS_UNICODE(separator
);
2702 seplen
= PyUnicode_GET_SIZE(separator
);
2705 res
= _PyUnicode_New(sz
);
2708 p
= PyUnicode_AS_UNICODE(res
);
2711 for (i
= 0; i
< seqlen
; i
++) {
2715 item
= PySequence_GetItem(seq
, i
);
2718 if (!PyUnicode_Check(item
)) {
2720 v
= PyUnicode_FromObject(item
);
2726 itemlen
= PyUnicode_GET_SIZE(item
);
2727 while (reslen
+ itemlen
+ seplen
>= sz
) {
2728 if (_PyUnicode_Resize(res
, sz
*2))
2731 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
2734 memcpy(p
, sep
, seplen
* sizeof(Py_UNICODE
));
2738 memcpy(p
, PyUnicode_AS_UNICODE(item
), itemlen
* sizeof(Py_UNICODE
));
2743 if (_PyUnicode_Resize(res
, reslen
))
2746 Py_XDECREF(separator
);
2747 return (PyObject
*)res
;
2750 Py_XDECREF(separator
);
2756 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
2768 if (left
== 0 && right
== 0) {
2773 u
= _PyUnicode_New(left
+ self
->length
+ right
);
2776 Py_UNICODE_FILL(u
->str
, fill
, left
);
2777 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
2779 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
2785 #define SPLIT_APPEND(data, left, right) \
2786 str = PyUnicode_FromUnicode(data + left, right - left); \
2789 if (PyList_Append(list, str)) { \
2797 PyObject
*split_whitespace(PyUnicodeObject
*self
,
2803 int len
= self
->length
;
2806 for (i
= j
= 0; i
< len
; ) {
2808 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2811 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
2814 if (maxcount
-- <= 0)
2816 SPLIT_APPEND(self
->str
, j
, i
);
2817 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2823 SPLIT_APPEND(self
->str
, j
, len
);
2832 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
2842 string
= PyUnicode_FromObject(string
);
2845 data
= PyUnicode_AS_UNICODE(string
);
2846 len
= PyUnicode_GET_SIZE(string
);
2848 list
= PyList_New(0);
2852 for (i
= j
= 0; i
< len
; ) {
2855 /* Find a line and append it */
2856 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
2859 /* Skip the line break reading CRLF as one line break */
2862 if (data
[i
] == '\r' && i
+ 1 < len
&&
2870 SPLIT_APPEND(data
, j
, eol
);
2874 SPLIT_APPEND(data
, j
, len
);
2887 PyObject
*split_char(PyUnicodeObject
*self
,
2894 int len
= self
->length
;
2897 for (i
= j
= 0; i
< len
; ) {
2898 if (self
->str
[i
] == ch
) {
2899 if (maxcount
-- <= 0)
2901 SPLIT_APPEND(self
->str
, j
, i
);
2907 SPLIT_APPEND(self
->str
, j
, len
);
2917 PyObject
*split_substring(PyUnicodeObject
*self
,
2919 PyUnicodeObject
*substring
,
2924 int len
= self
->length
;
2925 int sublen
= substring
->length
;
2928 for (i
= j
= 0; i
< len
- sublen
; ) {
2929 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
2930 if (maxcount
-- <= 0)
2932 SPLIT_APPEND(self
->str
, j
, i
);
2938 SPLIT_APPEND(self
->str
, j
, len
);
2950 PyObject
*split(PyUnicodeObject
*self
,
2951 PyUnicodeObject
*substring
,
2959 list
= PyList_New(0);
2963 if (substring
== NULL
)
2964 return split_whitespace(self
,list
,maxcount
);
2966 else if (substring
->length
== 1)
2967 return split_char(self
,list
,substring
->str
[0],maxcount
);
2969 else if (substring
->length
== 0) {
2971 PyErr_SetString(PyExc_ValueError
, "empty separator");
2975 return split_substring(self
,list
,substring
,maxcount
);
2979 PyObject
*strip(PyUnicodeObject
*self
,
2983 Py_UNICODE
*p
= self
->str
;
2985 int end
= self
->length
;
2988 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
2992 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
2995 if (start
== 0 && end
== self
->length
) {
2996 /* couldn't strip anything off, return original string */
2998 return (PyObject
*) self
;
3001 return (PyObject
*) PyUnicode_FromUnicode(
3008 PyObject
*replace(PyUnicodeObject
*self
,
3009 PyUnicodeObject
*str1
,
3010 PyUnicodeObject
*str2
,
3018 if (str1
->length
== 1 && str2
->length
== 1) {
3021 /* replace characters */
3022 if (!findchar(self
->str
, self
->length
, str1
->str
[0])) {
3023 /* nothing to replace, return original string */
3027 Py_UNICODE u1
= str1
->str
[0];
3028 Py_UNICODE u2
= str2
->str
[0];
3030 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3035 for (i
= 0; i
< u
->length
; i
++)
3036 if (u
->str
[i
] == u1
) {
3047 /* replace strings */
3048 n
= count(self
, 0, self
->length
, str1
);
3052 /* nothing to replace, return original string */
3057 self
->length
+ n
* (str2
->length
- str1
->length
));
3061 while (i
<= self
->length
- str1
->length
)
3062 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3063 /* replace string segment */
3064 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3068 /* copy remaining part */
3069 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3073 *p
++ = self
->str
[i
++];
3078 return (PyObject
*) u
;
3081 /* --- Unicode Object Methods --------------------------------------------- */
3083 static char title__doc__
[] =
3084 "S.title() -> unicode\n\
3086 Return a titlecased version of S, i.e. words start with title case\n\
3087 characters, all remaining cased characters have lower case.";
3090 unicode_title(PyUnicodeObject
*self
, PyObject
*args
)
3092 if (!PyArg_NoArgs(args
))
3094 return fixup(self
, fixtitle
);
3097 static char capitalize__doc__
[] =
3098 "S.capitalize() -> unicode\n\
3100 Return a capitalized version of S, i.e. make the first character\n\
3104 unicode_capitalize(PyUnicodeObject
*self
, PyObject
*args
)
3106 if (!PyArg_NoArgs(args
))
3108 return fixup(self
, fixcapitalize
);
3112 static char capwords__doc__
[] =
3113 "S.capwords() -> unicode\n\
3115 Apply .capitalize() to all words in S and return the result with\n\
3116 normalized whitespace (all whitespace strings are replaced by ' ').";
3119 unicode_capwords(PyUnicodeObject
*self
, PyObject
*args
)
3125 if (!PyArg_NoArgs(args
))
3128 /* Split into words */
3129 list
= split(self
, NULL
, -1);
3133 /* Capitalize each word */
3134 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3135 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3139 Py_DECREF(PyList_GET_ITEM(list
, i
));
3140 PyList_SET_ITEM(list
, i
, item
);
3143 /* Join the words to form a new string */
3144 item
= PyUnicode_Join(NULL
, list
);
3148 return (PyObject
*)item
;
3152 static char center__doc__
[] =
3153 "S.center(width) -> unicode\n\
3155 Return S centered in a Unicode string of length width. Padding is done\n\
3159 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3164 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3167 if (self
->length
>= width
) {
3169 return (PyObject
*) self
;
3172 marg
= width
- self
->length
;
3173 left
= marg
/ 2 + (marg
& width
& 1);
3175 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3180 /* This code should go into some future Unicode collation support
3181 module. The basic comparison should compare ordinals on a naive
3182 basis (this is what Java does and thus JPython too). */
3184 /* speedy UTF-16 code point order comparison */
3186 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3188 static short utf16Fixup
[32] =
3190 0, 0, 0, 0, 0, 0, 0, 0,
3191 0, 0, 0, 0, 0, 0, 0, 0,
3192 0, 0, 0, 0, 0, 0, 0, 0,
3193 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3197 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3201 Py_UNICODE
*s1
= str1
->str
;
3202 Py_UNICODE
*s2
= str2
->str
;
3204 len1
= str1
->length
;
3205 len2
= str2
->length
;
3207 while (len1
> 0 && len2
> 0) {
3213 if (c1
> (1<<11) * 26)
3214 c1
+= utf16Fixup
[c1
>>11];
3215 if (c2
> (1<<11) * 26)
3216 c2
+= utf16Fixup
[c2
>>11];
3218 /* now c1 and c2 are in UTF-32-compatible order */
3219 diff
= (long)c1
- (long)c2
;
3221 return (diff
< 0) ? -1 : (diff
!= 0);
3225 return (len1
< len2
) ? -1 : (len1
!= len2
);
3231 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3233 register int len1
, len2
;
3235 Py_UNICODE
*s1
= str1
->str
;
3236 Py_UNICODE
*s2
= str2
->str
;
3238 len1
= str1
->length
;
3239 len2
= str2
->length
;
3241 while (len1
> 0 && len2
> 0) {
3244 diff
= (long)*s1
++ - (long)*s2
++;
3246 return (diff
< 0) ? -1 : (diff
!= 0);
3250 return (len1
< len2
) ? -1 : (len1
!= len2
);
3255 int PyUnicode_Compare(PyObject
*left
,
3258 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3261 /* Coerce the two arguments */
3262 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3265 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3269 /* Shortcut for empty or interned objects */
3276 result
= unicode_compare(u
, v
);
3288 int PyUnicode_Contains(PyObject
*container
,
3291 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3293 register const Py_UNICODE
*p
, *e
;
3294 register Py_UNICODE ch
;
3296 /* Coerce the two arguments */
3297 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3299 PyErr_SetString(PyExc_TypeError
,
3300 "'in <string>' requires character as left operand");
3303 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3310 if (PyUnicode_GET_SIZE(v
) != 1) {
3311 PyErr_SetString(PyExc_TypeError
,
3312 "'in <string>' requires character as left operand");
3315 ch
= *PyUnicode_AS_UNICODE(v
);
3316 p
= PyUnicode_AS_UNICODE(u
);
3317 e
= p
+ PyUnicode_GET_SIZE(u
);
3336 /* Concat to string or Unicode object giving a new Unicode object. */
3338 PyObject
*PyUnicode_Concat(PyObject
*left
,
3341 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3343 /* Coerce the two arguments */
3344 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3347 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3352 if (v
== unicode_empty
) {
3354 return (PyObject
*)u
;
3356 if (u
== unicode_empty
) {
3358 return (PyObject
*)v
;
3361 /* Concat the two Unicode strings */
3362 w
= _PyUnicode_New(u
->length
+ v
->length
);
3365 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3366 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3370 return (PyObject
*)w
;
3378 static char count__doc__
[] =
3379 "S.count(sub[, start[, end]]) -> int\n\
3381 Return the number of occurrences of substring sub in Unicode string\n\
3382 S[start:end]. Optional arguments start and end are\n\
3383 interpreted as in slice notation.";
3386 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3388 PyUnicodeObject
*substring
;
3393 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3394 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3397 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3398 (PyObject
*)substring
);
3399 if (substring
== NULL
)
3403 start
+= self
->length
;
3406 if (end
> self
->length
)
3409 end
+= self
->length
;
3413 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3415 Py_DECREF(substring
);
3419 static char encode__doc__
[] =
3420 "S.encode([encoding[,errors]]) -> string\n\
3422 Return an encoded string version of S. Default encoding is the current\n\
3423 default string encoding. errors may be given to set a different error\n\
3424 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3425 a ValueError. Other possible values are 'ignore' and 'replace'.";
3428 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3430 char *encoding
= NULL
;
3431 char *errors
= NULL
;
3432 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3434 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3437 static char expandtabs__doc__
[] =
3438 "S.expandtabs([tabsize]) -> unicode\n\
3440 Return a copy of S where all tab characters are expanded using spaces.\n\
3441 If tabsize is not given, a tab size of 8 characters is assumed.";
3444 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3453 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3456 /* First pass: determine size of output string */
3458 e
= self
->str
+ self
->length
;
3459 for (p
= self
->str
; p
< e
; p
++)
3462 j
+= tabsize
- (j
% tabsize
);
3466 if (*p
== '\n' || *p
== '\r') {
3472 /* Second pass: create output string and fill it */
3473 u
= _PyUnicode_New(i
+ j
);
3480 for (p
= self
->str
; p
< e
; p
++)
3483 i
= tabsize
- (j
% tabsize
);
3492 if (*p
== '\n' || *p
== '\r')
3496 return (PyObject
*) u
;
3499 static char find__doc__
[] =
3500 "S.find(sub [,start [,end]]) -> int\n\
3502 Return the lowest index in S where substring sub is found,\n\
3503 such that sub is contained within s[start,end]. Optional\n\
3504 arguments start and end are interpreted as in slice notation.\n\
3506 Return -1 on failure.";
3509 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
3511 PyUnicodeObject
*substring
;
3516 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
3517 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3519 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3520 (PyObject
*)substring
);
3521 if (substring
== NULL
)
3524 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
3526 Py_DECREF(substring
);
3531 unicode_getitem(PyUnicodeObject
*self
, int index
)
3533 if (index
< 0 || index
>= self
->length
) {
3534 PyErr_SetString(PyExc_IndexError
, "string index out of range");
3538 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
3542 unicode_hash(PyUnicodeObject
*self
)
3544 /* Since Unicode objects compare equal to their ASCII string
3545 counterparts, they should use the individual character values
3546 as basis for their hash value. This is needed to assure that
3547 strings and Unicode objects behave in the same way as
3551 register Py_UNICODE
*p
;
3554 if (self
->hash
!= -1)
3556 len
= PyUnicode_GET_SIZE(self
);
3557 p
= PyUnicode_AS_UNICODE(self
);
3560 x
= (1000003*x
) ^ *p
++;
3561 x
^= PyUnicode_GET_SIZE(self
);
3568 static char index__doc__
[] =
3569 "S.index(sub [,start [,end]]) -> int\n\
3571 Like S.find() but raise ValueError when the substring is not found.";
3574 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
3577 PyUnicodeObject
*substring
;
3581 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
3582 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3585 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3586 (PyObject
*)substring
);
3587 if (substring
== NULL
)
3590 result
= findstring(self
, substring
, start
, end
, 1);
3592 Py_DECREF(substring
);
3594 PyErr_SetString(PyExc_ValueError
, "substring not found");
3597 return PyInt_FromLong(result
);
3600 static char islower__doc__
[] =
3601 "S.islower() -> int\n\
3603 Return 1 if all cased characters in S are lowercase and there is\n\
3604 at least one cased character in S, 0 otherwise.";
3607 unicode_islower(PyUnicodeObject
*self
, PyObject
*args
)
3609 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3610 register const Py_UNICODE
*e
;
3613 if (!PyArg_NoArgs(args
))
3616 /* Shortcut for single character strings */
3617 if (PyUnicode_GET_SIZE(self
) == 1)
3618 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
3620 /* Special case for empty strings */
3621 if (PyString_GET_SIZE(self
) == 0)
3622 return PyInt_FromLong(0);
3624 e
= p
+ PyUnicode_GET_SIZE(self
);
3626 for (; p
< e
; p
++) {
3627 register const Py_UNICODE ch
= *p
;
3629 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
3630 return PyInt_FromLong(0);
3631 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
3634 return PyInt_FromLong(cased
);
3637 static char isupper__doc__
[] =
3638 "S.isupper() -> int\n\
3640 Return 1 if all cased characters in S are uppercase and there is\n\
3641 at least one cased character in S, 0 otherwise.";
3644 unicode_isupper(PyUnicodeObject
*self
, PyObject
*args
)
3646 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3647 register const Py_UNICODE
*e
;
3650 if (!PyArg_NoArgs(args
))
3653 /* Shortcut for single character strings */
3654 if (PyUnicode_GET_SIZE(self
) == 1)
3655 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
3657 /* Special case for empty strings */
3658 if (PyString_GET_SIZE(self
) == 0)
3659 return PyInt_FromLong(0);
3661 e
= p
+ PyUnicode_GET_SIZE(self
);
3663 for (; p
< e
; p
++) {
3664 register const Py_UNICODE ch
= *p
;
3666 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
3667 return PyInt_FromLong(0);
3668 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
3671 return PyInt_FromLong(cased
);
3674 static char istitle__doc__
[] =
3675 "S.istitle() -> int\n\
3677 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3678 may only follow uncased characters and lowercase characters only cased\n\
3679 ones. Return 0 otherwise.";
3682 unicode_istitle(PyUnicodeObject
*self
, PyObject
*args
)
3684 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3685 register const Py_UNICODE
*e
;
3686 int cased
, previous_is_cased
;
3688 if (!PyArg_NoArgs(args
))
3691 /* Shortcut for single character strings */
3692 if (PyUnicode_GET_SIZE(self
) == 1)
3693 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
3694 (Py_UNICODE_ISUPPER(*p
) != 0));
3696 /* Special case for empty strings */
3697 if (PyString_GET_SIZE(self
) == 0)
3698 return PyInt_FromLong(0);
3700 e
= p
+ PyUnicode_GET_SIZE(self
);
3702 previous_is_cased
= 0;
3703 for (; p
< e
; p
++) {
3704 register const Py_UNICODE ch
= *p
;
3706 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
3707 if (previous_is_cased
)
3708 return PyInt_FromLong(0);
3709 previous_is_cased
= 1;
3712 else if (Py_UNICODE_ISLOWER(ch
)) {
3713 if (!previous_is_cased
)
3714 return PyInt_FromLong(0);
3715 previous_is_cased
= 1;
3719 previous_is_cased
= 0;
3721 return PyInt_FromLong(cased
);
3724 static char isspace__doc__
[] =
3725 "S.isspace() -> int\n\
3727 Return 1 if there are only whitespace characters in S,\n\
3731 unicode_isspace(PyUnicodeObject
*self
, PyObject
*args
)
3733 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3734 register const Py_UNICODE
*e
;
3736 if (!PyArg_NoArgs(args
))
3739 /* Shortcut for single character strings */
3740 if (PyUnicode_GET_SIZE(self
) == 1 &&
3741 Py_UNICODE_ISSPACE(*p
))
3742 return PyInt_FromLong(1);
3744 /* Special case for empty strings */
3745 if (PyString_GET_SIZE(self
) == 0)
3746 return PyInt_FromLong(0);
3748 e
= p
+ PyUnicode_GET_SIZE(self
);
3749 for (; p
< e
; p
++) {
3750 if (!Py_UNICODE_ISSPACE(*p
))
3751 return PyInt_FromLong(0);
3753 return PyInt_FromLong(1);
3756 static char isalpha__doc__
[] =
3757 "S.isalpha() -> int\n\
3759 Return 1 if all characters in S are alphabetic\n\
3760 and there is at least one character in S, 0 otherwise.";
3763 unicode_isalpha(PyUnicodeObject
*self
, PyObject
*args
)
3765 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3766 register const Py_UNICODE
*e
;
3768 if (!PyArg_NoArgs(args
))
3771 /* Shortcut for single character strings */
3772 if (PyUnicode_GET_SIZE(self
) == 1 &&
3773 Py_UNICODE_ISALPHA(*p
))
3774 return PyInt_FromLong(1);
3776 /* Special case for empty strings */
3777 if (PyString_GET_SIZE(self
) == 0)
3778 return PyInt_FromLong(0);
3780 e
= p
+ PyUnicode_GET_SIZE(self
);
3781 for (; p
< e
; p
++) {
3782 if (!Py_UNICODE_ISALPHA(*p
))
3783 return PyInt_FromLong(0);
3785 return PyInt_FromLong(1);
3788 static char isalnum__doc__
[] =
3789 "S.isalnum() -> int\n\
3791 Return 1 if all characters in S are alphanumeric\n\
3792 and there is at least one character in S, 0 otherwise.";
3795 unicode_isalnum(PyUnicodeObject
*self
, PyObject
*args
)
3797 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3798 register const Py_UNICODE
*e
;
3800 if (!PyArg_NoArgs(args
))
3803 /* Shortcut for single character strings */
3804 if (PyUnicode_GET_SIZE(self
) == 1 &&
3805 Py_UNICODE_ISALNUM(*p
))
3806 return PyInt_FromLong(1);
3808 /* Special case for empty strings */
3809 if (PyString_GET_SIZE(self
) == 0)
3810 return PyInt_FromLong(0);
3812 e
= p
+ PyUnicode_GET_SIZE(self
);
3813 for (; p
< e
; p
++) {
3814 if (!Py_UNICODE_ISALNUM(*p
))
3815 return PyInt_FromLong(0);
3817 return PyInt_FromLong(1);
3820 static char isdecimal__doc__
[] =
3821 "S.isdecimal() -> int\n\
3823 Return 1 if there are only decimal characters in S,\n\
3827 unicode_isdecimal(PyUnicodeObject
*self
, PyObject
*args
)
3829 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3830 register const Py_UNICODE
*e
;
3832 if (!PyArg_NoArgs(args
))
3835 /* Shortcut for single character strings */
3836 if (PyUnicode_GET_SIZE(self
) == 1 &&
3837 Py_UNICODE_ISDECIMAL(*p
))
3838 return PyInt_FromLong(1);
3840 /* Special case for empty strings */
3841 if (PyString_GET_SIZE(self
) == 0)
3842 return PyInt_FromLong(0);
3844 e
= p
+ PyUnicode_GET_SIZE(self
);
3845 for (; p
< e
; p
++) {
3846 if (!Py_UNICODE_ISDECIMAL(*p
))
3847 return PyInt_FromLong(0);
3849 return PyInt_FromLong(1);
3852 static char isdigit__doc__
[] =
3853 "S.isdigit() -> int\n\
3855 Return 1 if there are only digit characters in S,\n\
3859 unicode_isdigit(PyUnicodeObject
*self
, PyObject
*args
)
3861 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3862 register const Py_UNICODE
*e
;
3864 if (!PyArg_NoArgs(args
))
3867 /* Shortcut for single character strings */
3868 if (PyUnicode_GET_SIZE(self
) == 1 &&
3869 Py_UNICODE_ISDIGIT(*p
))
3870 return PyInt_FromLong(1);
3872 /* Special case for empty strings */
3873 if (PyString_GET_SIZE(self
) == 0)
3874 return PyInt_FromLong(0);
3876 e
= p
+ PyUnicode_GET_SIZE(self
);
3877 for (; p
< e
; p
++) {
3878 if (!Py_UNICODE_ISDIGIT(*p
))
3879 return PyInt_FromLong(0);
3881 return PyInt_FromLong(1);
3884 static char isnumeric__doc__
[] =
3885 "S.isnumeric() -> int\n\
3887 Return 1 if there are only numeric characters in S,\n\
3891 unicode_isnumeric(PyUnicodeObject
*self
, PyObject
*args
)
3893 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3894 register const Py_UNICODE
*e
;
3896 if (!PyArg_NoArgs(args
))
3899 /* Shortcut for single character strings */
3900 if (PyUnicode_GET_SIZE(self
) == 1 &&
3901 Py_UNICODE_ISNUMERIC(*p
))
3902 return PyInt_FromLong(1);
3904 /* Special case for empty strings */
3905 if (PyString_GET_SIZE(self
) == 0)
3906 return PyInt_FromLong(0);
3908 e
= p
+ PyUnicode_GET_SIZE(self
);
3909 for (; p
< e
; p
++) {
3910 if (!Py_UNICODE_ISNUMERIC(*p
))
3911 return PyInt_FromLong(0);
3913 return PyInt_FromLong(1);
3916 static char join__doc__
[] =
3917 "S.join(sequence) -> unicode\n\
3919 Return a string which is the concatenation of the strings in the\n\
3920 sequence. The separator between elements is S.";
3923 unicode_join(PyUnicodeObject
*self
, PyObject
*args
)
3926 if (!PyArg_ParseTuple(args
, "O:join", &data
))
3929 return PyUnicode_Join((PyObject
*)self
, data
);
3933 unicode_length(PyUnicodeObject
*self
)
3935 return self
->length
;
3938 static char ljust__doc__
[] =
3939 "S.ljust(width) -> unicode\n\
3941 Return S left justified in a Unicode string of length width. Padding is\n\
3942 done using spaces.";
3945 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
3948 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
3951 if (self
->length
>= width
) {
3953 return (PyObject
*) self
;
3956 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
3959 static char lower__doc__
[] =
3960 "S.lower() -> unicode\n\
3962 Return a copy of the string S converted to lowercase.";
3965 unicode_lower(PyUnicodeObject
*self
, PyObject
*args
)
3967 if (!PyArg_NoArgs(args
))
3969 return fixup(self
, fixlower
);
3972 static char lstrip__doc__
[] =
3973 "S.lstrip() -> unicode\n\
3975 Return a copy of the string S with leading whitespace removed.";
3978 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
3980 if (!PyArg_NoArgs(args
))
3982 return strip(self
, 1, 0);
3986 unicode_repeat(PyUnicodeObject
*str
, int len
)
3997 /* no repeat, return original string */
3999 return (PyObject
*) str
;
4002 /* ensure # of chars needed doesn't overflow int and # of bytes
4003 * needed doesn't overflow size_t
4005 nchars
= len
* str
->length
;
4006 if (len
&& nchars
/ len
!= str
->length
) {
4007 PyErr_SetString(PyExc_OverflowError
,
4008 "repeated string is too long");
4011 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
4012 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
4013 PyErr_SetString(PyExc_OverflowError
,
4014 "repeated string is too long");
4017 u
= _PyUnicode_New(nchars
);
4024 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4028 return (PyObject
*) u
;
4031 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4041 self
= PyUnicode_FromObject(obj
);
4044 str1
= PyUnicode_FromObject(subobj
);
4049 str2
= PyUnicode_FromObject(replobj
);
4055 result
= replace((PyUnicodeObject
*)self
,
4056 (PyUnicodeObject
*)str1
,
4057 (PyUnicodeObject
*)str2
,
4065 static char replace__doc__
[] =
4066 "S.replace (old, new[, maxsplit]) -> unicode\n\
4068 Return a copy of S with all occurrences of substring\n\
4069 old replaced by new. If the optional argument maxsplit is\n\
4070 given, only the first maxsplit occurrences are replaced.";
4073 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4075 PyUnicodeObject
*str1
;
4076 PyUnicodeObject
*str2
;
4080 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4082 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4085 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4089 result
= replace(self
, str1
, str2
, maxcount
);
4097 PyObject
*unicode_repr(PyObject
*unicode
)
4099 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4100 PyUnicode_GET_SIZE(unicode
),
4104 static char rfind__doc__
[] =
4105 "S.rfind(sub [,start [,end]]) -> int\n\
4107 Return the highest index in S where substring sub is found,\n\
4108 such that sub is contained within s[start,end]. Optional\n\
4109 arguments start and end are interpreted as in slice notation.\n\
4111 Return -1 on failure.";
4114 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4116 PyUnicodeObject
*substring
;
4121 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4122 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4124 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4125 (PyObject
*)substring
);
4126 if (substring
== NULL
)
4129 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4131 Py_DECREF(substring
);
4135 static char rindex__doc__
[] =
4136 "S.rindex(sub [,start [,end]]) -> int\n\
4138 Like S.rfind() but raise ValueError when the substring is not found.";
4141 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4144 PyUnicodeObject
*substring
;
4148 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4149 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4151 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4152 (PyObject
*)substring
);
4153 if (substring
== NULL
)
4156 result
= findstring(self
, substring
, start
, end
, -1);
4158 Py_DECREF(substring
);
4160 PyErr_SetString(PyExc_ValueError
, "substring not found");
4163 return PyInt_FromLong(result
);
4166 static char rjust__doc__
[] =
4167 "S.rjust(width) -> unicode\n\
4169 Return S right justified in a Unicode string of length width. Padding is\n\
4170 done using spaces.";
4173 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4176 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4179 if (self
->length
>= width
) {
4181 return (PyObject
*) self
;
4184 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4187 static char rstrip__doc__
[] =
4188 "S.rstrip() -> unicode\n\
4190 Return a copy of the string S with trailing whitespace removed.";
4193 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
4195 if (!PyArg_NoArgs(args
))
4197 return strip(self
, 0, 1);
4201 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4203 /* standard clamping */
4208 if (end
> self
->length
)
4210 if (start
== 0 && end
== self
->length
) {
4211 /* full slice, return original string */
4213 return (PyObject
*) self
;
4218 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4222 PyObject
*PyUnicode_Split(PyObject
*s
,
4228 s
= PyUnicode_FromObject(s
);
4232 sep
= PyUnicode_FromObject(sep
);
4239 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4246 static char split__doc__
[] =
4247 "S.split([sep [,maxsplit]]) -> list of strings\n\
4249 Return a list of the words in S, using sep as the\n\
4250 delimiter string. If maxsplit is given, at most maxsplit\n\
4251 splits are done. If sep is not specified, any whitespace string\n\
4255 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4257 PyObject
*substring
= Py_None
;
4260 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4263 if (substring
== Py_None
)
4264 return split(self
, NULL
, maxcount
);
4265 else if (PyUnicode_Check(substring
))
4266 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4268 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
4271 static char splitlines__doc__
[] =
4272 "S.splitlines([keepends]]) -> list of strings\n\
4274 Return a list of the lines in S, breaking at line boundaries.\n\
4275 Line breaks are not included in the resulting list unless keepends\n\
4276 is given and true.";
4279 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4283 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4286 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4290 PyObject
*unicode_str(PyUnicodeObject
*self
)
4292 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4295 static char strip__doc__
[] =
4296 "S.strip() -> unicode\n\
4298 Return a copy of S with leading and trailing whitespace removed.";
4301 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
4303 if (!PyArg_NoArgs(args
))
4305 return strip(self
, 1, 1);
4308 static char swapcase__doc__
[] =
4309 "S.swapcase() -> unicode\n\
4311 Return a copy of S with uppercase characters converted to lowercase\n\
4315 unicode_swapcase(PyUnicodeObject
*self
, PyObject
*args
)
4317 if (!PyArg_NoArgs(args
))
4319 return fixup(self
, fixswapcase
);
4322 static char translate__doc__
[] =
4323 "S.translate(table) -> unicode\n\
4325 Return a copy of the string S, where all characters have been mapped\n\
4326 through the given translation table, which must be a mapping of\n\
4327 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4328 are left untouched. Characters mapped to None are deleted.";
4331 unicode_translate(PyUnicodeObject
*self
, PyObject
*args
)
4335 if (!PyArg_ParseTuple(args
, "O:translate", &table
))
4337 return PyUnicode_TranslateCharmap(self
->str
,
4343 static char upper__doc__
[] =
4344 "S.upper() -> unicode\n\
4346 Return a copy of S converted to uppercase.";
4349 unicode_upper(PyUnicodeObject
*self
, PyObject
*args
)
4351 if (!PyArg_NoArgs(args
))
4353 return fixup(self
, fixupper
);
4357 static char zfill__doc__
[] =
4358 "S.zfill(width) -> unicode\n\
4360 Pad a numeric string x with zeros on the left, to fill a field\n\
4361 of the specified width. The string x is never truncated.";
4364 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4370 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4373 if (self
->length
>= width
) {
4375 return (PyObject
*) self
;
4378 fill
= width
- self
->length
;
4380 u
= pad(self
, fill
, 0, '0');
4382 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4383 /* move sign to beginning of string */
4384 u
->str
[0] = u
->str
[fill
];
4388 return (PyObject
*) u
;
4394 unicode_freelistsize(PyUnicodeObject
*self
, PyObject
*args
)
4396 if (!PyArg_NoArgs(args
))
4398 return PyInt_FromLong(unicode_freelist_size
);
4402 static char startswith__doc__
[] =
4403 "S.startswith(prefix[, start[, end]]) -> int\n\
4405 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4406 optional start, test S beginning at that position. With optional end, stop\n\
4407 comparing S at that position.";
4410 unicode_startswith(PyUnicodeObject
*self
,
4413 PyUnicodeObject
*substring
;
4418 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4419 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4421 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4422 (PyObject
*)substring
);
4423 if (substring
== NULL
)
4426 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4428 Py_DECREF(substring
);
4433 static char endswith__doc__
[] =
4434 "S.endswith(suffix[, start[, end]]) -> int\n\
4436 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4437 optional start, test S beginning at that position. With optional end, stop\n\
4438 comparing S at that position.";
4441 unicode_endswith(PyUnicodeObject
*self
,
4444 PyUnicodeObject
*substring
;
4449 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4450 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4452 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4453 (PyObject
*)substring
);
4454 if (substring
== NULL
)
4457 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4459 Py_DECREF(substring
);
4464 static PyMethodDef unicode_methods
[] = {
4466 /* Order is according to common usage: often used methods should
4467 appear first, since lookup is done sequentially. */
4469 {"encode", (PyCFunction
) unicode_encode
, 1, encode__doc__
},
4470 {"replace", (PyCFunction
) unicode_replace
, 1, replace__doc__
},
4471 {"split", (PyCFunction
) unicode_split
, 1, split__doc__
},
4472 {"join", (PyCFunction
) unicode_join
, 1, join__doc__
},
4473 {"capitalize", (PyCFunction
) unicode_capitalize
, 0, capitalize__doc__
},
4474 {"title", (PyCFunction
) unicode_title
, 0, title__doc__
},
4475 {"center", (PyCFunction
) unicode_center
, 1, center__doc__
},
4476 {"count", (PyCFunction
) unicode_count
, 1, count__doc__
},
4477 {"expandtabs", (PyCFunction
) unicode_expandtabs
, 1, expandtabs__doc__
},
4478 {"find", (PyCFunction
) unicode_find
, 1, find__doc__
},
4479 {"index", (PyCFunction
) unicode_index
, 1, index__doc__
},
4480 {"ljust", (PyCFunction
) unicode_ljust
, 1, ljust__doc__
},
4481 {"lower", (PyCFunction
) unicode_lower
, 0, lower__doc__
},
4482 {"lstrip", (PyCFunction
) unicode_lstrip
, 0, lstrip__doc__
},
4483 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4484 {"rfind", (PyCFunction
) unicode_rfind
, 1, rfind__doc__
},
4485 {"rindex", (PyCFunction
) unicode_rindex
, 1, rindex__doc__
},
4486 {"rjust", (PyCFunction
) unicode_rjust
, 1, rjust__doc__
},
4487 {"rstrip", (PyCFunction
) unicode_rstrip
, 0, rstrip__doc__
},
4488 {"splitlines", (PyCFunction
) unicode_splitlines
, 1, splitlines__doc__
},
4489 {"strip", (PyCFunction
) unicode_strip
, 0, strip__doc__
},
4490 {"swapcase", (PyCFunction
) unicode_swapcase
, 0, swapcase__doc__
},
4491 {"translate", (PyCFunction
) unicode_translate
, 1, translate__doc__
},
4492 {"upper", (PyCFunction
) unicode_upper
, 0, upper__doc__
},
4493 {"startswith", (PyCFunction
) unicode_startswith
, 1, startswith__doc__
},
4494 {"endswith", (PyCFunction
) unicode_endswith
, 1, endswith__doc__
},
4495 {"islower", (PyCFunction
) unicode_islower
, 0, islower__doc__
},
4496 {"isupper", (PyCFunction
) unicode_isupper
, 0, isupper__doc__
},
4497 {"istitle", (PyCFunction
) unicode_istitle
, 0, istitle__doc__
},
4498 {"isspace", (PyCFunction
) unicode_isspace
, 0, isspace__doc__
},
4499 {"isdecimal", (PyCFunction
) unicode_isdecimal
, 0, isdecimal__doc__
},
4500 {"isdigit", (PyCFunction
) unicode_isdigit
, 0, isdigit__doc__
},
4501 {"isnumeric", (PyCFunction
) unicode_isnumeric
, 0, isnumeric__doc__
},
4502 {"isalpha", (PyCFunction
) unicode_isalpha
, 0, isalpha__doc__
},
4503 {"isalnum", (PyCFunction
) unicode_isalnum
, 0, isalnum__doc__
},
4505 {"zfill", (PyCFunction
) unicode_zfill
, 1, zfill__doc__
},
4506 {"capwords", (PyCFunction
) unicode_capwords
, 0, capwords__doc__
},
4510 /* This one is just used for debugging the implementation. */
4511 {"freelistsize", (PyCFunction
) unicode_freelistsize
, 0},
4518 unicode_getattr(PyUnicodeObject
*self
, char *name
)
4520 return Py_FindMethod(unicode_methods
, (PyObject
*) self
, name
);
4523 static PySequenceMethods unicode_as_sequence
= {
4524 (inquiry
) unicode_length
, /* sq_length */
4525 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4526 (intargfunc
) unicode_repeat
, /* sq_repeat */
4527 (intargfunc
) unicode_getitem
, /* sq_item */
4528 (intintargfunc
) unicode_slice
, /* sq_slice */
4529 0, /* sq_ass_item */
4530 0, /* sq_ass_slice */
4531 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4535 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4540 PyErr_SetString(PyExc_SystemError
,
4541 "accessing non-existent unicode segment");
4544 *ptr
= (void *) self
->str
;
4545 return PyUnicode_GET_DATA_SIZE(self
);
4549 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4552 PyErr_SetString(PyExc_TypeError
,
4553 "cannot use unicode as modifyable buffer");
4558 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
4562 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
4567 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
4574 PyErr_SetString(PyExc_SystemError
,
4575 "accessing non-existent unicode segment");
4578 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
4581 *ptr
= (void *) PyString_AS_STRING(str
);
4582 return PyString_GET_SIZE(str
);
4585 /* Helpers for PyUnicode_Format() */
4588 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
4590 int argidx
= *p_argidx
;
4591 if (argidx
< arglen
) {
4596 return PyTuple_GetItem(args
, argidx
);
4598 PyErr_SetString(PyExc_TypeError
,
4599 "not enough arguments for format string");
4603 #define F_LJUST (1<<0)
4604 #define F_SIGN (1<<1)
4605 #define F_BLANK (1<<2)
4606 #define F_ALT (1<<3)
4607 #define F_ZERO (1<<4)
4610 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
4616 va_start(va
, format
);
4618 /* First, format the string as char array, then expand to Py_UNICODE
4620 charbuffer
= (char *)buffer
;
4621 len
= vsprintf(charbuffer
, format
, va
);
4622 for (i
= len
- 1; i
>= 0; i
--)
4623 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
4630 formatfloat(Py_UNICODE
*buf
,
4637 /* fmt = '%#.' + `prec` + `type`
4638 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4642 x
= PyFloat_AsDouble(v
);
4643 if (x
== -1.0 && PyErr_Occurred())
4647 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
4649 sprintf(fmt
, "%%%s.%d%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4650 /* worst case length calc to ensure no buffer overrun:
4652 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4653 for any double rep.)
4654 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4655 If prec=0 the effective precision is 1 (the leading digit is
4656 always given), therefore increase by one to 10+prec. */
4657 if (buflen
<= (size_t)10 + (size_t)prec
) {
4658 PyErr_SetString(PyExc_OverflowError
,
4659 "formatted float is too long (precision too long?)");
4662 return usprintf(buf
, fmt
, x
);
4666 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
4670 PyObject
*str
; /* temporary string object. */
4671 PyUnicodeObject
*result
;
4673 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
4676 result
= _PyUnicode_New(len
);
4677 for (i
= 0; i
< len
; i
++)
4678 result
->str
[i
] = buf
[i
];
4679 result
->str
[len
] = 0;
4681 return (PyObject
*)result
;
4685 formatint(Py_UNICODE
*buf
,
4692 /* fmt = '%#.' + `prec` + 'l' + `type`
4693 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4695 char fmt
[64]; /* plenty big enough! */
4698 x
= PyInt_AsLong(v
);
4699 if (x
== -1 && PyErr_Occurred())
4703 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4704 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4705 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
4706 PyErr_SetString(PyExc_OverflowError
,
4707 "formatted integer is too long (precision too long?)");
4710 sprintf(fmt
, "%%%s.%dl%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4711 return usprintf(buf
, fmt
, x
);
4715 formatchar(Py_UNICODE
*buf
,
4719 /* presume that the buffer is at least 2 characters long */
4720 if (PyUnicode_Check(v
)) {
4721 if (PyUnicode_GET_SIZE(v
) != 1)
4723 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
4726 else if (PyString_Check(v
)) {
4727 if (PyString_GET_SIZE(v
) != 1)
4729 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
4733 /* Integer input truncated to a character */
4735 x
= PyInt_AsLong(v
);
4736 if (x
== -1 && PyErr_Occurred())
4744 PyErr_SetString(PyExc_TypeError
,
4745 "%c requires int or char");
4749 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4751 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4752 chars are formatted. XXX This is a magic number. Each formatting
4753 routine does bounds checking to ensure no overflow, but a better
4754 solution may be to malloc a buffer of appropriate size for each
4755 format. For now, the current solution is sufficient.
4757 #define FORMATBUFLEN (size_t)120
4759 PyObject
*PyUnicode_Format(PyObject
*format
,
4762 Py_UNICODE
*fmt
, *res
;
4763 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
4765 PyUnicodeObject
*result
= NULL
;
4766 PyObject
*dict
= NULL
;
4769 if (format
== NULL
|| args
== NULL
) {
4770 PyErr_BadInternalCall();
4773 uformat
= PyUnicode_FromObject(format
);
4774 if (uformat
== NULL
)
4776 fmt
= PyUnicode_AS_UNICODE(uformat
);
4777 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
4779 reslen
= rescnt
= fmtcnt
+ 100;
4780 result
= _PyUnicode_New(reslen
);
4783 res
= PyUnicode_AS_UNICODE(result
);
4785 if (PyTuple_Check(args
)) {
4786 arglen
= PyTuple_Size(args
);
4793 if (args
->ob_type
->tp_as_mapping
)
4796 while (--fmtcnt
>= 0) {
4799 rescnt
= fmtcnt
+ 100;
4801 if (_PyUnicode_Resize(result
, reslen
) < 0)
4803 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
4809 /* Got a format specifier */
4814 Py_UNICODE c
= '\0';
4817 PyObject
*temp
= NULL
;
4821 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
4825 Py_UNICODE
*keystart
;
4831 PyErr_SetString(PyExc_TypeError
,
4832 "format requires a mapping");
4838 /* Skip over balanced parentheses */
4839 while (pcount
> 0 && --fmtcnt
>= 0) {
4842 else if (*fmt
== '(')
4846 keylen
= fmt
- keystart
- 1;
4847 if (fmtcnt
< 0 || pcount
> 0) {
4848 PyErr_SetString(PyExc_ValueError
,
4849 "incomplete format key");
4852 /* keys are converted to strings using UTF-8 and
4853 then looked up since Python uses strings to hold
4854 variables names etc. in its namespaces and we
4855 wouldn't want to break common idioms. */
4856 key
= PyUnicode_EncodeUTF8(keystart
,
4865 args
= PyObject_GetItem(dict
, key
);
4874 while (--fmtcnt
>= 0) {
4875 switch (c
= *fmt
++) {
4876 case '-': flags
|= F_LJUST
; continue;
4877 case '+': flags
|= F_SIGN
; continue;
4878 case ' ': flags
|= F_BLANK
; continue;
4879 case '#': flags
|= F_ALT
; continue;
4880 case '0': flags
|= F_ZERO
; continue;
4885 v
= getnextarg(args
, arglen
, &argidx
);
4888 if (!PyInt_Check(v
)) {
4889 PyErr_SetString(PyExc_TypeError
,
4893 width
= PyInt_AsLong(v
);
4901 else if (c
>= '0' && c
<= '9') {
4903 while (--fmtcnt
>= 0) {
4905 if (c
< '0' || c
> '9')
4907 if ((width
*10) / 10 != width
) {
4908 PyErr_SetString(PyExc_ValueError
,
4912 width
= width
*10 + (c
- '0');
4920 v
= getnextarg(args
, arglen
, &argidx
);
4923 if (!PyInt_Check(v
)) {
4924 PyErr_SetString(PyExc_TypeError
,
4928 prec
= PyInt_AsLong(v
);
4934 else if (c
>= '0' && c
<= '9') {
4936 while (--fmtcnt
>= 0) {
4937 c
= Py_CHARMASK(*fmt
++);
4938 if (c
< '0' || c
> '9')
4940 if ((prec
*10) / 10 != prec
) {
4941 PyErr_SetString(PyExc_ValueError
,
4945 prec
= prec
*10 + (c
- '0');
4950 if (c
== 'h' || c
== 'l' || c
== 'L') {
4957 PyErr_SetString(PyExc_ValueError
,
4958 "incomplete format");
4962 v
= getnextarg(args
, arglen
, &argidx
);
4972 /* presume that buffer length is at least 1 */
4979 if (PyUnicode_Check(v
) && c
== 's') {
4986 temp
= PyObject_Str(v
);
4988 temp
= PyObject_Repr(v
);
4991 if (!PyString_Check(temp
)) {
4992 /* XXX Note: this should never happen, since
4993 PyObject_Repr() and PyObject_Str() assure
4996 PyErr_SetString(PyExc_TypeError
,
4997 "%s argument has non-string str()");
5000 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
5001 PyString_GET_SIZE(temp
),
5009 pbuf
= PyUnicode_AS_UNICODE(temp
);
5010 len
= PyUnicode_GET_SIZE(temp
);
5011 if (prec
>= 0 && len
> prec
)
5023 if (PyLong_Check(v
) && PyLong_AsLong(v
) == -1
5024 && PyErr_Occurred()) {
5026 temp
= formatlong(v
, flags
, prec
, c
);
5029 pbuf
= PyUnicode_AS_UNICODE(temp
);
5030 len
= PyUnicode_GET_SIZE(temp
);
5031 /* unbounded ints can always produce
5032 a sign character! */
5037 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5041 /* only d conversion is signed */
5054 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5065 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5071 PyErr_Format(PyExc_ValueError
,
5072 "unsupported format character '%c' (0x%x)",
5077 if (*pbuf
== '-' || *pbuf
== '+') {
5081 else if (flags
& F_SIGN
)
5083 else if (flags
& F_BLANK
)
5090 if (rescnt
< width
+ (sign
!= 0)) {
5092 rescnt
= width
+ fmtcnt
+ 100;
5094 if (_PyUnicode_Resize(result
, reslen
) < 0)
5096 res
= PyUnicode_AS_UNICODE(result
)
5106 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5107 assert(pbuf
[0] == '0');
5108 assert(pbuf
[1] == c
);
5119 if (width
> len
&& !(flags
& F_LJUST
)) {
5123 } while (--width
> len
);
5128 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5129 assert(pbuf
[0] == '0');
5130 assert(pbuf
[1] == c
);
5135 memcpy(res
, pbuf
, len
* sizeof(Py_UNICODE
));
5138 while (--width
>= len
) {
5142 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5143 PyErr_SetString(PyExc_TypeError
,
5144 "not all arguments converted");
5150 if (argidx
< arglen
&& !dict
) {
5151 PyErr_SetString(PyExc_TypeError
,
5152 "not all arguments converted");
5160 if (_PyUnicode_Resize(result
, reslen
- rescnt
))
5162 return (PyObject
*)result
;
5173 static PyBufferProcs unicode_as_buffer
= {
5174 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5175 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5176 (getsegcountproc
) unicode_buffer_getsegcount
,
5177 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5180 PyTypeObject PyUnicode_Type
= {
5181 PyObject_HEAD_INIT(&PyType_Type
)
5183 "unicode", /* tp_name */
5184 sizeof(PyUnicodeObject
), /* tp_size */
5185 0, /* tp_itemsize */
5187 (destructor
)_PyUnicode_Free
, /* tp_dealloc */
5189 (getattrfunc
)unicode_getattr
, /* tp_getattr */
5191 (cmpfunc
) unicode_compare
, /* tp_compare */
5192 (reprfunc
) unicode_repr
, /* tp_repr */
5193 0, /* tp_as_number */
5194 &unicode_as_sequence
, /* tp_as_sequence */
5195 0, /* tp_as_mapping */
5196 (hashfunc
) unicode_hash
, /* tp_hash*/
5198 (reprfunc
) unicode_str
, /* tp_str */
5199 (getattrofunc
) NULL
, /* tp_getattro */
5200 (setattrofunc
) NULL
, /* tp_setattro */
5201 &unicode_as_buffer
, /* tp_as_buffer */
5202 Py_TPFLAGS_DEFAULT
, /* tp_flags */
5205 /* Initialize the Unicode implementation */
5207 void _PyUnicode_Init(void)
5209 /* Doublecheck the configuration... */
5210 if (sizeof(Py_UNICODE
) != 2)
5211 Py_FatalError("Unicode configuration error: "
5212 "sizeof(Py_UNICODE) != 2 bytes");
5214 /* Init the implementation */
5215 unicode_freelist
= NULL
;
5216 unicode_freelist_size
= 0;
5217 unicode_empty
= _PyUnicode_New(0);
5218 strcpy(unicode_default_encoding
, "ascii");
5221 /* Finalize the Unicode implementation */
5224 _PyUnicode_Fini(void)
5226 PyUnicodeObject
*u
= unicode_freelist
;
5229 PyUnicodeObject
*v
= u
;
5230 u
= *(PyUnicodeObject
**)u
;
5233 Py_XDECREF(v
->defenc
);
5236 unicode_freelist
= NULL
;
5237 unicode_freelist_size
= 0;
5238 Py_XDECREF(unicode_empty
);
5239 unicode_empty
= NULL
;