3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* The empty Unicode object */
87 static PyUnicodeObject
*unicode_empty
;
89 /* Free list for Unicode objects */
90 static PyUnicodeObject
*unicode_freelist
;
91 static int unicode_freelist_size
;
93 /* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
101 static char unicode_default_encoding
[100];
103 /* --- Unicode Object ----------------------------------------------------- */
106 int _PyUnicode_Resize(register PyUnicodeObject
*unicode
,
111 /* Shortcut if there's nothing much to do. */
112 if (unicode
->length
== length
)
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode
== unicode_empty
) {
117 PyErr_SetString(PyExc_SystemError
,
118 "can't resize empty unicode object");
122 /* We allocate one more Py_UNICODE element (not just one byte) to make
123 sure the string is U+0000 terminated -- XXX is this needed ? */
124 oldstr
= unicode
->str
;
125 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
127 unicode
->str
= oldstr
;
131 unicode
->str
[length
] = 0;
132 unicode
->length
= length
;
135 /* Reset the object caches */
136 if (unicode
->defenc
) {
137 Py_DECREF(unicode
->defenc
);
138 unicode
->defenc
= NULL
;
145 int PyUnicode_Resize(PyObject
**unicode
,
150 if (unicode
== NULL
) {
151 PyErr_BadInternalCall();
154 v
= (PyUnicodeObject
*)*unicode
;
155 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
156 PyErr_BadInternalCall();
159 return _PyUnicode_Resize(v
, length
);
162 /* We allocate one more Py_UNICODE element (not just one byte) to make
163 sure the string is U+0000 terminated -- XXX is this needed ?
165 XXX This allocator could further be enhanced by assuring that the
166 free list never reduces its size below 1.
171 PyUnicodeObject
*_PyUnicode_New(int length
)
173 register PyUnicodeObject
*unicode
;
175 /* Optimization for empty strings */
176 if (length
== 0 && unicode_empty
!= NULL
) {
177 Py_INCREF(unicode_empty
);
178 return unicode_empty
;
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist
) {
183 unicode
= unicode_freelist
;
184 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
185 unicode_freelist_size
--;
187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode
->length
< length
) &&
190 _PyUnicode_Resize(unicode
, length
)) {
191 PyMem_DEL(unicode
->str
);
196 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
198 PyObject_INIT(unicode
, &PyUnicode_Type
);
201 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
204 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
211 unicode
->str
[length
] = 0;
212 unicode
->length
= length
;
214 unicode
->defenc
= NULL
;
218 _Py_ForgetReference((PyObject
*)unicode
);
219 PyObject_DEL(unicode
);
224 void _PyUnicode_Free(register PyUnicodeObject
*unicode
)
226 if (unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
227 /* Keep-Alive optimization */
228 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
229 PyMem_DEL(unicode
->str
);
233 if (unicode
->defenc
) {
234 Py_DECREF(unicode
->defenc
);
235 unicode
->defenc
= NULL
;
237 /* Add to free list */
238 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
239 unicode_freelist
= unicode
;
240 unicode_freelist_size
++;
243 PyMem_DEL(unicode
->str
);
244 Py_XDECREF(unicode
->defenc
);
245 PyObject_DEL(unicode
);
249 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
252 PyUnicodeObject
*unicode
;
254 unicode
= _PyUnicode_New(size
);
258 /* Copy the Unicode data into the new object */
260 memcpy(unicode
->str
, u
, size
* sizeof(Py_UNICODE
));
262 return (PyObject
*)unicode
;
267 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
270 PyUnicodeObject
*unicode
;
273 PyErr_BadInternalCall();
277 unicode
= _PyUnicode_New(size
);
281 /* Copy the wchar_t data into the new object */
282 #ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
286 register Py_UNICODE
*u
;
288 u
= PyUnicode_AS_UNICODE(unicode
);
289 for (i
= size
; i
>= 0; i
--)
294 return (PyObject
*)unicode
;
297 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
301 if (unicode
== NULL
) {
302 PyErr_BadInternalCall();
305 if (size
> PyUnicode_GET_SIZE(unicode
))
306 size
= PyUnicode_GET_SIZE(unicode
);
307 #ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
311 register Py_UNICODE
*u
;
313 u
= PyUnicode_AS_UNICODE(unicode
);
314 for (i
= size
; i
>= 0; i
--)
324 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
326 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
329 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
330 const char *encoding
,
339 PyErr_BadInternalCall();
344 if (PyInstance_Check(obj
)) {
346 func
= PyObject_GetAttrString(obj
, "__str__");
348 PyErr_SetString(PyExc_TypeError
,
349 "coercing to Unicode: instance doesn't define __str__");
352 obj
= PyEval_CallObject(func
, NULL
);
358 if (PyUnicode_Check(obj
)) {
362 PyErr_SetString(PyExc_TypeError
,
363 "decoding Unicode is not supported");
368 else if (PyString_Check(obj
)) {
369 s
= PyString_AS_STRING(obj
);
370 len
= PyString_GET_SIZE(obj
);
372 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError
))
376 PyErr_Format(PyExc_TypeError
,
377 "coercing to Unicode: need string or buffer, "
379 obj
->ob_type
->tp_name
);
383 /* Convert to Unicode */
385 Py_INCREF(unicode_empty
);
386 v
= (PyObject
*)unicode_empty
;
389 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
404 PyObject
*PyUnicode_Decode(const char *s
,
406 const char *encoding
,
409 PyObject
*buffer
= NULL
, *unicode
;
411 if (encoding
== NULL
)
412 encoding
= PyUnicode_GetDefaultEncoding();
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding
, "utf-8") == 0)
416 return PyUnicode_DecodeUTF8(s
, size
, errors
);
417 else if (strcmp(encoding
, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s
, size
, errors
);
419 else if (strcmp(encoding
, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s
, size
, errors
);
422 /* Decode via the codec registry */
423 buffer
= PyBuffer_FromMemory((void *)s
, size
);
426 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
429 if (!PyUnicode_Check(unicode
)) {
430 PyErr_Format(PyExc_TypeError
,
431 "decoder did not return an unicode object (type=%.400s)",
432 unicode
->ob_type
->tp_name
);
444 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
446 const char *encoding
,
449 PyObject
*v
, *unicode
;
451 unicode
= PyUnicode_FromUnicode(s
, size
);
454 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
459 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
460 const char *encoding
,
465 if (!PyUnicode_Check(unicode
)) {
470 if (encoding
== NULL
)
471 encoding
= PyUnicode_GetDefaultEncoding();
473 /* Shortcuts for common default encodings */
474 if (errors
== NULL
) {
475 if (strcmp(encoding
, "utf-8") == 0)
476 return PyUnicode_AsUTF8String(unicode
);
477 else if (strcmp(encoding
, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode
);
479 else if (strcmp(encoding
, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode
);
483 /* Encode via the codec registry */
484 v
= PyCodec_Encode(unicode
, encoding
, errors
);
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v
)) {
489 PyErr_Format(PyExc_TypeError
,
490 "encoder did not return a string object (type=%.400s)",
491 v
->ob_type
->tp_name
);
501 /* Return a Python string holding the default encoded value of the
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
509 The refcount of the string is *not* incremented.
511 *** Exported for internal use by the interpreter only !!! ***
515 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
518 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
522 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
523 if (v
&& errors
== NULL
)
524 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
528 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
530 if (!PyUnicode_Check(unicode
)) {
534 return PyUnicode_AS_UNICODE(unicode
);
540 int PyUnicode_GetSize(PyObject
*unicode
)
542 if (!PyUnicode_Check(unicode
)) {
546 return PyUnicode_GET_SIZE(unicode
);
552 const char *PyUnicode_GetDefaultEncoding(void)
554 return unicode_default_encoding
;
557 int PyUnicode_SetDefaultEncoding(const char *encoding
)
561 /* Make sure the encoding is valid. As a side effect, this also
562 loads the encoding into the codec registry cache. */
563 v
= _PyCodec_Lookup(encoding
);
567 strncpy(unicode_default_encoding
,
569 sizeof(unicode_default_encoding
));
576 /* --- UTF-8 Codec -------------------------------------------------------- */
579 char utf8_code_length
[256] = {
580 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
581 an illegal prefix; see RFC 2279 for details. */
582 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
583 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
584 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
585 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
586 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
587 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
588 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
589 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
590 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
591 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
592 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
593 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
594 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
595 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
596 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
597 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
601 int utf8_decoding_error(const char **source
,
606 if ((errors
== NULL
) ||
607 (strcmp(errors
,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError
,
609 "UTF-8 decoding error: %.400s",
613 else if (strcmp(errors
,"ignore") == 0) {
617 else if (strcmp(errors
,"replace") == 0) {
619 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
624 PyErr_Format(PyExc_ValueError
,
625 "UTF-8 decoding error; unknown error handling code: %.400s",
631 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
637 PyUnicodeObject
*unicode
;
639 const char *errmsg
= "";
641 /* Note: size will always be longer than the resulting Unicode
643 unicode
= _PyUnicode_New(size
);
647 return (PyObject
*)unicode
;
649 /* Unpack UTF-8 encoded data */
654 Py_UCS4 ch
= (unsigned char)*s
;
657 *p
++ = (Py_UNICODE
)ch
;
662 n
= utf8_code_length
[ch
];
665 errmsg
= "unexpected end of data";
672 errmsg
= "unexpected code byte";
677 errmsg
= "internal error";
682 if ((s
[1] & 0xc0) != 0x80) {
683 errmsg
= "invalid data";
686 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
688 errmsg
= "illegal encoding";
692 *p
++ = (Py_UNICODE
)ch
;
696 if ((s
[1] & 0xc0) != 0x80 ||
697 (s
[2] & 0xc0) != 0x80) {
698 errmsg
= "invalid data";
701 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
702 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
703 errmsg
= "illegal encoding";
707 *p
++ = (Py_UNICODE
)ch
;
711 if ((s
[1] & 0xc0) != 0x80 ||
712 (s
[2] & 0xc0) != 0x80 ||
713 (s
[3] & 0xc0) != 0x80) {
714 errmsg
= "invalid data";
717 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
718 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
719 /* validate and convert to UTF-16 */
720 if ((ch
< 0x10000) || /* minimum value allowed for 4
722 (ch
> 0x10ffff)) { /* maximum value allowed for
724 errmsg
= "illegal encoding";
727 /* compute and append the two surrogates: */
729 /* translate from 10000..10FFFF to 0..FFFF */
732 /* high surrogate = top 10 bits added to D800 */
733 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& ~0xFC00));
740 /* Other sizes are only needed for UCS-4 */
741 errmsg
= "unsupported Unicode code range";
749 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
754 if (_PyUnicode_Resize(unicode
, p
- unicode
->str
))
757 return (PyObject
*)unicode
;
764 /* Not used anymore, now that the encoder supports UTF-16
768 int utf8_encoding_error(const Py_UNICODE
**source
,
773 if ((errors
== NULL
) ||
774 (strcmp(errors
,"strict") == 0)) {
775 PyErr_Format(PyExc_UnicodeError
,
776 "UTF-8 encoding error: %.400s",
780 else if (strcmp(errors
,"ignore") == 0) {
783 else if (strcmp(errors
,"replace") == 0) {
789 PyErr_Format(PyExc_ValueError
,
790 "UTF-8 encoding error; "
791 "unknown error handling code: %.400s",
798 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
806 unsigned int cbAllocated
= 3 * size
;
807 unsigned int cbWritten
= 0;
810 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
816 p
= q
= PyString_AS_STRING(v
);
823 else if (ch
< 0x0800) {
824 *p
++ = 0xc0 | (ch
>> 6);
825 *p
++ = 0x80 | (ch
& 0x3f);
829 /* Check for high surrogate */
830 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
833 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
835 if (cbWritten
>= (cbAllocated
- 4)) {
836 /* Provide enough room for some more
839 if (_PyString_Resize(&v
, cbAllocated
))
843 /* combine the two values */
844 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
846 *p
++ = (char)((ch
>> 18) | 0xf0);
847 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
854 *p
++ = (char)(0xe0 | (ch
>> 12));
857 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
858 *p
++ = (char)(0x80 | (ch
& 0x3f));
862 if (_PyString_Resize(&v
, p
- q
))
871 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
873 if (!PyUnicode_Check(unicode
)) {
877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
878 PyUnicode_GET_SIZE(unicode
),
882 /* --- UTF-16 Codec ------------------------------------------------------- */
885 int utf16_decoding_error(const Py_UNICODE
**source
,
890 if ((errors
== NULL
) ||
891 (strcmp(errors
,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError
,
893 "UTF-16 decoding error: %.400s",
897 else if (strcmp(errors
,"ignore") == 0) {
900 else if (strcmp(errors
,"replace") == 0) {
902 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
908 PyErr_Format(PyExc_ValueError
,
909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
916 PyObject
*PyUnicode_DecodeUTF16(const char *s
,
921 PyUnicodeObject
*unicode
;
923 const Py_UNICODE
*q
, *e
;
925 const char *errmsg
= "";
927 /* size should be an even number */
928 if (size
% sizeof(Py_UNICODE
) != 0) {
929 if (utf16_decoding_error(NULL
, NULL
, errors
, "truncated data"))
931 /* The remaining input chars are ignored if we fall through
935 /* Note: size will always be longer than the resulting Unicode
937 unicode
= _PyUnicode_New(size
);
941 return (PyObject
*)unicode
;
943 /* Unpack UTF-16 encoded data */
946 e
= q
+ (size
/ sizeof(Py_UNICODE
));
952 register Py_UNICODE ch
= *q
++;
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
958 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
962 } else if (ch
== 0xFFFE) {
967 ch
= (ch
>> 8) | (ch
<< 8);
972 } else if (ch
== 0xFFFE) {
977 ch
= (ch
>> 8) | (ch
<< 8);
979 if (ch
< 0xD800 || ch
> 0xDFFF) {
984 /* UTF-16 code pair: */
986 errmsg
= "unexpected end of data";
989 if (0xDC00 <= *q
&& *q
<= 0xDFFF) {
991 if (0xD800 <= *q
&& *q
<= 0xDBFF) {
992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
996 errmsg
= "code pairs are not supported";
1002 errmsg
= "illegal encoding";
1003 /* Fall through to report the error */
1006 if (utf16_decoding_error(&q
, &p
, errors
, errmsg
))
1014 if (_PyUnicode_Resize(unicode
, p
- unicode
->str
))
1017 return (PyObject
*)unicode
;
1026 PyObject
*PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1035 /* We don't create UTF-16 pairs... */
1036 v
= PyString_FromStringAndSize(NULL
,
1037 sizeof(Py_UNICODE
) * (size
+ (byteorder
== 0)));
1041 q
= PyString_AS_STRING(v
);
1042 p
= (Py_UNICODE
*)q
;
1047 if (byteorder
== 0 ||
1048 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1054 memcpy(p
, s
, size
* sizeof(Py_UNICODE
));
1056 while (size
-- > 0) {
1057 Py_UNICODE ch
= *s
++;
1058 *p
++ = (ch
>> 8) | (ch
<< 8);
1063 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1065 if (!PyUnicode_Check(unicode
)) {
1066 PyErr_BadArgument();
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1070 PyUnicode_GET_SIZE(unicode
),
1075 /* --- Unicode Escape Codec ----------------------------------------------- */
1078 int unicodeescape_decoding_error(const char **source
,
1081 const char *details
)
1083 if ((errors
== NULL
) ||
1084 (strcmp(errors
,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError
,
1086 "Unicode-Escape decoding error: %.400s",
1090 else if (strcmp(errors
,"ignore") == 0) {
1093 else if (strcmp(errors
,"replace") == 0) {
1094 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1098 PyErr_Format(PyExc_ValueError
,
1099 "Unicode-Escape decoding error; "
1100 "unknown error handling code: %.400s",
1106 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1108 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1113 Py_UNICODE
*p
, *buf
;
1116 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1118 /* Escaped strings will never be shorter than the resulting
1119 Unicode string, so we start with size here and then reduce the
1120 length after conversion to the true value. */
1121 v
= _PyUnicode_New(size
);
1125 return (PyObject
*)v
;
1127 p
= buf
= PyUnicode_AS_UNICODE(v
);
1135 /* Non-escape characters are interpreted as Unicode ordinals */
1137 *p
++ = (unsigned char) *s
++;
1147 case '\\': *p
++ = '\\'; break;
1148 case '\'': *p
++ = '\''; break;
1149 case '\"': *p
++ = '\"'; break;
1150 case 'b': *p
++ = '\b'; break;
1151 case 'f': *p
++ = '\014'; break; /* FF */
1152 case 't': *p
++ = '\t'; break;
1153 case 'n': *p
++ = '\n'; break;
1154 case 'r': *p
++ = '\r'; break;
1155 case 'v': *p
++ = '\013'; break; /* VT */
1156 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1158 /* \OOO (octal) escapes */
1159 case '0': case '1': case '2': case '3':
1160 case '4': case '5': case '6': case '7':
1162 if ('0' <= *s
&& *s
<= '7') {
1163 x
= (x
<<3) + *s
++ - '0';
1164 if ('0' <= *s
&& *s
<= '7')
1165 x
= (x
<<3) + *s
++ - '0';
1174 message
= "truncated \\xXX escape";
1180 message
= "truncated \\uXXXX escape";
1186 message
= "truncated \\UXXXXXXXX escape";
1189 for (i
= 0; i
< digits
; i
++) {
1190 c
= (unsigned char) s
[i
];
1192 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1198 chr
= (chr
<<4) & ~0xF;
1199 if (c
>= '0' && c
<= '9')
1201 else if (c
>= 'a' && c
<= 'f')
1202 chr
+= 10 + c
- 'a';
1204 chr
+= 10 + c
- 'A';
1208 /* when we get here, chr is a 32-bit unicode character */
1210 /* UCS-2 character */
1211 *p
++ = (Py_UNICODE
) chr
;
1212 else if (chr
<= 0x10ffff) {
1213 /* UCS-4 character. store as two surrogate characters */
1215 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1216 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& ~0xFC00);
1218 if (unicodeescape_decoding_error(
1220 "illegal Unicode character")
1223 *p
++ = x
; /* store replacement character */
1229 message
= "malformed \\N character escape";
1230 if (ucnhash_CAPI
== NULL
) {
1231 /* load the unicode data module */
1233 m
= PyImport_ImportModule("unicodedata");
1236 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1240 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1242 if (ucnhash_CAPI
== NULL
)
1246 const char *start
= s
+1;
1247 /* look for the closing brace */
1248 while (*s
!= '}' && s
< end
)
1250 if (s
> start
&& s
< end
&& *s
== '}') {
1251 /* found a name. look it up in the unicode database */
1252 message
= "unknown Unicode character name";
1254 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1258 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1265 *p
++ = (unsigned char)s
[-1];
1269 if (_PyUnicode_Resize(v
, (int)(p
- buf
)))
1271 return (PyObject
*)v
;
1276 "\\N escapes not supported (can't load unicodedata module)"
1285 /* Return a Unicode-Escape string version of the Unicode object.
1287 If quotes is true, the string is enclosed in u"" or u'' quotes as
1292 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1297 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1305 static const char *hexdigit
= "0123456789abcdef";
1307 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1311 p
= q
= PyString_AS_STRING(repr
);
1315 *p
++ = (findchar(s
, size
, '\'') &&
1316 !findchar(s
, size
, '"')) ? '"' : '\'';
1318 while (size
-- > 0) {
1319 Py_UNICODE ch
= *s
++;
1321 if (quotes
&& (ch
== q
[1] || ch
== '\\')) {
1325 /* Map characters with ordinals >= 256 to '\uxxxx' */
1326 else if (ch
>= 256) {
1329 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1330 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1331 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1332 *p
++ = hexdigit
[ch
& 15];
1334 /* Map special whitespace to '\t', '\n', '\r' */
1335 else if (ch
== '\t') {
1339 else if (ch
== '\n') {
1343 else if (ch
== '\r') {
1347 /* Map non-printable US ASCII and ordinals 128-255 to '\xhh' */
1348 else if (ch
< ' ' || ch
>= 128) {
1351 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1352 *p
++ = hexdigit
[ch
& 15];
1354 /* Copy everything else as-is */
1362 if (_PyString_Resize(&repr
, p
- q
))
1372 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1375 return unicodeescape_string(s
, size
, 0);
1378 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1380 if (!PyUnicode_Check(unicode
)) {
1381 PyErr_BadArgument();
1384 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1385 PyUnicode_GET_SIZE(unicode
));
1388 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1390 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1395 Py_UNICODE
*p
, *buf
;
1399 /* Escaped strings will never be shorter than the resulting
1400 Unicode string, so we start with size here and then reduce the
1401 length after conversion to the true value. */
1402 v
= _PyUnicode_New(size
);
1406 return (PyObject
*)v
;
1407 p
= buf
= PyUnicode_AS_UNICODE(v
);
1414 /* Non-escape characters are interpreted as Unicode ordinals */
1416 *p
++ = (unsigned char)*s
++;
1420 /* \u-escapes are only interpreted iff the number of leading
1421 backslashes is odd */
1426 *p
++ = (unsigned char)*s
++;
1428 if (((s
- bs
) & 1) == 0 ||
1436 /* \uXXXX with 4 hex digits */
1437 for (x
= 0, i
= 0; i
< 4; i
++) {
1438 c
= (unsigned char)s
[i
];
1440 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1441 "truncated \\uXXXX"))
1447 if (c
>= '0' && c
<= '9')
1449 else if (c
>= 'a' && c
<= 'f')
1457 if (_PyUnicode_Resize(v
, (int)(p
- buf
)))
1459 return (PyObject
*)v
;
1466 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1473 static const char *hexdigit
= "0123456789abcdef";
1475 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1481 p
= q
= PyString_AS_STRING(repr
);
1482 while (size
-- > 0) {
1483 Py_UNICODE ch
= *s
++;
1484 /* Map 16-bit characters to '\uxxxx' */
1488 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1489 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1490 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1491 *p
++ = hexdigit
[ch
& 15];
1493 /* Copy everything else as-is */
1498 if (_PyString_Resize(&repr
, p
- q
))
1508 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
1510 if (!PyUnicode_Check(unicode
)) {
1511 PyErr_BadArgument();
1514 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1515 PyUnicode_GET_SIZE(unicode
));
1518 /* --- Latin-1 Codec ------------------------------------------------------ */
1520 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
1527 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1528 v
= _PyUnicode_New(size
);
1532 return (PyObject
*)v
;
1533 p
= PyUnicode_AS_UNICODE(v
);
1535 *p
++ = (unsigned char)*s
++;
1536 return (PyObject
*)v
;
1544 int latin1_encoding_error(const Py_UNICODE
**source
,
1547 const char *details
)
1549 if ((errors
== NULL
) ||
1550 (strcmp(errors
,"strict") == 0)) {
1551 PyErr_Format(PyExc_UnicodeError
,
1552 "Latin-1 encoding error: %.400s",
1556 else if (strcmp(errors
,"ignore") == 0) {
1559 else if (strcmp(errors
,"replace") == 0) {
1565 PyErr_Format(PyExc_ValueError
,
1566 "Latin-1 encoding error; "
1567 "unknown error handling code: %.400s",
1573 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
1580 repr
= PyString_FromStringAndSize(NULL
, size
);
1586 s
= PyString_AS_STRING(repr
);
1588 while (size
-- > 0) {
1589 Py_UNICODE ch
= *p
++;
1591 if (latin1_encoding_error(&p
, &s
, errors
,
1592 "ordinal not in range(256)"))
1598 /* Resize if error handling skipped some characters */
1599 if (s
- start
< PyString_GET_SIZE(repr
))
1600 if (_PyString_Resize(&repr
, s
- start
))
1609 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
1611 if (!PyUnicode_Check(unicode
)) {
1612 PyErr_BadArgument();
1615 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
1616 PyUnicode_GET_SIZE(unicode
),
1620 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1623 int ascii_decoding_error(const char **source
,
1626 const char *details
)
1628 if ((errors
== NULL
) ||
1629 (strcmp(errors
,"strict") == 0)) {
1630 PyErr_Format(PyExc_UnicodeError
,
1631 "ASCII decoding error: %.400s",
1635 else if (strcmp(errors
,"ignore") == 0) {
1638 else if (strcmp(errors
,"replace") == 0) {
1639 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1644 PyErr_Format(PyExc_ValueError
,
1645 "ASCII decoding error; "
1646 "unknown error handling code: %.400s",
1652 PyObject
*PyUnicode_DecodeASCII(const char *s
,
1659 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1660 v
= _PyUnicode_New(size
);
1664 return (PyObject
*)v
;
1665 p
= PyUnicode_AS_UNICODE(v
);
1666 while (size
-- > 0) {
1667 register unsigned char c
;
1669 c
= (unsigned char)*s
++;
1672 else if (ascii_decoding_error(&s
, &p
, errors
,
1673 "ordinal not in range(128)"))
1676 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
1677 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1679 return (PyObject
*)v
;
1687 int ascii_encoding_error(const Py_UNICODE
**source
,
1690 const char *details
)
1692 if ((errors
== NULL
) ||
1693 (strcmp(errors
,"strict") == 0)) {
1694 PyErr_Format(PyExc_UnicodeError
,
1695 "ASCII encoding error: %.400s",
1699 else if (strcmp(errors
,"ignore") == 0) {
1702 else if (strcmp(errors
,"replace") == 0) {
1708 PyErr_Format(PyExc_ValueError
,
1709 "ASCII encoding error; "
1710 "unknown error handling code: %.400s",
1716 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
1723 repr
= PyString_FromStringAndSize(NULL
, size
);
1729 s
= PyString_AS_STRING(repr
);
1731 while (size
-- > 0) {
1732 Py_UNICODE ch
= *p
++;
1734 if (ascii_encoding_error(&p
, &s
, errors
,
1735 "ordinal not in range(128)"))
1741 /* Resize if error handling skipped some characters */
1742 if (s
- start
< PyString_GET_SIZE(repr
))
1743 if (_PyString_Resize(&repr
, s
- start
))
1752 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
1754 if (!PyUnicode_Check(unicode
)) {
1755 PyErr_BadArgument();
1758 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
1759 PyUnicode_GET_SIZE(unicode
),
1765 /* --- MBCS codecs for Windows -------------------------------------------- */
1767 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
1774 /* First get the size of the result */
1775 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
1776 if (size
> 0 && usize
==0)
1777 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1779 v
= _PyUnicode_New(usize
);
1783 return (PyObject
*)v
;
1784 p
= PyUnicode_AS_UNICODE(v
);
1785 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
1787 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1790 return (PyObject
*)v
;
1793 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
1801 /* If there are no characters, bail now! */
1803 return PyString_FromString("");
1805 /* First get the size of the result */
1806 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
1808 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1810 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
1816 /* Do the conversion */
1817 s
= PyString_AS_STRING(repr
);
1818 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
1820 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1825 #endif /* MS_WIN32 */
1827 /* --- Character Mapping Codec -------------------------------------------- */
1830 int charmap_decoding_error(const char **source
,
1833 const char *details
)
1835 if ((errors
== NULL
) ||
1836 (strcmp(errors
,"strict") == 0)) {
1837 PyErr_Format(PyExc_UnicodeError
,
1838 "charmap decoding error: %.400s",
1842 else if (strcmp(errors
,"ignore") == 0) {
1845 else if (strcmp(errors
,"replace") == 0) {
1846 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1851 PyErr_Format(PyExc_ValueError
,
1852 "charmap decoding error; "
1853 "unknown error handling code: %.400s",
1859 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
1868 /* Default to Latin-1 */
1869 if (mapping
== NULL
)
1870 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1872 v
= _PyUnicode_New(size
);
1876 return (PyObject
*)v
;
1877 p
= PyUnicode_AS_UNICODE(v
);
1878 while (size
-- > 0) {
1879 unsigned char ch
= *s
++;
1882 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1883 w
= PyInt_FromLong((long)ch
);
1886 x
= PyObject_GetItem(mapping
, w
);
1889 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
1890 /* No mapping found means: mapping is undefined. */
1899 if (PyInt_Check(x
)) {
1900 long value
= PyInt_AS_LONG(x
);
1901 if (value
< 0 || value
> 65535) {
1902 PyErr_SetString(PyExc_TypeError
,
1903 "character mapping must be in range(65536)");
1907 *p
++ = (Py_UNICODE
)value
;
1909 else if (x
== Py_None
) {
1910 /* undefined mapping */
1911 if (charmap_decoding_error(&s
, &p
, errors
,
1912 "character maps to <undefined>")) {
1917 else if (PyUnicode_Check(x
)) {
1918 int targetsize
= PyUnicode_GET_SIZE(x
);
1920 if (targetsize
== 1)
1922 *p
++ = *PyUnicode_AS_UNICODE(x
);
1924 else if (targetsize
> 1) {
1926 if (targetsize
> extrachars
) {
1928 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
1929 int needed
= (targetsize
- extrachars
) + \
1931 extrachars
+= needed
;
1932 if (_PyUnicode_Resize(v
, PyUnicode_GET_SIZE(v
) + needed
)) {
1936 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
1939 PyUnicode_AS_UNICODE(x
),
1942 extrachars
-= targetsize
;
1944 /* 1-0 mapping: skip the character */
1947 /* wrong return value */
1948 PyErr_SetString(PyExc_TypeError
,
1949 "character mapping must return integer, None or unicode");
1955 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
1956 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1958 return (PyObject
*)v
;
1966 int charmap_encoding_error(const Py_UNICODE
**source
,
1969 const char *details
)
1971 if ((errors
== NULL
) ||
1972 (strcmp(errors
,"strict") == 0)) {
1973 PyErr_Format(PyExc_UnicodeError
,
1974 "charmap encoding error: %.400s",
1978 else if (strcmp(errors
,"ignore") == 0) {
1981 else if (strcmp(errors
,"replace") == 0) {
1987 PyErr_Format(PyExc_ValueError
,
1988 "charmap encoding error; "
1989 "unknown error handling code: %.400s",
1995 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2004 /* Default to Latin-1 */
2005 if (mapping
== NULL
)
2006 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2008 v
= PyString_FromStringAndSize(NULL
, size
);
2013 s
= PyString_AS_STRING(v
);
2014 while (size
-- > 0) {
2015 Py_UNICODE ch
= *p
++;
2018 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2019 w
= PyInt_FromLong((long)ch
);
2022 x
= PyObject_GetItem(mapping
, w
);
2025 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2026 /* No mapping found means: mapping is undefined. */
2035 if (PyInt_Check(x
)) {
2036 long value
= PyInt_AS_LONG(x
);
2037 if (value
< 0 || value
> 255) {
2038 PyErr_SetString(PyExc_TypeError
,
2039 "character mapping must be in range(256)");
2045 else if (x
== Py_None
) {
2046 /* undefined mapping */
2047 if (charmap_encoding_error(&p
, &s
, errors
,
2048 "character maps to <undefined>")) {
2053 else if (PyString_Check(x
)) {
2054 int targetsize
= PyString_GET_SIZE(x
);
2056 if (targetsize
== 1)
2058 *s
++ = *PyString_AS_STRING(x
);
2060 else if (targetsize
> 1) {
2062 if (targetsize
> extrachars
) {
2064 int oldpos
= (int)(s
- PyString_AS_STRING(v
));
2065 int needed
= (targetsize
- extrachars
) + \
2067 extrachars
+= needed
;
2068 if (_PyString_Resize(&v
, PyString_GET_SIZE(v
) + needed
)) {
2072 s
= PyString_AS_STRING(v
) + oldpos
;
2075 PyString_AS_STRING(x
),
2078 extrachars
-= targetsize
;
2080 /* 1-0 mapping: skip the character */
2083 /* wrong return value */
2084 PyErr_SetString(PyExc_TypeError
,
2085 "character mapping must return integer, None or unicode");
2091 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2092 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2101 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2104 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2105 PyErr_BadArgument();
2108 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2109 PyUnicode_GET_SIZE(unicode
),
2115 int translate_error(const Py_UNICODE
**source
,
2118 const char *details
)
2120 if ((errors
== NULL
) ||
2121 (strcmp(errors
,"strict") == 0)) {
2122 PyErr_Format(PyExc_UnicodeError
,
2123 "translate error: %.400s",
2127 else if (strcmp(errors
,"ignore") == 0) {
2130 else if (strcmp(errors
,"replace") == 0) {
2136 PyErr_Format(PyExc_ValueError
,
2138 "unknown error handling code: %.400s",
2144 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2152 if (mapping
== NULL
) {
2153 PyErr_BadArgument();
2157 /* Output will never be longer than input */
2158 v
= _PyUnicode_New(size
);
2163 p
= PyUnicode_AS_UNICODE(v
);
2164 while (size
-- > 0) {
2165 Py_UNICODE ch
= *s
++;
2169 w
= PyInt_FromLong(ch
);
2172 x
= PyObject_GetItem(mapping
, w
);
2175 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2176 /* No mapping found: default to 1-1 mapping */
2186 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2187 else if (x
== Py_None
) {
2188 /* undefined mapping */
2189 if (translate_error(&s
, &p
, errors
,
2190 "character maps to <undefined>")) {
2195 else if (PyUnicode_Check(x
)) {
2196 if (PyUnicode_GET_SIZE(x
) != 1) {
2198 PyErr_SetString(PyExc_NotImplementedError
,
2199 "1-n mappings are currently not implemented");
2203 *p
++ = *PyUnicode_AS_UNICODE(x
);
2206 /* wrong return value */
2207 PyErr_SetString(PyExc_TypeError
,
2208 "translate mapping must return integer, None or unicode");
2214 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2215 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2219 return (PyObject
*)v
;
2226 PyObject
*PyUnicode_Translate(PyObject
*str
,
2232 str
= PyUnicode_FromObject(str
);
2235 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2236 PyUnicode_GET_SIZE(str
),
2247 /* --- Decimal Encoder ---------------------------------------------------- */
2249 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2254 Py_UNICODE
*p
, *end
;
2256 if (output
== NULL
) {
2257 PyErr_BadArgument();
2264 register Py_UNICODE ch
= *p
++;
2267 if (Py_UNICODE_ISSPACE(ch
)) {
2271 decimal
= Py_UNICODE_TODECIMAL(ch
);
2273 *output
++ = '0' + decimal
;
2276 if (0 < ch
&& ch
< 256) {
2277 *output
++ = (char)ch
;
2280 /* All other characters are considered invalid */
2281 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2282 PyErr_SetString(PyExc_ValueError
,
2283 "invalid decimal Unicode string");
2286 else if (strcmp(errors
, "ignore") == 0)
2288 else if (strcmp(errors
, "replace") == 0) {
2293 /* 0-terminate the output string */
2301 /* --- Helpers ------------------------------------------------------------ */
2304 int count(PyUnicodeObject
*self
,
2307 PyUnicodeObject
*substring
)
2312 start
+= self
->length
;
2315 if (end
> self
->length
)
2318 end
+= self
->length
;
2322 if (substring
->length
== 0)
2323 return (end
- start
+ 1);
2325 end
-= substring
->length
;
2327 while (start
<= end
)
2328 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2330 start
+= substring
->length
;
2337 int PyUnicode_Count(PyObject
*str
,
2344 str
= PyUnicode_FromObject(str
);
2347 substr
= PyUnicode_FromObject(substr
);
2348 if (substr
== NULL
) {
2353 result
= count((PyUnicodeObject
*)str
,
2355 (PyUnicodeObject
*)substr
);
2363 int findstring(PyUnicodeObject
*self
,
2364 PyUnicodeObject
*substring
,
2370 start
+= self
->length
;
2374 if (substring
->length
== 0)
2377 if (end
> self
->length
)
2380 end
+= self
->length
;
2384 end
-= substring
->length
;
2386 if (direction
< 0) {
2387 for (; end
>= start
; end
--)
2388 if (Py_UNICODE_MATCH(self
, end
, substring
))
2391 for (; start
<= end
; start
++)
2392 if (Py_UNICODE_MATCH(self
, start
, substring
))
2399 int PyUnicode_Find(PyObject
*str
,
2407 str
= PyUnicode_FromObject(str
);
2410 substr
= PyUnicode_FromObject(substr
);
2411 if (substr
== NULL
) {
2416 result
= findstring((PyUnicodeObject
*)str
,
2417 (PyUnicodeObject
*)substr
,
2418 start
, end
, direction
);
2425 int tailmatch(PyUnicodeObject
*self
,
2426 PyUnicodeObject
*substring
,
2432 start
+= self
->length
;
2436 if (substring
->length
== 0)
2439 if (end
> self
->length
)
2442 end
+= self
->length
;
2446 end
-= substring
->length
;
2450 if (direction
> 0) {
2451 if (Py_UNICODE_MATCH(self
, end
, substring
))
2454 if (Py_UNICODE_MATCH(self
, start
, substring
))
2461 int PyUnicode_Tailmatch(PyObject
*str
,
2469 str
= PyUnicode_FromObject(str
);
2472 substr
= PyUnicode_FromObject(substr
);
2473 if (substr
== NULL
) {
2478 result
= tailmatch((PyUnicodeObject
*)str
,
2479 (PyUnicodeObject
*)substr
,
2480 start
, end
, direction
);
2487 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2491 /* like wcschr, but doesn't stop at NULL characters */
2493 while (size
-- > 0) {
2502 /* Apply fixfct filter to the Unicode object self and return a
2503 reference to the modified object */
2506 PyObject
*fixup(PyUnicodeObject
*self
,
2507 int (*fixfct
)(PyUnicodeObject
*s
))
2512 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(self
->str
,
2517 /* fixfct should return TRUE if it modified the buffer. If
2518 FALSE, return a reference to the original buffer instead
2519 (to save space, not time) */
2522 return (PyObject
*) self
;
2524 return (PyObject
*) u
;
2528 int fixupper(PyUnicodeObject
*self
)
2530 int len
= self
->length
;
2531 Py_UNICODE
*s
= self
->str
;
2535 register Py_UNICODE ch
;
2537 ch
= Py_UNICODE_TOUPPER(*s
);
2549 int fixlower(PyUnicodeObject
*self
)
2551 int len
= self
->length
;
2552 Py_UNICODE
*s
= self
->str
;
2556 register Py_UNICODE ch
;
2558 ch
= Py_UNICODE_TOLOWER(*s
);
2570 int fixswapcase(PyUnicodeObject
*self
)
2572 int len
= self
->length
;
2573 Py_UNICODE
*s
= self
->str
;
2577 if (Py_UNICODE_ISUPPER(*s
)) {
2578 *s
= Py_UNICODE_TOLOWER(*s
);
2580 } else if (Py_UNICODE_ISLOWER(*s
)) {
2581 *s
= Py_UNICODE_TOUPPER(*s
);
2591 int fixcapitalize(PyUnicodeObject
*self
)
2593 int len
= self
->length
;
2594 Py_UNICODE
*s
= self
->str
;
2599 if (Py_UNICODE_ISLOWER(*s
)) {
2600 *s
= Py_UNICODE_TOUPPER(*s
);
2605 if (Py_UNICODE_ISUPPER(*s
)) {
2606 *s
= Py_UNICODE_TOLOWER(*s
);
2615 int fixtitle(PyUnicodeObject
*self
)
2617 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
2618 register Py_UNICODE
*e
;
2619 int previous_is_cased
;
2621 /* Shortcut for single character strings */
2622 if (PyUnicode_GET_SIZE(self
) == 1) {
2623 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
2632 e
= p
+ PyUnicode_GET_SIZE(self
);
2633 previous_is_cased
= 0;
2634 for (; p
< e
; p
++) {
2635 register const Py_UNICODE ch
= *p
;
2637 if (previous_is_cased
)
2638 *p
= Py_UNICODE_TOLOWER(ch
);
2640 *p
= Py_UNICODE_TOTITLE(ch
);
2642 if (Py_UNICODE_ISLOWER(ch
) ||
2643 Py_UNICODE_ISUPPER(ch
) ||
2644 Py_UNICODE_ISTITLE(ch
))
2645 previous_is_cased
= 1;
2647 previous_is_cased
= 0;
2652 PyObject
*PyUnicode_Join(PyObject
*separator
,
2657 PyUnicodeObject
*res
= NULL
;
2664 seqlen
= PySequence_Size(seq
);
2665 if (seqlen
< 0 && PyErr_Occurred())
2668 if (separator
== NULL
) {
2669 Py_UNICODE blank
= ' ';
2674 separator
= PyUnicode_FromObject(separator
);
2675 if (separator
== NULL
)
2677 sep
= PyUnicode_AS_UNICODE(separator
);
2678 seplen
= PyUnicode_GET_SIZE(separator
);
2681 res
= _PyUnicode_New(sz
);
2684 p
= PyUnicode_AS_UNICODE(res
);
2687 for (i
= 0; i
< seqlen
; i
++) {
2691 item
= PySequence_GetItem(seq
, i
);
2694 if (!PyUnicode_Check(item
)) {
2696 v
= PyUnicode_FromObject(item
);
2702 itemlen
= PyUnicode_GET_SIZE(item
);
2703 while (reslen
+ itemlen
+ seplen
>= sz
) {
2704 if (_PyUnicode_Resize(res
, sz
*2))
2707 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
2710 memcpy(p
, sep
, seplen
* sizeof(Py_UNICODE
));
2714 memcpy(p
, PyUnicode_AS_UNICODE(item
), itemlen
* sizeof(Py_UNICODE
));
2719 if (_PyUnicode_Resize(res
, reslen
))
2722 Py_XDECREF(separator
);
2723 return (PyObject
*)res
;
2726 Py_XDECREF(separator
);
2732 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
2744 if (left
== 0 && right
== 0) {
2749 u
= _PyUnicode_New(left
+ self
->length
+ right
);
2752 Py_UNICODE_FILL(u
->str
, fill
, left
);
2753 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
2755 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
2761 #define SPLIT_APPEND(data, left, right) \
2762 str = PyUnicode_FromUnicode(data + left, right - left); \
2765 if (PyList_Append(list, str)) { \
2773 PyObject
*split_whitespace(PyUnicodeObject
*self
,
2779 int len
= self
->length
;
2782 for (i
= j
= 0; i
< len
; ) {
2784 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2787 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
2790 if (maxcount
-- <= 0)
2792 SPLIT_APPEND(self
->str
, j
, i
);
2793 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2799 SPLIT_APPEND(self
->str
, j
, len
);
2808 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
2818 string
= PyUnicode_FromObject(string
);
2821 data
= PyUnicode_AS_UNICODE(string
);
2822 len
= PyUnicode_GET_SIZE(string
);
2824 list
= PyList_New(0);
2828 for (i
= j
= 0; i
< len
; ) {
2831 /* Find a line and append it */
2832 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
2835 /* Skip the line break reading CRLF as one line break */
2838 if (data
[i
] == '\r' && i
+ 1 < len
&&
2846 SPLIT_APPEND(data
, j
, eol
);
2850 SPLIT_APPEND(data
, j
, len
);
2863 PyObject
*split_char(PyUnicodeObject
*self
,
2870 int len
= self
->length
;
2873 for (i
= j
= 0; i
< len
; ) {
2874 if (self
->str
[i
] == ch
) {
2875 if (maxcount
-- <= 0)
2877 SPLIT_APPEND(self
->str
, j
, i
);
2883 SPLIT_APPEND(self
->str
, j
, len
);
2893 PyObject
*split_substring(PyUnicodeObject
*self
,
2895 PyUnicodeObject
*substring
,
2900 int len
= self
->length
;
2901 int sublen
= substring
->length
;
2904 for (i
= j
= 0; i
<= len
- sublen
; ) {
2905 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
2906 if (maxcount
-- <= 0)
2908 SPLIT_APPEND(self
->str
, j
, i
);
2914 SPLIT_APPEND(self
->str
, j
, len
);
2926 PyObject
*split(PyUnicodeObject
*self
,
2927 PyUnicodeObject
*substring
,
2935 list
= PyList_New(0);
2939 if (substring
== NULL
)
2940 return split_whitespace(self
,list
,maxcount
);
2942 else if (substring
->length
== 1)
2943 return split_char(self
,list
,substring
->str
[0],maxcount
);
2945 else if (substring
->length
== 0) {
2947 PyErr_SetString(PyExc_ValueError
, "empty separator");
2951 return split_substring(self
,list
,substring
,maxcount
);
2955 PyObject
*strip(PyUnicodeObject
*self
,
2959 Py_UNICODE
*p
= self
->str
;
2961 int end
= self
->length
;
2964 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
2968 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
2971 if (start
== 0 && end
== self
->length
) {
2972 /* couldn't strip anything off, return original string */
2974 return (PyObject
*) self
;
2977 return (PyObject
*) PyUnicode_FromUnicode(
2984 PyObject
*replace(PyUnicodeObject
*self
,
2985 PyUnicodeObject
*str1
,
2986 PyUnicodeObject
*str2
,
2994 if (str1
->length
== 1 && str2
->length
== 1) {
2997 /* replace characters */
2998 if (!findchar(self
->str
, self
->length
, str1
->str
[0])) {
2999 /* nothing to replace, return original string */
3003 Py_UNICODE u1
= str1
->str
[0];
3004 Py_UNICODE u2
= str2
->str
[0];
3006 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3011 for (i
= 0; i
< u
->length
; i
++)
3012 if (u
->str
[i
] == u1
) {
3023 /* replace strings */
3024 n
= count(self
, 0, self
->length
, str1
);
3028 /* nothing to replace, return original string */
3033 self
->length
+ n
* (str2
->length
- str1
->length
));
3037 while (i
<= self
->length
- str1
->length
)
3038 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3039 /* replace string segment */
3040 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3044 /* copy remaining part */
3045 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3049 *p
++ = self
->str
[i
++];
3054 return (PyObject
*) u
;
3057 /* --- Unicode Object Methods --------------------------------------------- */
3059 static char title__doc__
[] =
3060 "S.title() -> unicode\n\
3062 Return a titlecased version of S, i.e. words start with title case\n\
3063 characters, all remaining cased characters have lower case.";
3066 unicode_title(PyUnicodeObject
*self
, PyObject
*args
)
3068 if (!PyArg_NoArgs(args
))
3070 return fixup(self
, fixtitle
);
3073 static char capitalize__doc__
[] =
3074 "S.capitalize() -> unicode\n\
3076 Return a capitalized version of S, i.e. make the first character\n\
3080 unicode_capitalize(PyUnicodeObject
*self
, PyObject
*args
)
3082 if (!PyArg_NoArgs(args
))
3084 return fixup(self
, fixcapitalize
);
3088 static char capwords__doc__
[] =
3089 "S.capwords() -> unicode\n\
3091 Apply .capitalize() to all words in S and return the result with\n\
3092 normalized whitespace (all whitespace strings are replaced by ' ').";
3095 unicode_capwords(PyUnicodeObject
*self
, PyObject
*args
)
3101 if (!PyArg_NoArgs(args
))
3104 /* Split into words */
3105 list
= split(self
, NULL
, -1);
3109 /* Capitalize each word */
3110 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3111 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3115 Py_DECREF(PyList_GET_ITEM(list
, i
));
3116 PyList_SET_ITEM(list
, i
, item
);
3119 /* Join the words to form a new string */
3120 item
= PyUnicode_Join(NULL
, list
);
3124 return (PyObject
*)item
;
3128 static char center__doc__
[] =
3129 "S.center(width) -> unicode\n\
3131 Return S centered in a Unicode string of length width. Padding is done\n\
3135 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3140 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3143 if (self
->length
>= width
) {
3145 return (PyObject
*) self
;
3148 marg
= width
- self
->length
;
3149 left
= marg
/ 2 + (marg
& width
& 1);
3151 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3156 /* This code should go into some future Unicode collation support
3157 module. The basic comparison should compare ordinals on a naive
3158 basis (this is what Java does and thus JPython too). */
3160 /* speedy UTF-16 code point order comparison */
3162 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3164 static short utf16Fixup
[32] =
3166 0, 0, 0, 0, 0, 0, 0, 0,
3167 0, 0, 0, 0, 0, 0, 0, 0,
3168 0, 0, 0, 0, 0, 0, 0, 0,
3169 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3173 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3177 Py_UNICODE
*s1
= str1
->str
;
3178 Py_UNICODE
*s2
= str2
->str
;
3180 len1
= str1
->length
;
3181 len2
= str2
->length
;
3183 while (len1
> 0 && len2
> 0) {
3189 if (c1
> (1<<11) * 26)
3190 c1
+= utf16Fixup
[c1
>>11];
3191 if (c2
> (1<<11) * 26)
3192 c2
+= utf16Fixup
[c2
>>11];
3194 /* now c1 and c2 are in UTF-32-compatible order */
3195 diff
= (long)c1
- (long)c2
;
3197 return (diff
< 0) ? -1 : (diff
!= 0);
3201 return (len1
< len2
) ? -1 : (len1
!= len2
);
3207 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3209 register int len1
, len2
;
3211 Py_UNICODE
*s1
= str1
->str
;
3212 Py_UNICODE
*s2
= str2
->str
;
3214 len1
= str1
->length
;
3215 len2
= str2
->length
;
3217 while (len1
> 0 && len2
> 0) {
3220 diff
= (long)*s1
++ - (long)*s2
++;
3222 return (diff
< 0) ? -1 : (diff
!= 0);
3226 return (len1
< len2
) ? -1 : (len1
!= len2
);
3231 int PyUnicode_Compare(PyObject
*left
,
3234 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3237 /* Coerce the two arguments */
3238 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3241 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3245 /* Shortcut for empty or interned objects */
3252 result
= unicode_compare(u
, v
);
3264 int PyUnicode_Contains(PyObject
*container
,
3267 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3269 register const Py_UNICODE
*p
, *e
;
3270 register Py_UNICODE ch
;
3272 /* Coerce the two arguments */
3273 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3275 PyErr_SetString(PyExc_TypeError
,
3276 "'in <string>' requires character as left operand");
3279 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3286 if (PyUnicode_GET_SIZE(v
) != 1) {
3287 PyErr_SetString(PyExc_TypeError
,
3288 "'in <string>' requires character as left operand");
3291 ch
= *PyUnicode_AS_UNICODE(v
);
3292 p
= PyUnicode_AS_UNICODE(u
);
3293 e
= p
+ PyUnicode_GET_SIZE(u
);
3312 /* Concat to string or Unicode object giving a new Unicode object. */
3314 PyObject
*PyUnicode_Concat(PyObject
*left
,
3317 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3319 /* Coerce the two arguments */
3320 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3323 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3328 if (v
== unicode_empty
) {
3330 return (PyObject
*)u
;
3332 if (u
== unicode_empty
) {
3334 return (PyObject
*)v
;
3337 /* Concat the two Unicode strings */
3338 w
= _PyUnicode_New(u
->length
+ v
->length
);
3341 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3342 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3346 return (PyObject
*)w
;
3354 static char count__doc__
[] =
3355 "S.count(sub[, start[, end]]) -> int\n\
3357 Return the number of occurrences of substring sub in Unicode string\n\
3358 S[start:end]. Optional arguments start and end are\n\
3359 interpreted as in slice notation.";
3362 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3364 PyUnicodeObject
*substring
;
3369 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3370 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3373 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3374 (PyObject
*)substring
);
3375 if (substring
== NULL
)
3379 start
+= self
->length
;
3382 if (end
> self
->length
)
3385 end
+= self
->length
;
3389 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3391 Py_DECREF(substring
);
3395 static char encode__doc__
[] =
3396 "S.encode([encoding[,errors]]) -> string\n\
3398 Return an encoded string version of S. Default encoding is the current\n\
3399 default string encoding. errors may be given to set a different error\n\
3400 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3401 a ValueError. Other possible values are 'ignore' and 'replace'.";
3404 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3406 char *encoding
= NULL
;
3407 char *errors
= NULL
;
3408 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3410 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3413 static char expandtabs__doc__
[] =
3414 "S.expandtabs([tabsize]) -> unicode\n\
3416 Return a copy of S where all tab characters are expanded using spaces.\n\
3417 If tabsize is not given, a tab size of 8 characters is assumed.";
3420 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3429 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3432 /* First pass: determine size of output string */
3434 e
= self
->str
+ self
->length
;
3435 for (p
= self
->str
; p
< e
; p
++)
3438 j
+= tabsize
- (j
% tabsize
);
3442 if (*p
== '\n' || *p
== '\r') {
3448 /* Second pass: create output string and fill it */
3449 u
= _PyUnicode_New(i
+ j
);
3456 for (p
= self
->str
; p
< e
; p
++)
3459 i
= tabsize
- (j
% tabsize
);
3468 if (*p
== '\n' || *p
== '\r')
3472 return (PyObject
*) u
;
3475 static char find__doc__
[] =
3476 "S.find(sub [,start [,end]]) -> int\n\
3478 Return the lowest index in S where substring sub is found,\n\
3479 such that sub is contained within s[start,end]. Optional\n\
3480 arguments start and end are interpreted as in slice notation.\n\
3482 Return -1 on failure.";
3485 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
3487 PyUnicodeObject
*substring
;
3492 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
3493 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3495 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3496 (PyObject
*)substring
);
3497 if (substring
== NULL
)
3500 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
3502 Py_DECREF(substring
);
3507 unicode_getitem(PyUnicodeObject
*self
, int index
)
3509 if (index
< 0 || index
>= self
->length
) {
3510 PyErr_SetString(PyExc_IndexError
, "string index out of range");
3514 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
3518 unicode_hash(PyUnicodeObject
*self
)
3520 /* Since Unicode objects compare equal to their ASCII string
3521 counterparts, they should use the individual character values
3522 as basis for their hash value. This is needed to assure that
3523 strings and Unicode objects behave in the same way as
3527 register Py_UNICODE
*p
;
3530 if (self
->hash
!= -1)
3532 len
= PyUnicode_GET_SIZE(self
);
3533 p
= PyUnicode_AS_UNICODE(self
);
3536 x
= (1000003*x
) ^ *p
++;
3537 x
^= PyUnicode_GET_SIZE(self
);
3544 static char index__doc__
[] =
3545 "S.index(sub [,start [,end]]) -> int\n\
3547 Like S.find() but raise ValueError when the substring is not found.";
3550 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
3553 PyUnicodeObject
*substring
;
3557 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
3558 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3561 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3562 (PyObject
*)substring
);
3563 if (substring
== NULL
)
3566 result
= findstring(self
, substring
, start
, end
, 1);
3568 Py_DECREF(substring
);
3570 PyErr_SetString(PyExc_ValueError
, "substring not found");
3573 return PyInt_FromLong(result
);
3576 static char islower__doc__
[] =
3577 "S.islower() -> int\n\
3579 Return 1 if all cased characters in S are lowercase and there is\n\
3580 at least one cased character in S, 0 otherwise.";
3583 unicode_islower(PyUnicodeObject
*self
, PyObject
*args
)
3585 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3586 register const Py_UNICODE
*e
;
3589 if (!PyArg_NoArgs(args
))
3592 /* Shortcut for single character strings */
3593 if (PyUnicode_GET_SIZE(self
) == 1)
3594 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
3596 /* Special case for empty strings */
3597 if (PyString_GET_SIZE(self
) == 0)
3598 return PyInt_FromLong(0);
3600 e
= p
+ PyUnicode_GET_SIZE(self
);
3602 for (; p
< e
; p
++) {
3603 register const Py_UNICODE ch
= *p
;
3605 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
3606 return PyInt_FromLong(0);
3607 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
3610 return PyInt_FromLong(cased
);
3613 static char isupper__doc__
[] =
3614 "S.isupper() -> int\n\
3616 Return 1 if all cased characters in S are uppercase and there is\n\
3617 at least one cased character in S, 0 otherwise.";
3620 unicode_isupper(PyUnicodeObject
*self
, PyObject
*args
)
3622 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3623 register const Py_UNICODE
*e
;
3626 if (!PyArg_NoArgs(args
))
3629 /* Shortcut for single character strings */
3630 if (PyUnicode_GET_SIZE(self
) == 1)
3631 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
3633 /* Special case for empty strings */
3634 if (PyString_GET_SIZE(self
) == 0)
3635 return PyInt_FromLong(0);
3637 e
= p
+ PyUnicode_GET_SIZE(self
);
3639 for (; p
< e
; p
++) {
3640 register const Py_UNICODE ch
= *p
;
3642 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
3643 return PyInt_FromLong(0);
3644 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
3647 return PyInt_FromLong(cased
);
3650 static char istitle__doc__
[] =
3651 "S.istitle() -> int\n\
3653 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3654 may only follow uncased characters and lowercase characters only cased\n\
3655 ones. Return 0 otherwise.";
3658 unicode_istitle(PyUnicodeObject
*self
, PyObject
*args
)
3660 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3661 register const Py_UNICODE
*e
;
3662 int cased
, previous_is_cased
;
3664 if (!PyArg_NoArgs(args
))
3667 /* Shortcut for single character strings */
3668 if (PyUnicode_GET_SIZE(self
) == 1)
3669 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
3670 (Py_UNICODE_ISUPPER(*p
) != 0));
3672 /* Special case for empty strings */
3673 if (PyString_GET_SIZE(self
) == 0)
3674 return PyInt_FromLong(0);
3676 e
= p
+ PyUnicode_GET_SIZE(self
);
3678 previous_is_cased
= 0;
3679 for (; p
< e
; p
++) {
3680 register const Py_UNICODE ch
= *p
;
3682 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
3683 if (previous_is_cased
)
3684 return PyInt_FromLong(0);
3685 previous_is_cased
= 1;
3688 else if (Py_UNICODE_ISLOWER(ch
)) {
3689 if (!previous_is_cased
)
3690 return PyInt_FromLong(0);
3691 previous_is_cased
= 1;
3695 previous_is_cased
= 0;
3697 return PyInt_FromLong(cased
);
3700 static char isspace__doc__
[] =
3701 "S.isspace() -> int\n\
3703 Return 1 if there are only whitespace characters in S,\n\
3707 unicode_isspace(PyUnicodeObject
*self
, PyObject
*args
)
3709 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3710 register const Py_UNICODE
*e
;
3712 if (!PyArg_NoArgs(args
))
3715 /* Shortcut for single character strings */
3716 if (PyUnicode_GET_SIZE(self
) == 1 &&
3717 Py_UNICODE_ISSPACE(*p
))
3718 return PyInt_FromLong(1);
3720 /* Special case for empty strings */
3721 if (PyString_GET_SIZE(self
) == 0)
3722 return PyInt_FromLong(0);
3724 e
= p
+ PyUnicode_GET_SIZE(self
);
3725 for (; p
< e
; p
++) {
3726 if (!Py_UNICODE_ISSPACE(*p
))
3727 return PyInt_FromLong(0);
3729 return PyInt_FromLong(1);
3732 static char isalpha__doc__
[] =
3733 "S.isalpha() -> int\n\
3735 Return 1 if all characters in S are alphabetic\n\
3736 and there is at least one character in S, 0 otherwise.";
3739 unicode_isalpha(PyUnicodeObject
*self
, PyObject
*args
)
3741 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3742 register const Py_UNICODE
*e
;
3744 if (!PyArg_NoArgs(args
))
3747 /* Shortcut for single character strings */
3748 if (PyUnicode_GET_SIZE(self
) == 1 &&
3749 Py_UNICODE_ISALPHA(*p
))
3750 return PyInt_FromLong(1);
3752 /* Special case for empty strings */
3753 if (PyString_GET_SIZE(self
) == 0)
3754 return PyInt_FromLong(0);
3756 e
= p
+ PyUnicode_GET_SIZE(self
);
3757 for (; p
< e
; p
++) {
3758 if (!Py_UNICODE_ISALPHA(*p
))
3759 return PyInt_FromLong(0);
3761 return PyInt_FromLong(1);
3764 static char isalnum__doc__
[] =
3765 "S.isalnum() -> int\n\
3767 Return 1 if all characters in S are alphanumeric\n\
3768 and there is at least one character in S, 0 otherwise.";
3771 unicode_isalnum(PyUnicodeObject
*self
, PyObject
*args
)
3773 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3774 register const Py_UNICODE
*e
;
3776 if (!PyArg_NoArgs(args
))
3779 /* Shortcut for single character strings */
3780 if (PyUnicode_GET_SIZE(self
) == 1 &&
3781 Py_UNICODE_ISALNUM(*p
))
3782 return PyInt_FromLong(1);
3784 /* Special case for empty strings */
3785 if (PyString_GET_SIZE(self
) == 0)
3786 return PyInt_FromLong(0);
3788 e
= p
+ PyUnicode_GET_SIZE(self
);
3789 for (; p
< e
; p
++) {
3790 if (!Py_UNICODE_ISALNUM(*p
))
3791 return PyInt_FromLong(0);
3793 return PyInt_FromLong(1);
3796 static char isdecimal__doc__
[] =
3797 "S.isdecimal() -> int\n\
3799 Return 1 if there are only decimal characters in S,\n\
3803 unicode_isdecimal(PyUnicodeObject
*self
, PyObject
*args
)
3805 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3806 register const Py_UNICODE
*e
;
3808 if (!PyArg_NoArgs(args
))
3811 /* Shortcut for single character strings */
3812 if (PyUnicode_GET_SIZE(self
) == 1 &&
3813 Py_UNICODE_ISDECIMAL(*p
))
3814 return PyInt_FromLong(1);
3816 /* Special case for empty strings */
3817 if (PyString_GET_SIZE(self
) == 0)
3818 return PyInt_FromLong(0);
3820 e
= p
+ PyUnicode_GET_SIZE(self
);
3821 for (; p
< e
; p
++) {
3822 if (!Py_UNICODE_ISDECIMAL(*p
))
3823 return PyInt_FromLong(0);
3825 return PyInt_FromLong(1);
3828 static char isdigit__doc__
[] =
3829 "S.isdigit() -> int\n\
3831 Return 1 if there are only digit characters in S,\n\
3835 unicode_isdigit(PyUnicodeObject
*self
, PyObject
*args
)
3837 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3838 register const Py_UNICODE
*e
;
3840 if (!PyArg_NoArgs(args
))
3843 /* Shortcut for single character strings */
3844 if (PyUnicode_GET_SIZE(self
) == 1 &&
3845 Py_UNICODE_ISDIGIT(*p
))
3846 return PyInt_FromLong(1);
3848 /* Special case for empty strings */
3849 if (PyString_GET_SIZE(self
) == 0)
3850 return PyInt_FromLong(0);
3852 e
= p
+ PyUnicode_GET_SIZE(self
);
3853 for (; p
< e
; p
++) {
3854 if (!Py_UNICODE_ISDIGIT(*p
))
3855 return PyInt_FromLong(0);
3857 return PyInt_FromLong(1);
3860 static char isnumeric__doc__
[] =
3861 "S.isnumeric() -> int\n\
3863 Return 1 if there are only numeric characters in S,\n\
3867 unicode_isnumeric(PyUnicodeObject
*self
, PyObject
*args
)
3869 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3870 register const Py_UNICODE
*e
;
3872 if (!PyArg_NoArgs(args
))
3875 /* Shortcut for single character strings */
3876 if (PyUnicode_GET_SIZE(self
) == 1 &&
3877 Py_UNICODE_ISNUMERIC(*p
))
3878 return PyInt_FromLong(1);
3880 /* Special case for empty strings */
3881 if (PyString_GET_SIZE(self
) == 0)
3882 return PyInt_FromLong(0);
3884 e
= p
+ PyUnicode_GET_SIZE(self
);
3885 for (; p
< e
; p
++) {
3886 if (!Py_UNICODE_ISNUMERIC(*p
))
3887 return PyInt_FromLong(0);
3889 return PyInt_FromLong(1);
3892 static char join__doc__
[] =
3893 "S.join(sequence) -> unicode\n\
3895 Return a string which is the concatenation of the strings in the\n\
3896 sequence. The separator between elements is S.";
3899 unicode_join(PyUnicodeObject
*self
, PyObject
*args
)
3902 if (!PyArg_ParseTuple(args
, "O:join", &data
))
3905 return PyUnicode_Join((PyObject
*)self
, data
);
3909 unicode_length(PyUnicodeObject
*self
)
3911 return self
->length
;
3914 static char ljust__doc__
[] =
3915 "S.ljust(width) -> unicode\n\
3917 Return S left justified in a Unicode string of length width. Padding is\n\
3918 done using spaces.";
3921 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
3924 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
3927 if (self
->length
>= width
) {
3929 return (PyObject
*) self
;
3932 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
3935 static char lower__doc__
[] =
3936 "S.lower() -> unicode\n\
3938 Return a copy of the string S converted to lowercase.";
3941 unicode_lower(PyUnicodeObject
*self
, PyObject
*args
)
3943 if (!PyArg_NoArgs(args
))
3945 return fixup(self
, fixlower
);
3948 static char lstrip__doc__
[] =
3949 "S.lstrip() -> unicode\n\
3951 Return a copy of the string S with leading whitespace removed.";
3954 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
3956 if (!PyArg_NoArgs(args
))
3958 return strip(self
, 1, 0);
3962 unicode_repeat(PyUnicodeObject
*str
, int len
)
3973 /* no repeat, return original string */
3975 return (PyObject
*) str
;
3978 /* ensure # of chars needed doesn't overflow int and # of bytes
3979 * needed doesn't overflow size_t
3981 nchars
= len
* str
->length
;
3982 if (len
&& nchars
/ len
!= str
->length
) {
3983 PyErr_SetString(PyExc_OverflowError
,
3984 "repeated string is too long");
3987 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
3988 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
3989 PyErr_SetString(PyExc_OverflowError
,
3990 "repeated string is too long");
3993 u
= _PyUnicode_New(nchars
);
4000 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4004 return (PyObject
*) u
;
4007 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4017 self
= PyUnicode_FromObject(obj
);
4020 str1
= PyUnicode_FromObject(subobj
);
4025 str2
= PyUnicode_FromObject(replobj
);
4031 result
= replace((PyUnicodeObject
*)self
,
4032 (PyUnicodeObject
*)str1
,
4033 (PyUnicodeObject
*)str2
,
4041 static char replace__doc__
[] =
4042 "S.replace (old, new[, maxsplit]) -> unicode\n\
4044 Return a copy of S with all occurrences of substring\n\
4045 old replaced by new. If the optional argument maxsplit is\n\
4046 given, only the first maxsplit occurrences are replaced.";
4049 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4051 PyUnicodeObject
*str1
;
4052 PyUnicodeObject
*str2
;
4056 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4058 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4061 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4065 result
= replace(self
, str1
, str2
, maxcount
);
4073 PyObject
*unicode_repr(PyObject
*unicode
)
4075 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4076 PyUnicode_GET_SIZE(unicode
),
4080 static char rfind__doc__
[] =
4081 "S.rfind(sub [,start [,end]]) -> int\n\
4083 Return the highest index in S where substring sub is found,\n\
4084 such that sub is contained within s[start,end]. Optional\n\
4085 arguments start and end are interpreted as in slice notation.\n\
4087 Return -1 on failure.";
4090 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4092 PyUnicodeObject
*substring
;
4097 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4098 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4100 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4101 (PyObject
*)substring
);
4102 if (substring
== NULL
)
4105 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4107 Py_DECREF(substring
);
4111 static char rindex__doc__
[] =
4112 "S.rindex(sub [,start [,end]]) -> int\n\
4114 Like S.rfind() but raise ValueError when the substring is not found.";
4117 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4120 PyUnicodeObject
*substring
;
4124 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4125 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4127 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4128 (PyObject
*)substring
);
4129 if (substring
== NULL
)
4132 result
= findstring(self
, substring
, start
, end
, -1);
4134 Py_DECREF(substring
);
4136 PyErr_SetString(PyExc_ValueError
, "substring not found");
4139 return PyInt_FromLong(result
);
4142 static char rjust__doc__
[] =
4143 "S.rjust(width) -> unicode\n\
4145 Return S right justified in a Unicode string of length width. Padding is\n\
4146 done using spaces.";
4149 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4152 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4155 if (self
->length
>= width
) {
4157 return (PyObject
*) self
;
4160 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4163 static char rstrip__doc__
[] =
4164 "S.rstrip() -> unicode\n\
4166 Return a copy of the string S with trailing whitespace removed.";
4169 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
4171 if (!PyArg_NoArgs(args
))
4173 return strip(self
, 0, 1);
4177 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4179 /* standard clamping */
4184 if (end
> self
->length
)
4186 if (start
== 0 && end
== self
->length
) {
4187 /* full slice, return original string */
4189 return (PyObject
*) self
;
4194 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4198 PyObject
*PyUnicode_Split(PyObject
*s
,
4204 s
= PyUnicode_FromObject(s
);
4208 sep
= PyUnicode_FromObject(sep
);
4215 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4222 static char split__doc__
[] =
4223 "S.split([sep [,maxsplit]]) -> list of strings\n\
4225 Return a list of the words in S, using sep as the\n\
4226 delimiter string. If maxsplit is given, at most maxsplit\n\
4227 splits are done. If sep is not specified, any whitespace string\n\
4231 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4233 PyObject
*substring
= Py_None
;
4236 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4239 if (substring
== Py_None
)
4240 return split(self
, NULL
, maxcount
);
4241 else if (PyUnicode_Check(substring
))
4242 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4244 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
4247 static char splitlines__doc__
[] =
4248 "S.splitlines([keepends]]) -> list of strings\n\
4250 Return a list of the lines in S, breaking at line boundaries.\n\
4251 Line breaks are not included in the resulting list unless keepends\n\
4252 is given and true.";
4255 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4259 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4262 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4266 PyObject
*unicode_str(PyUnicodeObject
*self
)
4268 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4271 static char strip__doc__
[] =
4272 "S.strip() -> unicode\n\
4274 Return a copy of S with leading and trailing whitespace removed.";
4277 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
4279 if (!PyArg_NoArgs(args
))
4281 return strip(self
, 1, 1);
4284 static char swapcase__doc__
[] =
4285 "S.swapcase() -> unicode\n\
4287 Return a copy of S with uppercase characters converted to lowercase\n\
4291 unicode_swapcase(PyUnicodeObject
*self
, PyObject
*args
)
4293 if (!PyArg_NoArgs(args
))
4295 return fixup(self
, fixswapcase
);
4298 static char translate__doc__
[] =
4299 "S.translate(table) -> unicode\n\
4301 Return a copy of the string S, where all characters have been mapped\n\
4302 through the given translation table, which must be a mapping of\n\
4303 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4304 are left untouched. Characters mapped to None are deleted.";
4307 unicode_translate(PyUnicodeObject
*self
, PyObject
*args
)
4311 if (!PyArg_ParseTuple(args
, "O:translate", &table
))
4313 return PyUnicode_TranslateCharmap(self
->str
,
4319 static char upper__doc__
[] =
4320 "S.upper() -> unicode\n\
4322 Return a copy of S converted to uppercase.";
4325 unicode_upper(PyUnicodeObject
*self
, PyObject
*args
)
4327 if (!PyArg_NoArgs(args
))
4329 return fixup(self
, fixupper
);
4333 static char zfill__doc__
[] =
4334 "S.zfill(width) -> unicode\n\
4336 Pad a numeric string x with zeros on the left, to fill a field\n\
4337 of the specified width. The string x is never truncated.";
4340 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4346 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4349 if (self
->length
>= width
) {
4351 return (PyObject
*) self
;
4354 fill
= width
- self
->length
;
4356 u
= pad(self
, fill
, 0, '0');
4358 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4359 /* move sign to beginning of string */
4360 u
->str
[0] = u
->str
[fill
];
4364 return (PyObject
*) u
;
4370 unicode_freelistsize(PyUnicodeObject
*self
, PyObject
*args
)
4372 if (!PyArg_NoArgs(args
))
4374 return PyInt_FromLong(unicode_freelist_size
);
4378 static char startswith__doc__
[] =
4379 "S.startswith(prefix[, start[, end]]) -> int\n\
4381 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4382 optional start, test S beginning at that position. With optional end, stop\n\
4383 comparing S at that position.";
4386 unicode_startswith(PyUnicodeObject
*self
,
4389 PyUnicodeObject
*substring
;
4394 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4395 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4397 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4398 (PyObject
*)substring
);
4399 if (substring
== NULL
)
4402 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4404 Py_DECREF(substring
);
4409 static char endswith__doc__
[] =
4410 "S.endswith(suffix[, start[, end]]) -> int\n\
4412 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4413 optional start, test S beginning at that position. With optional end, stop\n\
4414 comparing S at that position.";
4417 unicode_endswith(PyUnicodeObject
*self
,
4420 PyUnicodeObject
*substring
;
4425 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4426 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4428 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4429 (PyObject
*)substring
);
4430 if (substring
== NULL
)
4433 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4435 Py_DECREF(substring
);
/* Method table searched by unicode_getattr() through Py_FindMethod().
   Each entry lists the Python-level name, the implementing C function,
   a calling-convention flag and the docstring.  In this file the flag
   is 1 for functions that parse their arguments with PyArg_ParseTuple()
   and 0 for those that use PyArg_NoArgs().
   NOTE(review): the embedded line numbers skip values inside this
   table, so the listing omits some lines of it -- consult the complete
   file before editing. */
4440 static PyMethodDef unicode_methods
[] = {
4442 /* Order is according to common usage: often used methods should
4443 appear first, since lookup is done sequentially. */
4445 {"encode", (PyCFunction
) unicode_encode
, 1, encode__doc__
},
4446 {"replace", (PyCFunction
) unicode_replace
, 1, replace__doc__
},
4447 {"split", (PyCFunction
) unicode_split
, 1, split__doc__
},
4448 {"join", (PyCFunction
) unicode_join
, 1, join__doc__
},
4449 {"capitalize", (PyCFunction
) unicode_capitalize
, 0, capitalize__doc__
},
4450 {"title", (PyCFunction
) unicode_title
, 0, title__doc__
},
4451 {"center", (PyCFunction
) unicode_center
, 1, center__doc__
},
4452 {"count", (PyCFunction
) unicode_count
, 1, count__doc__
},
4453 {"expandtabs", (PyCFunction
) unicode_expandtabs
, 1, expandtabs__doc__
},
4454 {"find", (PyCFunction
) unicode_find
, 1, find__doc__
},
4455 {"index", (PyCFunction
) unicode_index
, 1, index__doc__
},
4456 {"ljust", (PyCFunction
) unicode_ljust
, 1, ljust__doc__
},
4457 {"lower", (PyCFunction
) unicode_lower
, 0, lower__doc__
},
4458 {"lstrip", (PyCFunction
) unicode_lstrip
, 0, lstrip__doc__
},
4459 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4460 {"rfind", (PyCFunction
) unicode_rfind
, 1, rfind__doc__
},
4461 {"rindex", (PyCFunction
) unicode_rindex
, 1, rindex__doc__
},
4462 {"rjust", (PyCFunction
) unicode_rjust
, 1, rjust__doc__
},
4463 {"rstrip", (PyCFunction
) unicode_rstrip
, 0, rstrip__doc__
},
4464 {"splitlines", (PyCFunction
) unicode_splitlines
, 1, splitlines__doc__
},
4465 {"strip", (PyCFunction
) unicode_strip
, 0, strip__doc__
},
4466 {"swapcase", (PyCFunction
) unicode_swapcase
, 0, swapcase__doc__
},
4467 {"translate", (PyCFunction
) unicode_translate
, 1, translate__doc__
},
4468 {"upper", (PyCFunction
) unicode_upper
, 0, upper__doc__
},
4469 {"startswith", (PyCFunction
) unicode_startswith
, 1, startswith__doc__
},
4470 {"endswith", (PyCFunction
) unicode_endswith
, 1, endswith__doc__
},
4471 {"islower", (PyCFunction
) unicode_islower
, 0, islower__doc__
},
4472 {"isupper", (PyCFunction
) unicode_isupper
, 0, isupper__doc__
},
4473 {"istitle", (PyCFunction
) unicode_istitle
, 0, istitle__doc__
},
4474 {"isspace", (PyCFunction
) unicode_isspace
, 0, isspace__doc__
},
4475 {"isdecimal", (PyCFunction
) unicode_isdecimal
, 0, isdecimal__doc__
},
4476 {"isdigit", (PyCFunction
) unicode_isdigit
, 0, isdigit__doc__
},
4477 {"isnumeric", (PyCFunction
) unicode_isnumeric
, 0, isnumeric__doc__
},
4478 {"isalpha", (PyCFunction
) unicode_isalpha
, 0, isalpha__doc__
},
4479 {"isalnum", (PyCFunction
) unicode_isalnum
, 0, isalnum__doc__
},
4481 {"zfill", (PyCFunction
) unicode_zfill
, 1, zfill__doc__
},
4482 {"capwords", (PyCFunction
) unicode_capwords
, 0, capwords__doc__
},
4486 /* This one is just used for debugging the implementation. */
4487 {"freelistsize", (PyCFunction
) unicode_freelistsize
, 0},
4494 unicode_getattr(PyUnicodeObject
*self
, char *name
)
4496 return Py_FindMethod(unicode_methods
, (PyObject
*) self
, name
);
4499 static PySequenceMethods unicode_as_sequence
= {
4500 (inquiry
) unicode_length
, /* sq_length */
4501 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4502 (intargfunc
) unicode_repeat
, /* sq_repeat */
4503 (intargfunc
) unicode_getitem
, /* sq_item */
4504 (intintargfunc
) unicode_slice
, /* sq_slice */
4505 0, /* sq_ass_item */
4506 0, /* sq_ass_slice */
4507 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4511 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4516 PyErr_SetString(PyExc_SystemError
,
4517 "accessing non-existent unicode segment");
4520 *ptr
= (void *) self
->str
;
4521 return PyUnicode_GET_DATA_SIZE(self
);
4525 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4528 PyErr_SetString(PyExc_TypeError
,
4529 "cannot use unicode as modifyable buffer");
4534 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
4538 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
4543 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
4550 PyErr_SetString(PyExc_SystemError
,
4551 "accessing non-existent unicode segment");
4554 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
4557 *ptr
= (void *) PyString_AS_STRING(str
);
4558 return PyString_GET_SIZE(str
);
4561 /* Helpers for PyUnicode_Format() */
4564 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
4566 int argidx
= *p_argidx
;
4567 if (argidx
< arglen
) {
4572 return PyTuple_GetItem(args
, argidx
);
4574 PyErr_SetString(PyExc_TypeError
,
4575 "not enough arguments for format string");
/* Bit flags collected while PyUnicode_Format() parses a conversion
   specifier: '-' sets F_LJUST, '+' F_SIGN, ' ' F_BLANK, '#' F_ALT and
   '0' F_ZERO. */
4579 #define F_LJUST (1<<0)
4580 #define F_SIGN (1<<1)
4581 #define F_BLANK (1<<2)
4582 #define F_ALT (1<<3)
4583 #define F_ZERO (1<<4)
4586 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
4592 va_start(va
, format
);
4594 /* First, format the string as char array, then expand to Py_UNICODE
4596 charbuffer
= (char *)buffer
;
4597 len
= vsprintf(charbuffer
, format
, va
);
4598 for (i
= len
- 1; i
>= 0; i
--)
4599 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
4606 formatfloat(Py_UNICODE
*buf
,
4613 /* fmt = '%#.' + `prec` + `type`
4614 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4618 x
= PyFloat_AsDouble(v
);
4619 if (x
== -1.0 && PyErr_Occurred())
4623 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
4625 sprintf(fmt
, "%%%s.%d%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4626 /* worst case length calc to ensure no buffer overrun:
4628 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4629 for any double rep.)
4630 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4631 If prec=0 the effective precision is 1 (the leading digit is
4632 always given), therefore increase by one to 10+prec. */
4633 if (buflen
<= (size_t)10 + (size_t)prec
) {
4634 PyErr_SetString(PyExc_OverflowError
,
4635 "formatted float is too long (precision too long?)");
4638 return usprintf(buf
, fmt
, x
);
4642 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
4646 PyObject
*str
; /* temporary string object. */
4647 PyUnicodeObject
*result
;
4649 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
4652 result
= _PyUnicode_New(len
);
4653 for (i
= 0; i
< len
; i
++)
4654 result
->str
[i
] = buf
[i
];
4655 result
->str
[len
] = 0;
4657 return (PyObject
*)result
;
4661 formatint(Py_UNICODE
*buf
,
4668 /* fmt = '%#.' + `prec` + 'l' + `type`
4669 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4671 char fmt
[64]; /* plenty big enough! */
4674 x
= PyInt_AsLong(v
);
4675 if (x
== -1 && PyErr_Occurred())
4679 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4680 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4681 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
4682 PyErr_SetString(PyExc_OverflowError
,
4683 "formatted integer is too long (precision too long?)");
4686 sprintf(fmt
, "%%%s.%dl%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4687 return usprintf(buf
, fmt
, x
);
/* Helper for the %c conversion: store one character taken from v in
   buf[0].  A Unicode or string object is checked for a size of exactly
   1; any other object is converted with PyInt_AsLong().  The visible
   error path raises TypeError("%c requires int or char").
   NOTE(review): the statements between the visible lines (error jumps,
   termination of buf, return values) are absent from this listing --
   consult the complete file before editing. */
4691 formatchar(Py_UNICODE
*buf
,
4695 /* presume that the buffer is at least 2 characters long */
4696 if (PyUnicode_Check(v
)) {
4697 if (PyUnicode_GET_SIZE(v
) != 1)
4699 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
4702 else if (PyString_Check(v
)) {
4703 if (PyString_GET_SIZE(v
) != 1)
4705 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
4709 /* Integer input truncated to a character */
4711 x
= PyInt_AsLong(v
);
4712 if (x
== -1 && PyErr_Occurred())
4720 PyErr_SetString(PyExc_TypeError
,
4721 "%c requires int or char");
/* PyUnicode_Format(format, args): implementation of "format % args"
   for Unicode format strings.

   What the visible lines establish:
   - a NULL format or args is reported with PyErr_BadInternalCall();
   - format is coerced with PyUnicode_FromObject();
   - the result starts with room for fmtcnt + 100 characters and is
     grown with _PyUnicode_Resize() when rescnt runs short;
   - "(key)" specifiers are encoded with PyUnicode_EncodeUTF8() and
     looked up with PyObject_GetItem(dict, key);
   - the flag characters '-', '+', ' ', '#', '0' set the F_* bits;
   - width and precision come either from the next argument or from
     digits, with a multiply-and-divide overflow check;
   - numeric and character conversions are delegated to formatlong(),
     formatint(), formatfloat() and formatchar(), the last three
     writing into the local formatbuf of FORMATBUFLEN units;
   - on exit the result is trimmed to reslen - rescnt characters.

   NOTE(review): a large share of this function's statements (labels,
   error jumps, the conversion switch, cleanup) is absent from this
   listing -- consult the complete file before editing. */
4725 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4727 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4728 chars are formatted. XXX This is a magic number. Each formatting
4729 routine does bounds checking to ensure no overflow, but a better
4730 solution may be to malloc a buffer of appropriate size for each
4731 format. For now, the current solution is sufficient.
4733 #define FORMATBUFLEN (size_t)120
4735 PyObject
*PyUnicode_Format(PyObject
*format
,
4738 Py_UNICODE
*fmt
, *res
;
4739 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
4741 PyUnicodeObject
*result
= NULL
;
4742 PyObject
*dict
= NULL
;
4745 if (format
== NULL
|| args
== NULL
) {
4746 PyErr_BadInternalCall();
4749 uformat
= PyUnicode_FromObject(format
);
4750 if (uformat
== NULL
)
4752 fmt
= PyUnicode_AS_UNICODE(uformat
);
4753 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
4755 reslen
= rescnt
= fmtcnt
+ 100;
4756 result
= _PyUnicode_New(reslen
);
4759 res
= PyUnicode_AS_UNICODE(result
);
4761 if (PyTuple_Check(args
)) {
4762 arglen
= PyTuple_Size(args
);
4769 if (args
->ob_type
->tp_as_mapping
)
4772 while (--fmtcnt
>= 0) {
4775 rescnt
= fmtcnt
+ 100;
4777 if (_PyUnicode_Resize(result
, reslen
) < 0)
4779 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
4785 /* Got a format specifier */
4790 Py_UNICODE c
= '\0';
4793 PyObject
*temp
= NULL
;
4797 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
4801 Py_UNICODE
*keystart
;
4807 PyErr_SetString(PyExc_TypeError
,
4808 "format requires a mapping");
4814 /* Skip over balanced parentheses */
4815 while (pcount
> 0 && --fmtcnt
>= 0) {
4818 else if (*fmt
== '(')
4822 keylen
= fmt
- keystart
- 1;
4823 if (fmtcnt
< 0 || pcount
> 0) {
4824 PyErr_SetString(PyExc_ValueError
,
4825 "incomplete format key");
4828 /* keys are converted to strings using UTF-8 and
4829 then looked up since Python uses strings to hold
4830 variables names etc. in its namespaces and we
4831 wouldn't want to break common idioms. */
4832 key
= PyUnicode_EncodeUTF8(keystart
,
4841 args
= PyObject_GetItem(dict
, key
);
4850 while (--fmtcnt
>= 0) {
4851 switch (c
= *fmt
++) {
4852 case '-': flags
|= F_LJUST
; continue;
4853 case '+': flags
|= F_SIGN
; continue;
4854 case ' ': flags
|= F_BLANK
; continue;
4855 case '#': flags
|= F_ALT
; continue;
4856 case '0': flags
|= F_ZERO
; continue;
4861 v
= getnextarg(args
, arglen
, &argidx
);
4864 if (!PyInt_Check(v
)) {
4865 PyErr_SetString(PyExc_TypeError
,
4869 width
= PyInt_AsLong(v
);
4877 else if (c
>= '0' && c
<= '9') {
4879 while (--fmtcnt
>= 0) {
4881 if (c
< '0' || c
> '9')
4883 if ((width
*10) / 10 != width
) {
4884 PyErr_SetString(PyExc_ValueError
,
4888 width
= width
*10 + (c
- '0');
4896 v
= getnextarg(args
, arglen
, &argidx
);
4899 if (!PyInt_Check(v
)) {
4900 PyErr_SetString(PyExc_TypeError
,
4904 prec
= PyInt_AsLong(v
);
4910 else if (c
>= '0' && c
<= '9') {
4912 while (--fmtcnt
>= 0) {
4913 c
= Py_CHARMASK(*fmt
++);
4914 if (c
< '0' || c
> '9')
4916 if ((prec
*10) / 10 != prec
) {
4917 PyErr_SetString(PyExc_ValueError
,
4921 prec
= prec
*10 + (c
- '0');
4926 if (c
== 'h' || c
== 'l' || c
== 'L') {
4933 PyErr_SetString(PyExc_ValueError
,
4934 "incomplete format");
4938 v
= getnextarg(args
, arglen
, &argidx
);
4948 /* presume that buffer length is at least 1 */
4955 if (PyUnicode_Check(v
) && c
== 's') {
4962 temp
= PyObject_Str(v
);
4964 temp
= PyObject_Repr(v
);
4967 if (!PyString_Check(temp
)) {
4968 /* XXX Note: this should never happen, since
4969 PyObject_Repr() and PyObject_Str() assure
4972 PyErr_SetString(PyExc_TypeError
,
4973 "%s argument has non-string str()");
4976 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
4977 PyString_GET_SIZE(temp
),
4985 pbuf
= PyUnicode_AS_UNICODE(temp
);
4986 len
= PyUnicode_GET_SIZE(temp
);
4987 if (prec
>= 0 && len
> prec
)
4999 if (PyLong_Check(v
)) {
5000 temp
= formatlong(v
, flags
, prec
, c
);
5003 pbuf
= PyUnicode_AS_UNICODE(temp
);
5004 len
= PyUnicode_GET_SIZE(temp
);
5005 /* unbounded ints can always produce
5006 a sign character! */
5011 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5015 /* only d conversion is signed */
5028 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5039 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5045 PyErr_Format(PyExc_ValueError
,
5046 "unsupported format character '%c' (0x%x) "
5048 (31<=c
&& c
<=126) ? c
: '?',
5049 c
, fmt
-1 - PyUnicode_AS_UNICODE(uformat
));
5053 if (*pbuf
== '-' || *pbuf
== '+') {
5057 else if (flags
& F_SIGN
)
5059 else if (flags
& F_BLANK
)
5066 if (rescnt
< width
+ (sign
!= 0)) {
5068 rescnt
= width
+ fmtcnt
+ 100;
5070 if (_PyUnicode_Resize(result
, reslen
) < 0)
5072 res
= PyUnicode_AS_UNICODE(result
)
5082 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5083 assert(pbuf
[0] == '0');
5084 assert(pbuf
[1] == c
);
5095 if (width
> len
&& !(flags
& F_LJUST
)) {
5099 } while (--width
> len
);
5104 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5105 assert(pbuf
[0] == '0');
5106 assert(pbuf
[1] == c
);
5111 memcpy(res
, pbuf
, len
* sizeof(Py_UNICODE
));
5114 while (--width
>= len
) {
5118 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5119 PyErr_SetString(PyExc_TypeError
,
5120 "not all arguments converted");
5126 if (argidx
< arglen
&& !dict
) {
5127 PyErr_SetString(PyExc_TypeError
,
5128 "not all arguments converted");
5136 if (_PyUnicode_Resize(result
, reslen
- rescnt
))
5138 return (PyObject
*)result
;
5149 static PyBufferProcs unicode_as_buffer
= {
5150 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5151 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5152 (getsegcountproc
) unicode_buffer_getsegcount
,
5153 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5156 PyTypeObject PyUnicode_Type
= {
5157 PyObject_HEAD_INIT(&PyType_Type
)
5159 "unicode", /* tp_name */
5160 sizeof(PyUnicodeObject
), /* tp_size */
5161 0, /* tp_itemsize */
5163 (destructor
)_PyUnicode_Free
, /* tp_dealloc */
5165 (getattrfunc
)unicode_getattr
, /* tp_getattr */
5167 (cmpfunc
) unicode_compare
, /* tp_compare */
5168 (reprfunc
) unicode_repr
, /* tp_repr */
5169 0, /* tp_as_number */
5170 &unicode_as_sequence
, /* tp_as_sequence */
5171 0, /* tp_as_mapping */
5172 (hashfunc
) unicode_hash
, /* tp_hash*/
5174 (reprfunc
) unicode_str
, /* tp_str */
5175 (getattrofunc
) NULL
, /* tp_getattro */
5176 (setattrofunc
) NULL
, /* tp_setattro */
5177 &unicode_as_buffer
, /* tp_as_buffer */
5178 Py_TPFLAGS_DEFAULT
, /* tp_flags */
5181 /* Initialize the Unicode implementation */
5183 void _PyUnicode_Init(void)
5185 /* Doublecheck the configuration... */
5186 if (sizeof(Py_UNICODE
) != 2)
5187 Py_FatalError("Unicode configuration error: "
5188 "sizeof(Py_UNICODE) != 2 bytes");
5190 /* Init the implementation */
5191 unicode_freelist
= NULL
;
5192 unicode_freelist_size
= 0;
5193 unicode_empty
= _PyUnicode_New(0);
5194 strcpy(unicode_default_encoding
, "ascii");
5197 /* Finalize the Unicode implementation */
/* Drops the reference to the shared empty string, then walks the free
   list: the next entry is read from the first pointer-sized field of
   each object, and the object's defenc reference is released.  Both
   free-list globals are reset afterwards.
   NOTE(review): the embedded line numbers skip values inside the loop
   and the closing lines are not in this listing -- consult the
   complete file before editing. */
5200 _PyUnicode_Fini(void)
5204 Py_XDECREF(unicode_empty
);
5205 unicode_empty
= NULL
;
5207 for (u
= unicode_freelist
; u
!= NULL
;) {
5208 PyUnicodeObject
*v
= u
;
5209 u
= *(PyUnicodeObject
**)u
;
5212 Py_XDECREF(v
->defenc
);
5215 unicode_freelist
= NULL
;
5216 unicode_freelist_size
= 0;