3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
11 --------------------------------------------------------------------
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
29 * Written by Fredrik Lundh, January 1999.
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
67 #include "unicodeobject.h"
70 #if defined(HAVE_LIMITS_H)
73 #define INT_MAX 2147483647
80 /* Limit for the Unicode object free list */
82 #define MAX_UNICODE_FREELIST_SIZE 1024
84 /* Size limit for the Unicode object free list keep-alive optimization.
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
90 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
91 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
92 malloc()-overhead) bytes of unused garbage.
94 Setting the limit to 0 effectively turns the feature off.
96 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
101 #define KEEPALIVE_SIZE_LIMIT 9
103 /* Endianness switches; defaults to little endian */
105 #ifdef WORDS_BIGENDIAN
106 # define BYTEORDER_IS_BIG_ENDIAN
108 # define BYTEORDER_IS_LITTLE_ENDIAN
111 /* --- Globals ------------------------------------------------------------
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
118 /* The empty Unicode object */
119 static PyUnicodeObject
*unicode_empty
;
121 /* Free list for Unicode objects */
122 static PyUnicodeObject
*unicode_freelist
;
123 static int unicode_freelist_size
;
125 /* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
133 static char unicode_default_encoding
[100];
135 /* --- Unicode Object ----------------------------------------------------- */
138 int _PyUnicode_Resize(register PyUnicodeObject
*unicode
,
143 /* Shortcut if there's nothing much to do. */
144 if (unicode
->length
== length
)
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode
== unicode_empty
) {
149 PyErr_SetString(PyExc_SystemError
,
150 "can't resize empty unicode object");
154 /* We allocate one more Py_UNICODE element to make sure the string is
155 U+0000 terminated -- XXX is this needed ? */
156 oldstr
= unicode
->str
;
157 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
159 unicode
->str
= oldstr
;
163 unicode
->str
[length
] = 0;
164 unicode
->length
= length
;
167 /* Reset the object caches */
168 if (unicode
->defenc
) {
169 Py_DECREF(unicode
->defenc
);
170 unicode
->defenc
= NULL
;
177 int PyUnicode_Resize(PyObject
**unicode
,
182 if (unicode
== NULL
) {
183 PyErr_BadInternalCall();
186 v
= (PyUnicodeObject
*)*unicode
;
187 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
188 PyErr_BadInternalCall();
191 return _PyUnicode_Resize(v
, length
);
194 /* We allocate one more Py_UNICODE element to make sure the string is
195 U+0000 terminated -- XXX is this needed ?
197 XXX This allocator could further be enhanced by assuring that the
198 free list never reduces its size below 1.
203 PyUnicodeObject
*_PyUnicode_New(int length
)
205 register PyUnicodeObject
*unicode
;
207 /* Optimization for empty strings */
208 if (length
== 0 && unicode_empty
!= NULL
) {
209 Py_INCREF(unicode_empty
);
210 return unicode_empty
;
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist
) {
215 unicode
= unicode_freelist
;
216 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
217 unicode_freelist_size
--;
219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode
->length
< length
) &&
222 _PyUnicode_Resize(unicode
, length
)) {
223 PyMem_DEL(unicode
->str
);
228 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
230 PyObject_INIT(unicode
, &PyUnicode_Type
);
233 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
236 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
243 unicode
->str
[length
] = 0;
244 unicode
->length
= length
;
246 unicode
->defenc
= NULL
;
250 _Py_ForgetReference((PyObject
*)unicode
);
251 PyObject_DEL(unicode
);
256 void _PyUnicode_Free(register PyUnicodeObject
*unicode
)
258 if (unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
259 /* Keep-Alive optimization */
260 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
261 PyMem_DEL(unicode
->str
);
265 if (unicode
->defenc
) {
266 Py_DECREF(unicode
->defenc
);
267 unicode
->defenc
= NULL
;
269 /* Add to free list */
270 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
271 unicode_freelist
= unicode
;
272 unicode_freelist_size
++;
275 PyMem_DEL(unicode
->str
);
276 Py_XDECREF(unicode
->defenc
);
277 PyObject_DEL(unicode
);
281 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
284 PyUnicodeObject
*unicode
;
286 unicode
= _PyUnicode_New(size
);
290 /* Copy the Unicode data into the new object */
292 memcpy(unicode
->str
, u
, size
* sizeof(Py_UNICODE
));
294 return (PyObject
*)unicode
;
299 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
302 PyUnicodeObject
*unicode
;
305 PyErr_BadInternalCall();
309 unicode
= _PyUnicode_New(size
);
313 /* Copy the wchar_t data into the new object */
314 #ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
318 register Py_UNICODE
*u
;
320 u
= PyUnicode_AS_UNICODE(unicode
);
321 for (i
= size
; i
>= 0; i
--)
326 return (PyObject
*)unicode
;
329 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
333 if (unicode
== NULL
) {
334 PyErr_BadInternalCall();
337 if (size
> PyUnicode_GET_SIZE(unicode
))
338 size
= PyUnicode_GET_SIZE(unicode
);
339 #ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
343 register Py_UNICODE
*u
;
345 u
= PyUnicode_AS_UNICODE(unicode
);
346 for (i
= size
; i
>= 0; i
--)
356 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
358 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
361 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
362 const char *encoding
,
371 PyErr_BadInternalCall();
376 if (PyInstance_Check(obj
)) {
378 func
= PyObject_GetAttrString(obj
, "__str__");
380 PyErr_SetString(PyExc_TypeError
,
381 "coercing to Unicode: instance doesn't define __str__");
384 obj
= PyEval_CallObject(func
, NULL
);
390 if (PyUnicode_Check(obj
)) {
394 PyErr_SetString(PyExc_TypeError
,
395 "decoding Unicode is not supported");
400 else if (PyString_Check(obj
)) {
401 s
= PyString_AS_STRING(obj
);
402 len
= PyString_GET_SIZE(obj
);
404 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
405 /* Overwrite the error message with something more useful in
406 case of a TypeError. */
407 if (PyErr_ExceptionMatches(PyExc_TypeError
))
408 PyErr_Format(PyExc_TypeError
,
409 "coercing to Unicode: need string or buffer, "
411 obj
->ob_type
->tp_name
);
415 /* Convert to Unicode */
417 Py_INCREF(unicode_empty
);
418 v
= (PyObject
*)unicode_empty
;
421 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
435 PyObject
*PyUnicode_Decode(const char *s
,
437 const char *encoding
,
440 PyObject
*buffer
= NULL
, *unicode
;
442 if (encoding
== NULL
)
443 encoding
= PyUnicode_GetDefaultEncoding();
445 /* Shortcuts for common default encodings */
446 if (strcmp(encoding
, "utf-8") == 0)
447 return PyUnicode_DecodeUTF8(s
, size
, errors
);
448 else if (strcmp(encoding
, "latin-1") == 0)
449 return PyUnicode_DecodeLatin1(s
, size
, errors
);
450 else if (strcmp(encoding
, "ascii") == 0)
451 return PyUnicode_DecodeASCII(s
, size
, errors
);
453 /* Decode via the codec registry */
454 buffer
= PyBuffer_FromMemory((void *)s
, size
);
457 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
460 if (!PyUnicode_Check(unicode
)) {
461 PyErr_Format(PyExc_TypeError
,
462 "decoder did not return an unicode object (type=%.400s)",
463 unicode
->ob_type
->tp_name
);
475 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
477 const char *encoding
,
480 PyObject
*v
, *unicode
;
482 unicode
= PyUnicode_FromUnicode(s
, size
);
485 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
490 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
491 const char *encoding
,
496 if (!PyUnicode_Check(unicode
)) {
501 if (encoding
== NULL
)
502 encoding
= PyUnicode_GetDefaultEncoding();
504 /* Shortcuts for common default encodings */
505 if (errors
== NULL
) {
506 if (strcmp(encoding
, "utf-8") == 0)
507 return PyUnicode_AsUTF8String(unicode
);
508 else if (strcmp(encoding
, "latin-1") == 0)
509 return PyUnicode_AsLatin1String(unicode
);
510 else if (strcmp(encoding
, "ascii") == 0)
511 return PyUnicode_AsASCIIString(unicode
);
514 /* Encode via the codec registry */
515 v
= PyCodec_Encode(unicode
, encoding
, errors
);
518 /* XXX Should we really enforce this ? */
519 if (!PyString_Check(v
)) {
520 PyErr_Format(PyExc_TypeError
,
521 "encoder did not return a string object (type=%.400s)",
522 v
->ob_type
->tp_name
);
532 /* Return a Python string holding the default encoded value of the
535 The resulting string is cached in the Unicode object for subsequent
536 usage by this function. The cached version is needed to implement
537 the character buffer interface and will live (at least) as long as
538 the Unicode object itself.
540 The refcount of the string is *not* incremented.
542 *** Exported for internal use by the interpreter only !!! ***
546 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
549 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
553 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
554 if (v
&& errors
== NULL
)
555 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
559 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
561 if (!PyUnicode_Check(unicode
)) {
565 return PyUnicode_AS_UNICODE(unicode
);
571 int PyUnicode_GetSize(PyObject
*unicode
)
573 if (!PyUnicode_Check(unicode
)) {
577 return PyUnicode_GET_SIZE(unicode
);
583 const char *PyUnicode_GetDefaultEncoding(void)
585 return unicode_default_encoding
;
588 int PyUnicode_SetDefaultEncoding(const char *encoding
)
592 /* Make sure the encoding is valid. As side effect, this also
593 loads the encoding into the codec registry cache. */
594 v
= _PyCodec_Lookup(encoding
);
598 strncpy(unicode_default_encoding
,
600 sizeof(unicode_default_encoding
));
607 /* --- UTF-8 Codec -------------------------------------------------------- */
610 char utf8_code_length
[256] = {
611 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
612 illegal prefix. See RFC 2279 for details. */
613 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
614 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
615 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
616 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
617 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
618 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
619 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
620 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
621 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
622 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
623 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
624 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
625 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
626 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
627 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
628 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
632 int utf8_decoding_error(const char **source
,
637 if ((errors
== NULL
) ||
638 (strcmp(errors
,"strict") == 0)) {
639 PyErr_Format(PyExc_UnicodeError
,
640 "UTF-8 decoding error: %.400s",
644 else if (strcmp(errors
,"ignore") == 0) {
648 else if (strcmp(errors
,"replace") == 0) {
650 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
655 PyErr_Format(PyExc_ValueError
,
656 "UTF-8 decoding error; unknown error handling code: %.400s",
662 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
668 PyUnicodeObject
*unicode
;
670 const char *errmsg
= "";
672 /* Note: size will always be longer than the resulting Unicode
674 unicode
= _PyUnicode_New(size
);
678 return (PyObject
*)unicode
;
680 /* Unpack UTF-8 encoded data */
685 Py_UCS4 ch
= (unsigned char)*s
;
688 *p
++ = (Py_UNICODE
)ch
;
693 n
= utf8_code_length
[ch
];
696 errmsg
= "unexpected end of data";
703 errmsg
= "unexpected code byte";
708 errmsg
= "internal error";
713 if ((s
[1] & 0xc0) != 0x80) {
714 errmsg
= "invalid data";
717 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
719 errmsg
= "illegal encoding";
723 *p
++ = (Py_UNICODE
)ch
;
727 if ((s
[1] & 0xc0) != 0x80 ||
728 (s
[2] & 0xc0) != 0x80) {
729 errmsg
= "invalid data";
732 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
733 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
734 errmsg
= "illegal encoding";
738 *p
++ = (Py_UNICODE
)ch
;
742 if ((s
[1] & 0xc0) != 0x80 ||
743 (s
[2] & 0xc0) != 0x80 ||
744 (s
[3] & 0xc0) != 0x80) {
745 errmsg
= "invalid data";
748 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
749 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
750 /* validate and convert to UTF-16 */
751 if ((ch
< 0x10000) || /* minimum value allowed for 4
753 (ch
> 0x10ffff)) { /* maximum value allowed for
755 errmsg
= "illegal encoding";
758 /* compute and append the two surrogates: */
760 /* translate from 10000..10FFFF to 0..FFFFF */
763 /* high surrogate = top 10 bits added to D800 */
764 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
766 /* low surrogate = bottom 10 bits added to DC00 */
767 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& ~0xFC00));
771 /* Other sizes are only needed for UCS-4 */
772 errmsg
= "unsupported Unicode code range";
780 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
785 if (_PyUnicode_Resize(unicode
, p
- unicode
->str
))
788 return (PyObject
*)unicode
;
795 /* Not used anymore, now that the encoder supports UTF-16
799 int utf8_encoding_error(const Py_UNICODE
**source
,
804 if ((errors
== NULL
) ||
805 (strcmp(errors
,"strict") == 0)) {
806 PyErr_Format(PyExc_UnicodeError
,
807 "UTF-8 encoding error: %.400s",
811 else if (strcmp(errors
,"ignore") == 0) {
814 else if (strcmp(errors
,"replace") == 0) {
820 PyErr_Format(PyExc_ValueError
,
821 "UTF-8 encoding error; "
822 "unknown error handling code: %.400s",
829 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
837 unsigned int cbAllocated
= 3 * size
;
838 unsigned int cbWritten
= 0;
841 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
847 p
= q
= PyString_AS_STRING(v
);
854 else if (ch
< 0x0800) {
855 *p
++ = 0xc0 | (ch
>> 6);
856 *p
++ = 0x80 | (ch
& 0x3f);
860 /* Check for high surrogate */
861 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
864 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
866 if (cbWritten
>= (cbAllocated
- 4)) {
867 /* Provide enough room for some more
870 if (_PyString_Resize(&v
, cbAllocated
))
874 /* combine the two values */
875 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
877 *p
++ = (char)((ch
>> 18) | 0xf0);
878 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
885 *p
++ = (char)(0xe0 | (ch
>> 12));
888 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
889 *p
++ = (char)(0x80 | (ch
& 0x3f));
893 if (_PyString_Resize(&v
, p
- q
))
902 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
906 if (!PyUnicode_Check(unicode
)) {
910 str
= PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
911 PyUnicode_GET_SIZE(unicode
),
919 /* --- UTF-16 Codec ------------------------------------------------------- */
922 int utf16_decoding_error(const Py_UNICODE
**source
,
927 if ((errors
== NULL
) ||
928 (strcmp(errors
,"strict") == 0)) {
929 PyErr_Format(PyExc_UnicodeError
,
930 "UTF-16 decoding error: %.400s",
934 else if (strcmp(errors
,"ignore") == 0) {
937 else if (strcmp(errors
,"replace") == 0) {
939 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
945 PyErr_Format(PyExc_ValueError
,
946 "UTF-16 decoding error; "
947 "unknown error handling code: %.400s",
953 PyObject
*PyUnicode_DecodeUTF16(const char *s
,
958 PyUnicodeObject
*unicode
;
960 const Py_UNICODE
*q
, *e
;
962 const char *errmsg
= "";
964 /* size should be an even number */
965 if (size
% sizeof(Py_UNICODE
) != 0) {
966 if (utf16_decoding_error(NULL
, NULL
, errors
, "truncated data"))
968 /* The remaining input chars are ignored if we fall through
972 /* Note: size will always be longer than the resulting Unicode
974 unicode
= _PyUnicode_New(size
);
978 return (PyObject
*)unicode
;
980 /* Unpack UTF-16 encoded data */
983 e
= q
+ (size
/ sizeof(Py_UNICODE
));
989 register Py_UNICODE ch
= *q
++;
991 /* Check for BOM marks (U+FEFF) in the input and adjust
992 current byte order setting accordingly. Swap input
993 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
995 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
999 } else if (ch
== 0xFFFE) {
1004 ch
= (ch
>> 8) | (ch
<< 8);
1009 } else if (ch
== 0xFFFE) {
1014 ch
= (ch
>> 8) | (ch
<< 8);
1016 if (ch
< 0xD800 || ch
> 0xDFFF) {
1021 /* UTF-16 code pair: */
1023 errmsg
= "unexpected end of data";
1026 if (0xDC00 <= *q
&& *q
<= 0xDFFF) {
1028 if (0xD800 <= *q
&& *q
<= 0xDBFF) {
1029 /* This is valid data (a UTF-16 surrogate pair), but
1030 we are not able to store this information since our
1031 Py_UNICODE type only has 16 bits... this might
1032 change someday, even though it's unlikely. */
1033 errmsg
= "code pairs are not supported";
1039 errmsg
= "illegal encoding";
1040 /* Fall through to report the error */
1043 if (utf16_decoding_error(&q
, &p
, errors
, errmsg
))
1051 if (_PyUnicode_Resize(unicode
, p
- unicode
->str
))
1054 return (PyObject
*)unicode
;
1063 PyObject
*PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1072 /* We don't create UTF-16 pairs... */
1073 v
= PyString_FromStringAndSize(NULL
,
1074 sizeof(Py_UNICODE
) * (size
+ (byteorder
== 0)));
1078 q
= PyString_AS_STRING(v
);
1079 p
= (Py_UNICODE
*)q
;
1084 if (byteorder
== 0 ||
1085 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1091 memcpy(p
, s
, size
* sizeof(Py_UNICODE
));
1093 while (size
-- > 0) {
1094 Py_UNICODE ch
= *s
++;
1095 *p
++ = (ch
>> 8) | (ch
<< 8);
1100 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1102 if (!PyUnicode_Check(unicode
)) {
1103 PyErr_BadArgument();
1106 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1107 PyUnicode_GET_SIZE(unicode
),
1112 /* --- Unicode Escape Codec ----------------------------------------------- */
1115 int unicodeescape_decoding_error(const char **source
,
1118 const char *details
)
1120 if ((errors
== NULL
) ||
1121 (strcmp(errors
,"strict") == 0)) {
1122 PyErr_Format(PyExc_UnicodeError
,
1123 "Unicode-Escape decoding error: %.400s",
1127 else if (strcmp(errors
,"ignore") == 0) {
1130 else if (strcmp(errors
,"replace") == 0) {
1131 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1135 PyErr_Format(PyExc_ValueError
,
1136 "Unicode-Escape decoding error; "
1137 "unknown error handling code: %.400s",
1143 static _Py_UCNHashAPI
*pucnHash
= NULL
;
1146 int mystrnicmp(const char *s1
, const char *s2
, size_t count
)
1154 c1
= tolower(*(s1
++));
1155 c2
= tolower(*(s2
++));
1157 while(--count
&& c1
== c2
);
1165 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1170 Py_UNICODE
*p
= NULL
, *buf
= NULL
;
1173 /* Escaped strings are never shorter than the resulting
1174 Unicode string, so we start with size here and then reduce the
1175 length after conversion to the true value. */
1176 v
= _PyUnicode_New(size
);
1180 return (PyObject
*)v
;
1181 p
= buf
= PyUnicode_AS_UNICODE(v
);
1188 /* Non-escape characters are interpreted as Unicode ordinals */
1190 *p
++ = (unsigned char)*s
++;
1200 case '\\': *p
++ = '\\'; break;
1201 case '\'': *p
++ = '\''; break;
1202 case '\"': *p
++ = '\"'; break;
1203 case 'b': *p
++ = '\b'; break;
1204 case 'f': *p
++ = '\014'; break; /* FF */
1205 case 't': *p
++ = '\t'; break;
1206 case 'n': *p
++ = '\n'; break;
1207 case 'r': *p
++ = '\r'; break;
1208 case 'v': *p
++ = '\013'; break; /* VT */
1209 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1211 /* \OOO (octal) escapes */
1212 case '0': case '1': case '2': case '3':
1213 case '4': case '5': case '6': case '7':
1215 if ('0' <= *s
&& *s
<= '7') {
1216 x
= (x
<<3) + *s
++ - '0';
1217 if ('0' <= *s
&& *s
<= '7')
1218 x
= (x
<<3) + *s
++ - '0';
1223 /* \xXXXX escape with 1-n hex digits. for compatibility
1224 with 8-bit strings, this code ignores all but the last
1228 c
= (unsigned char)*s
;
1232 if ('0' <= c
&& c
<= '9')
1234 else if ('a' <= c
&& c
<= 'f')
1238 c
= (unsigned char)*++s
;
1239 } while (isxdigit(c
));
1240 *p
++ = (unsigned char) x
;
1243 *p
++ = (unsigned char)s
[-1];
1247 /* \uXXXX with 4 hex digits */
1249 for (x
= 0, i
= 0; i
< 4; i
++) {
1250 c
= (unsigned char)s
[i
];
1252 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1253 "truncated \\uXXXX"))
1259 if (c
>= '0' && c
<= '9')
1261 else if (c
>= 'a' && c
<= 'f')
1271 /* Ok, we need to deal with Unicode Character Names now,
1272 * make sure we've imported the hash table data...
1274 if (pucnHash
== NULL
)
1276 PyObject
*mod
= 0, *v
= 0;
1278 mod
= PyImport_ImportModule("ucnhash");
1281 v
= PyObject_GetAttrString(mod
,"ucnhashAPI");
1287 pucnHash
= PyCObject_AsVoidPtr(v
);
1289 if (pucnHash
== NULL
)
1297 const char *start
= s
+ 1;
1298 const char *endBrace
= start
;
1302 /* scan forward until we either find the closing brace or
1303 * exceed the maximum length of a Unicode character name
1305 while (*endBrace
!= '}' &&
1306 (unsigned int)(endBrace
- start
) <=
1312 if (endBrace
!= end
&& *endBrace
== '}')
1314 j
= pucnHash
->hash(start
, endBrace
- start
);
1315 if (j
> pucnHash
->cKeys
||
1318 ((_Py_UnicodeCharacterName
*)
1319 (pucnHash
->getValue(j
)))->pszUCN
,
1320 (int)(endBrace
- start
)) != 0)
1322 if (unicodeescape_decoding_error(
1324 "Invalid Unicode Character Name"))
1328 goto ucnFallthrough
;
1330 value
= ((_Py_UnicodeCharacterName
*)
1331 (pucnHash
->getValue(j
)))->value
;
1334 /* In UCS-2 range, easy solution.. */
1339 /* Oops, it's in UCS-4 space, */
1340 /* compute and append the two surrogates: */
1341 /* translate from 10000..10FFFF to 0..FFFFF */
1344 /* high surrogate = top 10 bits added to D800 */
1345 *p
++ = 0xD800 + (value
>> 10);
1347 /* low surrogate = bottom 10 bits added to DC00 */
1348 *p
++ = 0xDC00 + (value
& ~0xFC00);
1354 if (unicodeescape_decoding_error(
1356 "Unicode name missing closing brace"))
1358 goto ucnFallthrough
;
1362 if (unicodeescape_decoding_error(
1364 "Missing opening brace for Unicode Character Name escape"))
1367 /* fall through on purpose */
1370 *p
++ = (unsigned char)s
[-1];
1374 if (_PyUnicode_Resize(v
, (int)(p
- buf
)))
1376 return (PyObject
*)v
;
1383 /* Return a Unicode-Escape string version of the Unicode object.
1385 If quotes is true, the string is enclosed in u"" or u'' quotes as
1390 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1395 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1403 static const char *hexdigit
= "0123456789ABCDEF";
1405 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1409 p
= q
= PyString_AS_STRING(repr
);
1413 *p
++ = (findchar(s
, size
, '\'') &&
1414 !findchar(s
, size
, '"')) ? '"' : '\'';
1416 while (size
-- > 0) {
1417 Py_UNICODE ch
= *s
++;
1419 if (quotes
&& (ch
== q
[1] || ch
== '\\')) {
1423 /* Map 16-bit characters to '\uxxxx' */
1424 else if (ch
>= 256) {
1427 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1428 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1429 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1430 *p
++ = hexdigit
[ch
& 15];
1432 /* Map non-printable US ASCII to '\ooo' */
1433 else if (ch
< ' ' || ch
>= 128) {
1435 *p
++ = hexdigit
[(ch
>> 6) & 7];
1436 *p
++ = hexdigit
[(ch
>> 3) & 7];
1437 *p
++ = hexdigit
[ch
& 7];
1439 /* Copy everything else as-is */
1447 if (_PyString_Resize(&repr
, p
- q
))
1457 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1460 return unicodeescape_string(s
, size
, 0);
1463 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1465 if (!PyUnicode_Check(unicode
)) {
1466 PyErr_BadArgument();
1469 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1470 PyUnicode_GET_SIZE(unicode
));
1473 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1475 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1480 Py_UNICODE
*p
, *buf
;
1484 /* Escaped strings are never shorter than the resulting
1485 Unicode string, so we start with size here and then reduce the
1486 length after conversion to the true value. */
1487 v
= _PyUnicode_New(size
);
1491 return (PyObject
*)v
;
1492 p
= buf
= PyUnicode_AS_UNICODE(v
);
1499 /* Non-escape characters are interpreted as Unicode ordinals */
1501 *p
++ = (unsigned char)*s
++;
1505 /* \u-escapes are only interpreted iff the number of leading
1506 backslashes is odd */
1511 *p
++ = (unsigned char)*s
++;
1513 if (((s
- bs
) & 1) == 0 ||
1521 /* \uXXXX with 4 hex digits */
1522 for (x
= 0, i
= 0; i
< 4; i
++) {
1523 c
= (unsigned char)s
[i
];
1525 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1526 "truncated \\uXXXX"))
1532 if (c
>= '0' && c
<= '9')
1534 else if (c
>= 'a' && c
<= 'f')
1542 if (_PyUnicode_Resize(v
, (int)(p
- buf
)))
1544 return (PyObject
*)v
;
1551 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1558 static const char *hexdigit
= "0123456789ABCDEF";
1560 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1566 p
= q
= PyString_AS_STRING(repr
);
1567 while (size
-- > 0) {
1568 Py_UNICODE ch
= *s
++;
1569 /* Map 16-bit characters to '\uxxxx' */
1573 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1574 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1575 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1576 *p
++ = hexdigit
[ch
& 15];
1578 /* Copy everything else as-is */
1583 if (_PyString_Resize(&repr
, p
- q
))
1593 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
1595 if (!PyUnicode_Check(unicode
)) {
1596 PyErr_BadArgument();
1599 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1600 PyUnicode_GET_SIZE(unicode
));
1603 /* --- Latin-1 Codec ------------------------------------------------------ */
1605 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
1612 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1613 v
= _PyUnicode_New(size
);
1617 return (PyObject
*)v
;
1618 p
= PyUnicode_AS_UNICODE(v
);
1620 *p
++ = (unsigned char)*s
++;
1621 return (PyObject
*)v
;
1629 int latin1_encoding_error(const Py_UNICODE
**source
,
1632 const char *details
)
1634 if ((errors
== NULL
) ||
1635 (strcmp(errors
,"strict") == 0)) {
1636 PyErr_Format(PyExc_UnicodeError
,
1637 "Latin-1 encoding error: %.400s",
1641 else if (strcmp(errors
,"ignore") == 0) {
1644 else if (strcmp(errors
,"replace") == 0) {
1650 PyErr_Format(PyExc_ValueError
,
1651 "Latin-1 encoding error; "
1652 "unknown error handling code: %.400s",
1658 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
1665 repr
= PyString_FromStringAndSize(NULL
, size
);
1671 s
= PyString_AS_STRING(repr
);
1673 while (size
-- > 0) {
1674 Py_UNICODE ch
= *p
++;
1676 if (latin1_encoding_error(&p
, &s
, errors
,
1677 "ordinal not in range(256)"))
1683 /* Resize if error handling skipped some characters */
1684 if (s
- start
< PyString_GET_SIZE(repr
))
1685 if (_PyString_Resize(&repr
, s
- start
))
1694 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
1696 if (!PyUnicode_Check(unicode
)) {
1697 PyErr_BadArgument();
1700 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
1701 PyUnicode_GET_SIZE(unicode
),
1705 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1708 int ascii_decoding_error(const char **source
,
1711 const char *details
)
1713 if ((errors
== NULL
) ||
1714 (strcmp(errors
,"strict") == 0)) {
1715 PyErr_Format(PyExc_UnicodeError
,
1716 "ASCII decoding error: %.400s",
1720 else if (strcmp(errors
,"ignore") == 0) {
1723 else if (strcmp(errors
,"replace") == 0) {
1724 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1729 PyErr_Format(PyExc_ValueError
,
1730 "ASCII decoding error; "
1731 "unknown error handling code: %.400s",
1737 PyObject
*PyUnicode_DecodeASCII(const char *s
,
1744 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1745 v
= _PyUnicode_New(size
);
1749 return (PyObject
*)v
;
1750 p
= PyUnicode_AS_UNICODE(v
);
1751 while (size
-- > 0) {
1752 register unsigned char c
;
1754 c
= (unsigned char)*s
++;
1757 else if (ascii_decoding_error(&s
, &p
, errors
,
1758 "ordinal not in range(128)"))
1761 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
1762 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1764 return (PyObject
*)v
;
1772 int ascii_encoding_error(const Py_UNICODE
**source
,
1775 const char *details
)
1777 if ((errors
== NULL
) ||
1778 (strcmp(errors
,"strict") == 0)) {
1779 PyErr_Format(PyExc_UnicodeError
,
1780 "ASCII encoding error: %.400s",
1784 else if (strcmp(errors
,"ignore") == 0) {
1787 else if (strcmp(errors
,"replace") == 0) {
1793 PyErr_Format(PyExc_ValueError
,
1794 "ASCII encoding error; "
1795 "unknown error handling code: %.400s",
1801 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
1808 repr
= PyString_FromStringAndSize(NULL
, size
);
1814 s
= PyString_AS_STRING(repr
);
1816 while (size
-- > 0) {
1817 Py_UNICODE ch
= *p
++;
1819 if (ascii_encoding_error(&p
, &s
, errors
,
1820 "ordinal not in range(128)"))
1826 /* Resize if error handling skipped some characters */
1827 if (s
- start
< PyString_GET_SIZE(repr
))
1828 if (_PyString_Resize(&repr
, s
- start
))
1837 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
1839 if (!PyUnicode_Check(unicode
)) {
1840 PyErr_BadArgument();
1843 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
1844 PyUnicode_GET_SIZE(unicode
),
1850 /* --- MBCS codecs for Windows -------------------------------------------- */
1852 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
1859 /* First get the size of the result */
1860 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
1861 if (size
> 0 && usize
==0)
1862 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1864 v
= _PyUnicode_New(usize
);
1868 return (PyObject
*)v
;
1869 p
= PyUnicode_AS_UNICODE(v
);
1870 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
1872 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1875 return (PyObject
*)v
;
1878 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
1886 /* If there are no characters, bail now! */
1888 return PyString_FromString("");
1890 /* First get the size of the result */
1891 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
1893 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1895 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
1901 /* Do the conversion */
1902 s
= PyString_AS_STRING(repr
);
1903 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
1905 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1910 #endif /* MS_WIN32 */
1912 /* --- Character Mapping Codec -------------------------------------------- */
1915 int charmap_decoding_error(const char **source
,
1918 const char *details
)
1920 if ((errors
== NULL
) ||
1921 (strcmp(errors
,"strict") == 0)) {
1922 PyErr_Format(PyExc_UnicodeError
,
1923 "charmap decoding error: %.400s",
1927 else if (strcmp(errors
,"ignore") == 0) {
1930 else if (strcmp(errors
,"replace") == 0) {
1931 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1936 PyErr_Format(PyExc_ValueError
,
1937 "charmap decoding error; "
1938 "unknown error handling code: %.400s",
1944 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
1952 /* Default to Latin-1 */
1953 if (mapping
== NULL
)
1954 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1956 v
= _PyUnicode_New(size
);
1960 return (PyObject
*)v
;
1961 p
= PyUnicode_AS_UNICODE(v
);
1962 while (size
-- > 0) {
1963 unsigned char ch
= *s
++;
1966 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1967 w
= PyInt_FromLong((long)ch
);
1970 x
= PyObject_GetItem(mapping
, w
);
1973 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
1974 /* No mapping found: default to Latin-1 mapping */
1976 *p
++ = (Py_UNICODE
)ch
;
1983 if (PyInt_Check(x
)) {
1984 long value
= PyInt_AS_LONG(x
);
1985 if (value
< 0 || value
> 65535) {
1986 PyErr_SetString(PyExc_TypeError
,
1987 "character mapping must be in range(65536)");
1991 *p
++ = (Py_UNICODE
)value
;
1993 else if (x
== Py_None
) {
1994 /* undefined mapping */
1995 if (charmap_decoding_error(&s
, &p
, errors
,
1996 "character maps to <undefined>")) {
2001 else if (PyUnicode_Check(x
)) {
2002 if (PyUnicode_GET_SIZE(x
) != 1) {
2004 PyErr_SetString(PyExc_NotImplementedError
,
2005 "1-n mappings are currently not implemented");
2009 *p
++ = *PyUnicode_AS_UNICODE(x
);
2012 /* wrong return value */
2013 PyErr_SetString(PyExc_TypeError
,
2014 "character mapping must return integer, None or unicode");
2020 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2021 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2023 return (PyObject
*)v
;
2031 int charmap_encoding_error(const Py_UNICODE
**source
,
2034 const char *details
)
2036 if ((errors
== NULL
) ||
2037 (strcmp(errors
,"strict") == 0)) {
2038 PyErr_Format(PyExc_UnicodeError
,
2039 "charmap encoding error: %.400s",
2043 else if (strcmp(errors
,"ignore") == 0) {
2046 else if (strcmp(errors
,"replace") == 0) {
2052 PyErr_Format(PyExc_ValueError
,
2053 "charmap encoding error; "
2054 "unknown error handling code: %.400s",
2060 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2068 /* Default to Latin-1 */
2069 if (mapping
== NULL
)
2070 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2072 v
= PyString_FromStringAndSize(NULL
, size
);
2077 s
= PyString_AS_STRING(v
);
2078 while (size
-- > 0) {
2079 Py_UNICODE ch
= *p
++;
2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083 w
= PyInt_FromLong((long)ch
);
2086 x
= PyObject_GetItem(mapping
, w
);
2089 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2090 /* No mapping found: default to Latin-1 mapping if possible */
2096 else if (!charmap_encoding_error(&p
, &s
, errors
,
2097 "missing character mapping"))
2104 if (PyInt_Check(x
)) {
2105 long value
= PyInt_AS_LONG(x
);
2106 if (value
< 0 || value
> 255) {
2107 PyErr_SetString(PyExc_TypeError
,
2108 "character mapping must be in range(256)");
2114 else if (x
== Py_None
) {
2115 /* undefined mapping */
2116 if (charmap_encoding_error(&p
, &s
, errors
,
2117 "character maps to <undefined>")) {
2122 else if (PyString_Check(x
)) {
2123 if (PyString_GET_SIZE(x
) != 1) {
2125 PyErr_SetString(PyExc_NotImplementedError
,
2126 "1-n mappings are currently not implemented");
2130 *s
++ = *PyString_AS_STRING(x
);
2133 /* wrong return value */
2134 PyErr_SetString(PyExc_TypeError
,
2135 "character mapping must return integer, None or unicode");
2141 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2142 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2151 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2154 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2155 PyErr_BadArgument();
2158 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2159 PyUnicode_GET_SIZE(unicode
),
2165 int translate_error(const Py_UNICODE
**source
,
2168 const char *details
)
2170 if ((errors
== NULL
) ||
2171 (strcmp(errors
,"strict") == 0)) {
2172 PyErr_Format(PyExc_UnicodeError
,
2173 "translate error: %.400s",
2177 else if (strcmp(errors
,"ignore") == 0) {
2180 else if (strcmp(errors
,"replace") == 0) {
2186 PyErr_Format(PyExc_ValueError
,
2188 "unknown error handling code: %.400s",
2194 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2202 if (mapping
== NULL
) {
2203 PyErr_BadArgument();
2207 /* Output will never be longer than input */
2208 v
= _PyUnicode_New(size
);
2213 p
= PyUnicode_AS_UNICODE(v
);
2214 while (size
-- > 0) {
2215 Py_UNICODE ch
= *s
++;
2219 w
= PyInt_FromLong(ch
);
2222 x
= PyObject_GetItem(mapping
, w
);
2225 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2226 /* No mapping found: default to 1-1 mapping */
2236 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2237 else if (x
== Py_None
) {
2238 /* undefined mapping */
2239 if (translate_error(&s
, &p
, errors
,
2240 "character maps to <undefined>")) {
2245 else if (PyUnicode_Check(x
)) {
2246 if (PyUnicode_GET_SIZE(x
) != 1) {
2248 PyErr_SetString(PyExc_NotImplementedError
,
2249 "1-n mappings are currently not implemented");
2253 *p
++ = *PyUnicode_AS_UNICODE(x
);
2256 /* wrong return value */
2257 PyErr_SetString(PyExc_TypeError
,
2258 "translate mapping must return integer, None or unicode");
2264 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2265 if (_PyUnicode_Resize(v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2269 return (PyObject
*)v
;
2276 PyObject
*PyUnicode_Translate(PyObject
*str
,
2282 str
= PyUnicode_FromObject(str
);
2285 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2286 PyUnicode_GET_SIZE(str
),
2297 /* --- Decimal Encoder ---------------------------------------------------- */
2299 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2304 Py_UNICODE
*p
, *end
;
2306 if (output
== NULL
) {
2307 PyErr_BadArgument();
2314 register Py_UNICODE ch
= *p
++;
2317 if (Py_UNICODE_ISSPACE(ch
)) {
2321 decimal
= Py_UNICODE_TODECIMAL(ch
);
2323 *output
++ = '0' + decimal
;
2326 if (0 < ch
&& ch
< 256) {
2327 *output
++ = (char)ch
;
2330 /* All other characters are considered invalid */
2331 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2332 PyErr_SetString(PyExc_ValueError
,
2333 "invalid decimal Unicode string");
2336 else if (strcmp(errors
, "ignore") == 0)
2338 else if (strcmp(errors
, "replace") == 0) {
2343 /* 0-terminate the output string */
2351 /* --- Helpers ------------------------------------------------------------ */
2354 int count(PyUnicodeObject
*self
,
2357 PyUnicodeObject
*substring
)
2361 if (substring
->length
== 0)
2362 return (end
- start
+ 1);
2364 end
-= substring
->length
;
2366 while (start
<= end
)
2367 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2369 start
+= substring
->length
;
2376 int PyUnicode_Count(PyObject
*str
,
2383 str
= PyUnicode_FromObject(str
);
2386 substr
= PyUnicode_FromObject(substr
);
2387 if (substr
== NULL
) {
2392 result
= count((PyUnicodeObject
*)str
,
2394 (PyUnicodeObject
*)substr
);
2402 int findstring(PyUnicodeObject
*self
,
2403 PyUnicodeObject
*substring
,
2409 start
+= self
->length
;
2413 if (substring
->length
== 0)
2416 if (end
> self
->length
)
2419 end
+= self
->length
;
2423 end
-= substring
->length
;
2425 if (direction
< 0) {
2426 for (; end
>= start
; end
--)
2427 if (Py_UNICODE_MATCH(self
, end
, substring
))
2430 for (; start
<= end
; start
++)
2431 if (Py_UNICODE_MATCH(self
, start
, substring
))
2438 int PyUnicode_Find(PyObject
*str
,
2446 str
= PyUnicode_FromObject(str
);
2449 substr
= PyUnicode_FromObject(substr
);
2450 if (substr
== NULL
) {
2455 result
= findstring((PyUnicodeObject
*)str
,
2456 (PyUnicodeObject
*)substr
,
2457 start
, end
, direction
);
2464 int tailmatch(PyUnicodeObject
*self
,
2465 PyUnicodeObject
*substring
,
2471 start
+= self
->length
;
2475 if (substring
->length
== 0)
2478 if (end
> self
->length
)
2481 end
+= self
->length
;
2485 end
-= substring
->length
;
2489 if (direction
> 0) {
2490 if (Py_UNICODE_MATCH(self
, end
, substring
))
2493 if (Py_UNICODE_MATCH(self
, start
, substring
))
2500 int PyUnicode_Tailmatch(PyObject
*str
,
2508 str
= PyUnicode_FromObject(str
);
2511 substr
= PyUnicode_FromObject(substr
);
2512 if (substr
== NULL
) {
2517 result
= tailmatch((PyUnicodeObject
*)str
,
2518 (PyUnicodeObject
*)substr
,
2519 start
, end
, direction
);
2526 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2530 /* like wcschr, but doesn't stop at NULL characters */
2532 while (size
-- > 0) {
2541 /* Apply fixfct filter to the Unicode object self and return a
2542 reference to the modified object */
2545 PyObject
*fixup(PyUnicodeObject
*self
,
2546 int (*fixfct
)(PyUnicodeObject
*s
))
2551 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(self
->str
,
2556 /* fixfct should return TRUE if it modified the buffer. If
2557 FALSE, return a reference to the original buffer instead
2558 (to save space, not time) */
2561 return (PyObject
*) self
;
2563 return (PyObject
*) u
;
2567 int fixupper(PyUnicodeObject
*self
)
2569 int len
= self
->length
;
2570 Py_UNICODE
*s
= self
->str
;
2574 register Py_UNICODE ch
;
2576 ch
= Py_UNICODE_TOUPPER(*s
);
2588 int fixlower(PyUnicodeObject
*self
)
2590 int len
= self
->length
;
2591 Py_UNICODE
*s
= self
->str
;
2595 register Py_UNICODE ch
;
2597 ch
= Py_UNICODE_TOLOWER(*s
);
2609 int fixswapcase(PyUnicodeObject
*self
)
2611 int len
= self
->length
;
2612 Py_UNICODE
*s
= self
->str
;
2616 if (Py_UNICODE_ISUPPER(*s
)) {
2617 *s
= Py_UNICODE_TOLOWER(*s
);
2619 } else if (Py_UNICODE_ISLOWER(*s
)) {
2620 *s
= Py_UNICODE_TOUPPER(*s
);
2630 int fixcapitalize(PyUnicodeObject
*self
)
2632 if (self
->length
> 0 && Py_UNICODE_ISLOWER(self
->str
[0])) {
2633 self
->str
[0] = Py_UNICODE_TOUPPER(self
->str
[0]);
2640 int fixtitle(PyUnicodeObject
*self
)
2642 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
2643 register Py_UNICODE
*e
;
2644 int previous_is_cased
;
2646 /* Shortcut for single character strings */
2647 if (PyUnicode_GET_SIZE(self
) == 1) {
2648 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
2657 e
= p
+ PyUnicode_GET_SIZE(self
);
2658 previous_is_cased
= 0;
2659 for (; p
< e
; p
++) {
2660 register const Py_UNICODE ch
= *p
;
2662 if (previous_is_cased
)
2663 *p
= Py_UNICODE_TOLOWER(ch
);
2665 *p
= Py_UNICODE_TOTITLE(ch
);
2667 if (Py_UNICODE_ISLOWER(ch
) ||
2668 Py_UNICODE_ISUPPER(ch
) ||
2669 Py_UNICODE_ISTITLE(ch
))
2670 previous_is_cased
= 1;
2672 previous_is_cased
= 0;
2677 PyObject
*PyUnicode_Join(PyObject
*separator
,
2682 PyUnicodeObject
*res
= NULL
;
2689 seqlen
= PySequence_Size(seq
);
2690 if (seqlen
< 0 && PyErr_Occurred())
2693 if (separator
== NULL
) {
2694 Py_UNICODE blank
= ' ';
2699 separator
= PyUnicode_FromObject(separator
);
2700 if (separator
== NULL
)
2702 sep
= PyUnicode_AS_UNICODE(separator
);
2703 seplen
= PyUnicode_GET_SIZE(separator
);
2706 res
= _PyUnicode_New(sz
);
2709 p
= PyUnicode_AS_UNICODE(res
);
2712 for (i
= 0; i
< seqlen
; i
++) {
2716 item
= PySequence_GetItem(seq
, i
);
2719 if (!PyUnicode_Check(item
)) {
2721 v
= PyUnicode_FromObject(item
);
2727 itemlen
= PyUnicode_GET_SIZE(item
);
2728 while (reslen
+ itemlen
+ seplen
>= sz
) {
2729 if (_PyUnicode_Resize(res
, sz
*2))
2732 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
2735 memcpy(p
, sep
, seplen
* sizeof(Py_UNICODE
));
2739 memcpy(p
, PyUnicode_AS_UNICODE(item
), itemlen
* sizeof(Py_UNICODE
));
2744 if (_PyUnicode_Resize(res
, reslen
))
2747 Py_XDECREF(separator
);
2748 return (PyObject
*)res
;
2751 Py_XDECREF(separator
);
2757 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
2769 if (left
== 0 && right
== 0) {
2774 u
= _PyUnicode_New(left
+ self
->length
+ right
);
2777 Py_UNICODE_FILL(u
->str
, fill
, left
);
2778 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
2780 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
2786 #define SPLIT_APPEND(data, left, right) \
2787 str = PyUnicode_FromUnicode(data + left, right - left); \
2790 if (PyList_Append(list, str)) { \
2798 PyObject
*split_whitespace(PyUnicodeObject
*self
,
2804 int len
= self
->length
;
2807 for (i
= j
= 0; i
< len
; ) {
2809 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2812 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
2815 if (maxcount
-- <= 0)
2817 SPLIT_APPEND(self
->str
, j
, i
);
2818 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2824 SPLIT_APPEND(self
->str
, j
, len
);
2833 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
2843 string
= PyUnicode_FromObject(string
);
2846 data
= PyUnicode_AS_UNICODE(string
);
2847 len
= PyUnicode_GET_SIZE(string
);
2849 list
= PyList_New(0);
2853 for (i
= j
= 0; i
< len
; ) {
2856 /* Find a line and append it */
2857 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
2860 /* Skip the line break reading CRLF as one line break */
2863 if (data
[i
] == '\r' && i
+ 1 < len
&&
2871 SPLIT_APPEND(data
, j
, eol
);
2875 SPLIT_APPEND(data
, j
, len
);
2888 PyObject
*split_char(PyUnicodeObject
*self
,
2895 int len
= self
->length
;
2898 for (i
= j
= 0; i
< len
; ) {
2899 if (self
->str
[i
] == ch
) {
2900 if (maxcount
-- <= 0)
2902 SPLIT_APPEND(self
->str
, j
, i
);
2908 SPLIT_APPEND(self
->str
, j
, len
);
2918 PyObject
*split_substring(PyUnicodeObject
*self
,
2920 PyUnicodeObject
*substring
,
2925 int len
= self
->length
;
2926 int sublen
= substring
->length
;
2929 for (i
= j
= 0; i
< len
- sublen
; ) {
2930 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
2931 if (maxcount
-- <= 0)
2933 SPLIT_APPEND(self
->str
, j
, i
);
2939 SPLIT_APPEND(self
->str
, j
, len
);
2951 PyObject
*split(PyUnicodeObject
*self
,
2952 PyUnicodeObject
*substring
,
2960 list
= PyList_New(0);
2964 if (substring
== NULL
)
2965 return split_whitespace(self
,list
,maxcount
);
2967 else if (substring
->length
== 1)
2968 return split_char(self
,list
,substring
->str
[0],maxcount
);
2970 else if (substring
->length
== 0) {
2972 PyErr_SetString(PyExc_ValueError
, "empty separator");
2976 return split_substring(self
,list
,substring
,maxcount
);
2980 PyObject
*strip(PyUnicodeObject
*self
,
2984 Py_UNICODE
*p
= self
->str
;
2986 int end
= self
->length
;
2989 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
2993 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
2996 if (start
== 0 && end
== self
->length
) {
2997 /* couldn't strip anything off, return original string */
2999 return (PyObject
*) self
;
3002 return (PyObject
*) PyUnicode_FromUnicode(
3009 PyObject
*replace(PyUnicodeObject
*self
,
3010 PyUnicodeObject
*str1
,
3011 PyUnicodeObject
*str2
,
3019 if (str1
->length
== 1 && str2
->length
== 1) {
3022 /* replace characters */
3023 if (!findchar(self
->str
, self
->length
, str1
->str
[0])) {
3024 /* nothing to replace, return original string */
3028 Py_UNICODE u1
= str1
->str
[0];
3029 Py_UNICODE u2
= str2
->str
[0];
3031 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3036 for (i
= 0; i
< u
->length
; i
++)
3037 if (u
->str
[i
] == u1
) {
3048 /* replace strings */
3049 n
= count(self
, 0, self
->length
, str1
);
3053 /* nothing to replace, return original string */
3058 self
->length
+ n
* (str2
->length
- str1
->length
));
3062 while (i
<= self
->length
- str1
->length
)
3063 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3064 /* replace string segment */
3065 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3069 /* copy remaining part */
3070 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3074 *p
++ = self
->str
[i
++];
3079 return (PyObject
*) u
;
3082 /* --- Unicode Object Methods --------------------------------------------- */
3084 static char title__doc__
[] =
3085 "S.title() -> unicode\n\
3087 Return a titlecased version of S, i.e. words start with title case\n\
3088 characters, all remaining cased characters have lower case.";
3091 unicode_title(PyUnicodeObject
*self
, PyObject
*args
)
3093 if (!PyArg_NoArgs(args
))
3095 return fixup(self
, fixtitle
);
3098 static char capitalize__doc__
[] =
3099 "S.capitalize() -> unicode\n\
3101 Return a capitalized version of S, i.e. make the first character\n\
3105 unicode_capitalize(PyUnicodeObject
*self
, PyObject
*args
)
3107 if (!PyArg_NoArgs(args
))
3109 return fixup(self
, fixcapitalize
);
3113 static char capwords__doc__
[] =
3114 "S.capwords() -> unicode\n\
3116 Apply .capitalize() to all words in S and return the result with\n\
3117 normalized whitespace (all whitespace strings are replaced by ' ').";
3120 unicode_capwords(PyUnicodeObject
*self
, PyObject
*args
)
3126 if (!PyArg_NoArgs(args
))
3129 /* Split into words */
3130 list
= split(self
, NULL
, -1);
3134 /* Capitalize each word */
3135 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3136 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3140 Py_DECREF(PyList_GET_ITEM(list
, i
));
3141 PyList_SET_ITEM(list
, i
, item
);
3144 /* Join the words to form a new string */
3145 item
= PyUnicode_Join(NULL
, list
);
3149 return (PyObject
*)item
;
3153 static char center__doc__
[] =
3154 "S.center(width) -> unicode\n\
3156 Return S centered in a Unicode string of length width. Padding is done\n\
3160 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3165 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3168 if (self
->length
>= width
) {
3170 return (PyObject
*) self
;
3173 marg
= width
- self
->length
;
3174 left
= marg
/ 2 + (marg
& width
& 1);
3176 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3181 /* This code should go into some future Unicode collation support
3182 module. The basic comparison should compare ordinals on a naive
3183 basis (this is what Java does and thus JPython too). */
3185 /* speedy UTF-16 code point order comparison */
3187 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3189 static short utf16Fixup
[32] =
3191 0, 0, 0, 0, 0, 0, 0, 0,
3192 0, 0, 0, 0, 0, 0, 0, 0,
3193 0, 0, 0, 0, 0, 0, 0, 0,
3194 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3198 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3202 Py_UNICODE
*s1
= str1
->str
;
3203 Py_UNICODE
*s2
= str2
->str
;
3205 len1
= str1
->length
;
3206 len2
= str2
->length
;
3208 while (len1
> 0 && len2
> 0) {
3214 if (c1
> (1<<11) * 26)
3215 c1
+= utf16Fixup
[c1
>>11];
3216 if (c2
> (1<<11) * 26)
3217 c2
+= utf16Fixup
[c2
>>11];
3219 /* now c1 and c2 are in UTF-32-compatible order */
3220 diff
= (long)c1
- (long)c2
;
3222 return (diff
< 0) ? -1 : (diff
!= 0);
3226 return (len1
< len2
) ? -1 : (len1
!= len2
);
3232 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3234 register int len1
, len2
;
3236 Py_UNICODE
*s1
= str1
->str
;
3237 Py_UNICODE
*s2
= str2
->str
;
3239 len1
= str1
->length
;
3240 len2
= str2
->length
;
3242 while (len1
> 0 && len2
> 0) {
3245 diff
= (long)*s1
++ - (long)*s2
++;
3247 return (diff
< 0) ? -1 : (diff
!= 0);
3251 return (len1
< len2
) ? -1 : (len1
!= len2
);
3256 int PyUnicode_Compare(PyObject
*left
,
3259 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3262 /* Coerce the two arguments */
3263 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3266 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3270 /* Shortcut for empty or interned objects */
3277 result
= unicode_compare(u
, v
);
3289 int PyUnicode_Contains(PyObject
*container
,
3292 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3294 register const Py_UNICODE
*p
, *e
;
3295 register Py_UNICODE ch
;
3297 /* Coerce the two arguments */
3298 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3300 PyErr_SetString(PyExc_TypeError
,
3301 "'in <string>' requires character as left operand");
3304 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3311 if (PyUnicode_GET_SIZE(v
) != 1) {
3312 PyErr_SetString(PyExc_TypeError
,
3313 "'in <string>' requires character as left operand");
3316 ch
= *PyUnicode_AS_UNICODE(v
);
3317 p
= PyUnicode_AS_UNICODE(u
);
3318 e
= p
+ PyUnicode_GET_SIZE(u
);
3337 /* Concat to string or Unicode object giving a new Unicode object. */
3339 PyObject
*PyUnicode_Concat(PyObject
*left
,
3342 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3344 /* Coerce the two arguments */
3345 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3348 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3353 if (v
== unicode_empty
) {
3355 return (PyObject
*)u
;
3357 if (u
== unicode_empty
) {
3359 return (PyObject
*)v
;
3362 /* Concat the two Unicode strings */
3363 w
= _PyUnicode_New(u
->length
+ v
->length
);
3366 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3367 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3371 return (PyObject
*)w
;
3379 static char count__doc__
[] =
3380 "S.count(sub[, start[, end]]) -> int\n\
3382 Return the number of occurrences of substring sub in Unicode string\n\
3383 S[start:end]. Optional arguments start and end are\n\
3384 interpreted as in slice notation.";
3387 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3389 PyUnicodeObject
*substring
;
3394 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3395 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3398 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3399 (PyObject
*)substring
);
3400 if (substring
== NULL
)
3404 start
+= self
->length
;
3407 if (end
> self
->length
)
3410 end
+= self
->length
;
3414 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3416 Py_DECREF(substring
);
3420 static char encode__doc__
[] =
3421 "S.encode([encoding[,errors]]) -> string\n\
3423 Return an encoded string version of S. Default encoding is the current\n\
3424 default string encoding. errors may be given to set a different error\n\
3425 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3426 a ValueError. Other possible values are 'ignore' and 'replace'.";
3429 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3431 char *encoding
= NULL
;
3432 char *errors
= NULL
;
3433 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3435 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3438 static char expandtabs__doc__
[] =
3439 "S.expandtabs([tabsize]) -> unicode\n\
3441 Return a copy of S where all tab characters are expanded using spaces.\n\
3442 If tabsize is not given, a tab size of 8 characters is assumed.";
3445 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3454 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3457 /* First pass: determine size of output string */
3459 e
= self
->str
+ self
->length
;
3460 for (p
= self
->str
; p
< e
; p
++)
3463 j
+= tabsize
- (j
% tabsize
);
3467 if (*p
== '\n' || *p
== '\r') {
3473 /* Second pass: create output string and fill it */
3474 u
= _PyUnicode_New(i
+ j
);
3481 for (p
= self
->str
; p
< e
; p
++)
3484 i
= tabsize
- (j
% tabsize
);
3493 if (*p
== '\n' || *p
== '\r')
3497 return (PyObject
*) u
;
3500 static char find__doc__
[] =
3501 "S.find(sub [,start [,end]]) -> int\n\
3503 Return the lowest index in S where substring sub is found,\n\
3504 such that sub is contained within s[start,end]. Optional\n\
3505 arguments start and end are interpreted as in slice notation.\n\
3507 Return -1 on failure.";
3510 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
3512 PyUnicodeObject
*substring
;
3517 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
3518 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3520 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3521 (PyObject
*)substring
);
3522 if (substring
== NULL
)
3525 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
3527 Py_DECREF(substring
);
3532 unicode_getitem(PyUnicodeObject
*self
, int index
)
3534 if (index
< 0 || index
>= self
->length
) {
3535 PyErr_SetString(PyExc_IndexError
, "string index out of range");
3539 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
3543 unicode_hash(PyUnicodeObject
*self
)
3545 /* Since Unicode objects compare equal to their ASCII string
3546 counterparts, they should use the individual character values
3547 as basis for their hash value. This is needed to assure that
3548 strings and Unicode objects behave in the same way as
3552 register Py_UNICODE
*p
;
3555 if (self
->hash
!= -1)
3557 len
= PyUnicode_GET_SIZE(self
);
3558 p
= PyUnicode_AS_UNICODE(self
);
3561 x
= (1000003*x
) ^ *p
++;
3562 x
^= PyUnicode_GET_SIZE(self
);
3569 static char index__doc__
[] =
3570 "S.index(sub [,start [,end]]) -> int\n\
3572 Like S.find() but raise ValueError when the substring is not found.";
3575 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
3578 PyUnicodeObject
*substring
;
3582 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
3583 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3586 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3587 (PyObject
*)substring
);
3588 if (substring
== NULL
)
3591 result
= findstring(self
, substring
, start
, end
, 1);
3593 Py_DECREF(substring
);
3595 PyErr_SetString(PyExc_ValueError
, "substring not found");
3598 return PyInt_FromLong(result
);
3601 static char islower__doc__
[] =
3602 "S.islower() -> int\n\
3604 Return 1 if all cased characters in S are lowercase and there is\n\
3605 at least one cased character in S, 0 otherwise.";
3608 unicode_islower(PyUnicodeObject
*self
, PyObject
*args
)
3610 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3611 register const Py_UNICODE
*e
;
3614 if (!PyArg_NoArgs(args
))
3617 /* Shortcut for single character strings */
3618 if (PyUnicode_GET_SIZE(self
) == 1)
3619 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
3621 /* Special case for empty strings */
3622 if (PyString_GET_SIZE(self
) == 0)
3623 return PyInt_FromLong(0);
3625 e
= p
+ PyUnicode_GET_SIZE(self
);
3627 for (; p
< e
; p
++) {
3628 register const Py_UNICODE ch
= *p
;
3630 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
3631 return PyInt_FromLong(0);
3632 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
3635 return PyInt_FromLong(cased
);
3638 static char isupper__doc__
[] =
3639 "S.isupper() -> int\n\
3641 Return 1 if all cased characters in S are uppercase and there is\n\
3642 at least one cased character in S, 0 otherwise.";
3645 unicode_isupper(PyUnicodeObject
*self
, PyObject
*args
)
3647 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3648 register const Py_UNICODE
*e
;
3651 if (!PyArg_NoArgs(args
))
3654 /* Shortcut for single character strings */
3655 if (PyUnicode_GET_SIZE(self
) == 1)
3656 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
3658 /* Special case for empty strings */
3659 if (PyString_GET_SIZE(self
) == 0)
3660 return PyInt_FromLong(0);
3662 e
= p
+ PyUnicode_GET_SIZE(self
);
3664 for (; p
< e
; p
++) {
3665 register const Py_UNICODE ch
= *p
;
3667 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
3668 return PyInt_FromLong(0);
3669 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
3672 return PyInt_FromLong(cased
);
3675 static char istitle__doc__
[] =
3676 "S.istitle() -> int\n\
3678 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3679 may only follow uncased characters and lowercase characters only cased\n\
3680 ones. Return 0 otherwise.";
3683 unicode_istitle(PyUnicodeObject
*self
, PyObject
*args
)
3685 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3686 register const Py_UNICODE
*e
;
3687 int cased
, previous_is_cased
;
3689 if (!PyArg_NoArgs(args
))
3692 /* Shortcut for single character strings */
3693 if (PyUnicode_GET_SIZE(self
) == 1)
3694 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
3695 (Py_UNICODE_ISUPPER(*p
) != 0));
3697 /* Special case for empty strings */
3698 if (PyString_GET_SIZE(self
) == 0)
3699 return PyInt_FromLong(0);
3701 e
= p
+ PyUnicode_GET_SIZE(self
);
3703 previous_is_cased
= 0;
3704 for (; p
< e
; p
++) {
3705 register const Py_UNICODE ch
= *p
;
3707 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
3708 if (previous_is_cased
)
3709 return PyInt_FromLong(0);
3710 previous_is_cased
= 1;
3713 else if (Py_UNICODE_ISLOWER(ch
)) {
3714 if (!previous_is_cased
)
3715 return PyInt_FromLong(0);
3716 previous_is_cased
= 1;
3720 previous_is_cased
= 0;
3722 return PyInt_FromLong(cased
);
3725 static char isspace__doc__
[] =
3726 "S.isspace() -> int\n\
3728 Return 1 if there are only whitespace characters in S,\n\
3732 unicode_isspace(PyUnicodeObject
*self
, PyObject
*args
)
3734 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3735 register const Py_UNICODE
*e
;
3737 if (!PyArg_NoArgs(args
))
3740 /* Shortcut for single character strings */
3741 if (PyUnicode_GET_SIZE(self
) == 1 &&
3742 Py_UNICODE_ISSPACE(*p
))
3743 return PyInt_FromLong(1);
3745 /* Special case for empty strings */
3746 if (PyString_GET_SIZE(self
) == 0)
3747 return PyInt_FromLong(0);
3749 e
= p
+ PyUnicode_GET_SIZE(self
);
3750 for (; p
< e
; p
++) {
3751 if (!Py_UNICODE_ISSPACE(*p
))
3752 return PyInt_FromLong(0);
3754 return PyInt_FromLong(1);
3757 static char isalpha__doc__
[] =
3758 "S.isalpha() -> int\n\
3760 Return 1 if all characters in S are alphabetic\n\
3761 and there is at least one character in S, 0 otherwise.";
3764 unicode_isalpha(PyUnicodeObject
*self
, PyObject
*args
)
3766 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3767 register const Py_UNICODE
*e
;
3769 if (!PyArg_NoArgs(args
))
3772 /* Shortcut for single character strings */
3773 if (PyUnicode_GET_SIZE(self
) == 1 &&
3774 Py_UNICODE_ISALPHA(*p
))
3775 return PyInt_FromLong(1);
3777 /* Special case for empty strings */
3778 if (PyString_GET_SIZE(self
) == 0)
3779 return PyInt_FromLong(0);
3781 e
= p
+ PyUnicode_GET_SIZE(self
);
3782 for (; p
< e
; p
++) {
3783 if (!Py_UNICODE_ISALPHA(*p
))
3784 return PyInt_FromLong(0);
3786 return PyInt_FromLong(1);
3789 static char isalnum__doc__
[] =
3790 "S.isalnum() -> int\n\
3792 Return 1 if all characters in S are alphanumeric\n\
3793 and there is at least one character in S, 0 otherwise.";
3796 unicode_isalnum(PyUnicodeObject
*self
, PyObject
*args
)
3798 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3799 register const Py_UNICODE
*e
;
3801 if (!PyArg_NoArgs(args
))
3804 /* Shortcut for single character strings */
3805 if (PyUnicode_GET_SIZE(self
) == 1 &&
3806 Py_UNICODE_ISALNUM(*p
))
3807 return PyInt_FromLong(1);
3809 /* Special case for empty strings */
3810 if (PyString_GET_SIZE(self
) == 0)
3811 return PyInt_FromLong(0);
3813 e
= p
+ PyUnicode_GET_SIZE(self
);
3814 for (; p
< e
; p
++) {
3815 if (!Py_UNICODE_ISALNUM(*p
))
3816 return PyInt_FromLong(0);
3818 return PyInt_FromLong(1);
3821 static char isdecimal__doc__
[] =
3822 "S.isdecimal() -> int\n\
3824 Return 1 if there are only decimal characters in S,\n\
3828 unicode_isdecimal(PyUnicodeObject
*self
, PyObject
*args
)
3830 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3831 register const Py_UNICODE
*e
;
3833 if (!PyArg_NoArgs(args
))
3836 /* Shortcut for single character strings */
3837 if (PyUnicode_GET_SIZE(self
) == 1 &&
3838 Py_UNICODE_ISDECIMAL(*p
))
3839 return PyInt_FromLong(1);
3841 /* Special case for empty strings */
3842 if (PyString_GET_SIZE(self
) == 0)
3843 return PyInt_FromLong(0);
3845 e
= p
+ PyUnicode_GET_SIZE(self
);
3846 for (; p
< e
; p
++) {
3847 if (!Py_UNICODE_ISDECIMAL(*p
))
3848 return PyInt_FromLong(0);
3850 return PyInt_FromLong(1);
3853 static char isdigit__doc__
[] =
3854 "S.isdigit() -> int\n\
3856 Return 1 if there are only digit characters in S,\n\
3860 unicode_isdigit(PyUnicodeObject
*self
, PyObject
*args
)
3862 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3863 register const Py_UNICODE
*e
;
3865 if (!PyArg_NoArgs(args
))
3868 /* Shortcut for single character strings */
3869 if (PyUnicode_GET_SIZE(self
) == 1 &&
3870 Py_UNICODE_ISDIGIT(*p
))
3871 return PyInt_FromLong(1);
3873 /* Special case for empty strings */
3874 if (PyString_GET_SIZE(self
) == 0)
3875 return PyInt_FromLong(0);
3877 e
= p
+ PyUnicode_GET_SIZE(self
);
3878 for (; p
< e
; p
++) {
3879 if (!Py_UNICODE_ISDIGIT(*p
))
3880 return PyInt_FromLong(0);
3882 return PyInt_FromLong(1);
3885 static char isnumeric__doc__
[] =
3886 "S.isnumeric() -> int\n\
3888 Return 1 if there are only numeric characters in S,\n\
3892 unicode_isnumeric(PyUnicodeObject
*self
, PyObject
*args
)
3894 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3895 register const Py_UNICODE
*e
;
3897 if (!PyArg_NoArgs(args
))
3900 /* Shortcut for single character strings */
3901 if (PyUnicode_GET_SIZE(self
) == 1 &&
3902 Py_UNICODE_ISNUMERIC(*p
))
3903 return PyInt_FromLong(1);
3905 /* Special case for empty strings */
3906 if (PyString_GET_SIZE(self
) == 0)
3907 return PyInt_FromLong(0);
3909 e
= p
+ PyUnicode_GET_SIZE(self
);
3910 for (; p
< e
; p
++) {
3911 if (!Py_UNICODE_ISNUMERIC(*p
))
3912 return PyInt_FromLong(0);
3914 return PyInt_FromLong(1);
3917 static char join__doc__
[] =
3918 "S.join(sequence) -> unicode\n\
3920 Return a string which is the concatenation of the strings in the\n\
3921 sequence. The separator between elements is S.";
3924 unicode_join(PyUnicodeObject
*self
, PyObject
*args
)
3927 if (!PyArg_ParseTuple(args
, "O:join", &data
))
3930 return PyUnicode_Join((PyObject
*)self
, data
);
3934 unicode_length(PyUnicodeObject
*self
)
3936 return self
->length
;
3939 static char ljust__doc__
[] =
3940 "S.ljust(width) -> unicode\n\
3942 Return S left justified in a Unicode string of length width. Padding is\n\
3943 done using spaces.";
3946 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
3949 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
3952 if (self
->length
>= width
) {
3954 return (PyObject
*) self
;
3957 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
3960 static char lower__doc__
[] =
3961 "S.lower() -> unicode\n\
3963 Return a copy of the string S converted to lowercase.";
3966 unicode_lower(PyUnicodeObject
*self
, PyObject
*args
)
3968 if (!PyArg_NoArgs(args
))
3970 return fixup(self
, fixlower
);
3973 static char lstrip__doc__
[] =
3974 "S.lstrip() -> unicode\n\
3976 Return a copy of the string S with leading whitespace removed.";
3979 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
3981 if (!PyArg_NoArgs(args
))
3983 return strip(self
, 1, 0);
3987 unicode_repeat(PyUnicodeObject
*str
, int len
)
3996 /* no repeat, return original string */
3998 return (PyObject
*) str
;
4001 u
= _PyUnicode_New(len
* str
->length
);
4008 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4012 return (PyObject
*) u
;
4015 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4025 self
= PyUnicode_FromObject(obj
);
4028 str1
= PyUnicode_FromObject(subobj
);
4033 str2
= PyUnicode_FromObject(replobj
);
4039 result
= replace((PyUnicodeObject
*)self
,
4040 (PyUnicodeObject
*)str1
,
4041 (PyUnicodeObject
*)str2
,
4049 static char replace__doc__
[] =
4050 "S.replace (old, new[, maxsplit]) -> unicode\n\
4052 Return a copy of S with all occurrences of substring\n\
4053 old replaced by new. If the optional argument maxsplit is\n\
4054 given, only the first maxsplit occurrences are replaced.";
4057 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4059 PyUnicodeObject
*str1
;
4060 PyUnicodeObject
*str2
;
4064 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4066 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4069 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4073 result
= replace(self
, str1
, str2
, maxcount
);
/* tp_repr slot: builds the representation by passing the object's
   Py_UNICODE buffer and its size to unicodeescape_string().
   NOTE(review): the remaining argument(s) of that call are not visible
   in this part of the file -- confirm before relying on this comment. */
4081 PyObject
*unicode_repr(PyObject
*unicode
)
4083 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4084 PyUnicode_GET_SIZE(unicode
),
4088 static char rfind__doc__
[] =
4089 "S.rfind(sub [,start [,end]]) -> int\n\
4091 Return the highest index in S where substring sub is found,\n\
4092 such that sub is contained within s[start,end]. Optional\n\
4093 arguments start and end are interpreted as in slice notation.\n\
4095 Return -1 on failure.";
4098 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4100 PyUnicodeObject
*substring
;
4105 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4106 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4108 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4109 (PyObject
*)substring
);
4110 if (substring
== NULL
)
4113 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4115 Py_DECREF(substring
);
4119 static char rindex__doc__
[] =
4120 "S.rindex(sub [,start [,end]]) -> int\n\
4122 Like S.rfind() but raise ValueError when the substring is not found.";
4125 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4128 PyUnicodeObject
*substring
;
4132 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4133 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4135 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4136 (PyObject
*)substring
);
4137 if (substring
== NULL
)
4140 result
= findstring(self
, substring
, start
, end
, -1);
4142 Py_DECREF(substring
);
4144 PyErr_SetString(PyExc_ValueError
, "substring not found");
4147 return PyInt_FromLong(result
);
4150 static char rjust__doc__
[] =
4151 "S.rjust(width) -> unicode\n\
4153 Return S right justified in a Unicode string of length width. Padding is\n\
4154 done using spaces.";
4157 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4160 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4163 if (self
->length
>= width
) {
4165 return (PyObject
*) self
;
4168 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4171 static char rstrip__doc__
[] =
4172 "S.rstrip() -> unicode\n\
4174 Return a copy of the string S with trailing whitespace removed.";
4177 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
4179 if (!PyArg_NoArgs(args
))
4181 return strip(self
, 0, 1);
4185 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4187 /* standard clamping */
4192 if (end
> self
->length
)
4194 if (start
== 0 && end
== self
->length
) {
4195 /* full slice, return original string */
4197 return (PyObject
*) self
;
4202 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4206 PyObject
*PyUnicode_Split(PyObject
*s
,
4212 s
= PyUnicode_FromObject(s
);
4216 sep
= PyUnicode_FromObject(sep
);
4223 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4230 static char split__doc__
[] =
4231 "S.split([sep [,maxsplit]]) -> list of strings\n\
4233 Return a list of the words in S, using sep as the\n\
4234 delimiter string. If maxsplit is given, at most maxsplit\n\
4235 splits are done. If sep is not specified, any whitespace string\n\
4239 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4241 PyObject
*substring
= Py_None
;
4244 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4247 if (substring
== Py_None
)
4248 return split(self
, NULL
, maxcount
);
4249 else if (PyUnicode_Check(substring
))
4250 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4252 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
4255 static char splitlines__doc__
[] =
4256 "S.splitlines([keepends]]) -> list of strings\n\
4258 Return a list of the lines in S, breaking at line boundaries.\n\
4259 Line breaks are not included in the resulting list unless keepends\n\
4260 is given and true.";
4263 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4267 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4270 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4274 PyObject
*unicode_str(PyUnicodeObject
*self
)
4276 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4279 static char strip__doc__
[] =
4280 "S.strip() -> unicode\n\
4282 Return a copy of S with leading and trailing whitespace removed.";
4285 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
4287 if (!PyArg_NoArgs(args
))
4289 return strip(self
, 1, 1);
4292 static char swapcase__doc__
[] =
4293 "S.swapcase() -> unicode\n\
4295 Return a copy of S with uppercase characters converted to lowercase\n\
4299 unicode_swapcase(PyUnicodeObject
*self
, PyObject
*args
)
4301 if (!PyArg_NoArgs(args
))
4303 return fixup(self
, fixswapcase
);
/* S.translate(table): parses a single object argument and forwards the
   string buffer to PyUnicode_TranslateCharmap().
   NOTE(review): the remaining arguments of that call (presumably the
   length, the table and an error mode) are not visible in this part of
   the file -- confirm. */
4306 static char translate__doc__
[] =
4307 "S.translate(table) -> unicode\n\
4309 Return a copy of the string S, where all characters have been mapped\n\
4310 through the given translation table, which must be a mapping of\n\
4311 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4312 are left untouched. Characters mapped to None are deleted.";
4315 unicode_translate(PyUnicodeObject
*self
, PyObject
*args
)
4319 if (!PyArg_ParseTuple(args
, "O:translate", &table
))
4321 return PyUnicode_TranslateCharmap(self
->str
,
4327 static char upper__doc__
[] =
4328 "S.upper() -> unicode\n\
4330 Return a copy of S converted to uppercase.";
4333 unicode_upper(PyUnicodeObject
*self
, PyObject
*args
)
4335 if (!PyArg_NoArgs(args
))
4337 return fixup(self
, fixupper
);
4341 static char zfill__doc__
[] =
4342 "S.zfill(width) -> unicode\n\
4344 Pad a numeric string x with zeros on the left, to fill a field\n\
4345 of the specified width. The string x is never truncated.";
4348 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4354 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4357 if (self
->length
>= width
) {
4359 return (PyObject
*) self
;
4362 fill
= width
- self
->length
;
4364 u
= pad(self
, fill
, 0, '0');
4366 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4367 /* move sign to beginning of string */
4368 u
->str
[0] = u
->str
[fill
];
4372 return (PyObject
*) u
;
4378 unicode_freelistsize(PyUnicodeObject
*self
, PyObject
*args
)
4380 if (!PyArg_NoArgs(args
))
4382 return PyInt_FromLong(unicode_freelist_size
);
4386 static char startswith__doc__
[] =
4387 "S.startswith(prefix[, start[, end]]) -> int\n\
4389 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4390 optional start, test S beginning at that position. With optional end, stop\n\
4391 comparing S at that position.";
4394 unicode_startswith(PyUnicodeObject
*self
,
4397 PyUnicodeObject
*substring
;
4402 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4403 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4405 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4406 (PyObject
*)substring
);
4407 if (substring
== NULL
)
4410 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4412 Py_DECREF(substring
);
4417 static char endswith__doc__
[] =
4418 "S.endswith(suffix[, start[, end]]) -> int\n\
4420 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4421 optional start, test S beginning at that position. With optional end, stop\n\
4422 comparing S at that position.";
4425 unicode_endswith(PyUnicodeObject
*self
,
4428 PyUnicodeObject
*substring
;
4433 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4434 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4436 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4437 (PyObject
*)substring
);
4438 if (substring
== NULL
)
4441 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4443 Py_DECREF(substring
);
/* Method table for unicode objects; unicode_getattr() passes it to
   Py_FindMethod().  Each entry is {name, C function, flag, docstring}.
   In this file the entries flagged 0 are implemented with
   PyArg_NoArgs() and the entries flagged 1 with PyArg_ParseTuple().
   NOTE(review): the table terminator and any conditional-compilation
   guards around the last entries are not visible here -- confirm before
   editing the tail of the table. */
4448 static PyMethodDef unicode_methods
[] = {
4450 /* Order is according to common usage: often used methods should
4451 appear first, since lookup is done sequentially. */
4453 {"encode", (PyCFunction
) unicode_encode
, 1, encode__doc__
},
4454 {"replace", (PyCFunction
) unicode_replace
, 1, replace__doc__
},
4455 {"split", (PyCFunction
) unicode_split
, 1, split__doc__
},
4456 {"join", (PyCFunction
) unicode_join
, 1, join__doc__
},
4457 {"capitalize", (PyCFunction
) unicode_capitalize
, 0, capitalize__doc__
},
4458 {"title", (PyCFunction
) unicode_title
, 0, title__doc__
},
4459 {"center", (PyCFunction
) unicode_center
, 1, center__doc__
},
4460 {"count", (PyCFunction
) unicode_count
, 1, count__doc__
},
4461 {"expandtabs", (PyCFunction
) unicode_expandtabs
, 1, expandtabs__doc__
},
4462 {"find", (PyCFunction
) unicode_find
, 1, find__doc__
},
4463 {"index", (PyCFunction
) unicode_index
, 1, index__doc__
},
4464 {"ljust", (PyCFunction
) unicode_ljust
, 1, ljust__doc__
},
4465 {"lower", (PyCFunction
) unicode_lower
, 0, lower__doc__
},
4466 {"lstrip", (PyCFunction
) unicode_lstrip
, 0, lstrip__doc__
},
4467 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4468 {"rfind", (PyCFunction
) unicode_rfind
, 1, rfind__doc__
},
4469 {"rindex", (PyCFunction
) unicode_rindex
, 1, rindex__doc__
},
4470 {"rjust", (PyCFunction
) unicode_rjust
, 1, rjust__doc__
},
4471 {"rstrip", (PyCFunction
) unicode_rstrip
, 0, rstrip__doc__
},
4472 {"splitlines", (PyCFunction
) unicode_splitlines
, 1, splitlines__doc__
},
4473 {"strip", (PyCFunction
) unicode_strip
, 0, strip__doc__
},
4474 {"swapcase", (PyCFunction
) unicode_swapcase
, 0, swapcase__doc__
},
4475 {"translate", (PyCFunction
) unicode_translate
, 1, translate__doc__
},
4476 {"upper", (PyCFunction
) unicode_upper
, 0, upper__doc__
},
4477 {"startswith", (PyCFunction
) unicode_startswith
, 1, startswith__doc__
},
4478 {"endswith", (PyCFunction
) unicode_endswith
, 1, endswith__doc__
},
4479 {"islower", (PyCFunction
) unicode_islower
, 0, islower__doc__
},
4480 {"isupper", (PyCFunction
) unicode_isupper
, 0, isupper__doc__
},
4481 {"istitle", (PyCFunction
) unicode_istitle
, 0, istitle__doc__
},
4482 {"isspace", (PyCFunction
) unicode_isspace
, 0, isspace__doc__
},
4483 {"isdecimal", (PyCFunction
) unicode_isdecimal
, 0, isdecimal__doc__
},
4484 {"isdigit", (PyCFunction
) unicode_isdigit
, 0, isdigit__doc__
},
4485 {"isnumeric", (PyCFunction
) unicode_isnumeric
, 0, isnumeric__doc__
},
4486 {"isalpha", (PyCFunction
) unicode_isalpha
, 0, isalpha__doc__
},
4487 {"isalnum", (PyCFunction
) unicode_isalnum
, 0, isalnum__doc__
},
4489 {"zfill", (PyCFunction
) unicode_zfill
, 1, zfill__doc__
},
4490 {"capwords", (PyCFunction
) unicode_capwords
, 0, capwords__doc__
},
4494 /* This one is just used for debugging the implementation. */
4495 {"freelistsize", (PyCFunction
) unicode_freelistsize
, 0},
4502 unicode_getattr(PyUnicodeObject
*self
, char *name
)
4504 return Py_FindMethod(unicode_methods
, (PyObject
*) self
, name
);
4507 static PySequenceMethods unicode_as_sequence
= {
4508 (inquiry
) unicode_length
, /* sq_length */
4509 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4510 (intargfunc
) unicode_repeat
, /* sq_repeat */
4511 (intargfunc
) unicode_getitem
, /* sq_item */
4512 (intintargfunc
) unicode_slice
, /* sq_slice */
4513 0, /* sq_ass_item */
4514 0, /* sq_ass_slice */
4515 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4519 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4524 PyErr_SetString(PyExc_SystemError
,
4525 "accessing non-existent unicode segment");
4528 *ptr
= (void *) self
->str
;
4529 return PyUnicode_GET_DATA_SIZE(self
);
4533 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4536 PyErr_SetString(PyExc_TypeError
,
4537 "cannot use unicode as modifyable buffer");
4542 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
4546 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
4551 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
4558 PyErr_SetString(PyExc_SystemError
,
4559 "accessing non-existent unicode segment");
4562 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
4565 *ptr
= (void *) PyString_AS_STRING(str
);
4566 return PyString_GET_SIZE(str
);
4569 /* Helpers for PyUnicode_Format() */
4572 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
4574 int argidx
= *p_argidx
;
4575 if (argidx
< arglen
) {
4580 return PyTuple_GetItem(args
, argidx
);
4582 PyErr_SetString(PyExc_TypeError
,
4583 "not enough arguments for format string");
/* Conversion-flag bits collected by PyUnicode_Format() while it scans a
   format specifier; the comment on each line names the flag character
   that sets the bit. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4594 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
4600 va_start(va
, format
);
4602 /* First, format the string as char array, then expand to Py_UNICODE
4604 charbuffer
= (char *)buffer
;
4605 len
= vsprintf(charbuffer
, format
, va
);
4606 for (i
= len
- 1; i
>= 0; i
--)
4607 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
4614 formatfloat(Py_UNICODE
*buf
,
4621 /* fmt = '%#.' + `prec` + `type`
4622 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4626 x
= PyFloat_AsDouble(v
);
4627 if (x
== -1.0 && PyErr_Occurred())
4631 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
4633 sprintf(fmt
, "%%%s.%d%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4634 /* worst case length calc to ensure no buffer overrun:
4636 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4637 for any double rep.)
4638 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4639 If prec=0 the effective precision is 1 (the leading digit is
4640 always given), therefore increase by one to 10+prec. */
4641 if (buflen
<= (size_t)10 + (size_t)prec
) {
4642 PyErr_SetString(PyExc_OverflowError
,
4643 "formatted float is too long (precision too long?)");
4646 return usprintf(buf
, fmt
, x
);
4650 formatint(Py_UNICODE
*buf
,
4657 /* fmt = '%#.' + `prec` + 'l' + `type`
4658 worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
4662 x
= PyInt_AsLong(v
);
4663 if (x
== -1 && PyErr_Occurred())
4667 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4668 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4669 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
4670 PyErr_SetString(PyExc_OverflowError
,
4671 "formatted integer is too long (precision too long?)");
4674 sprintf(fmt
, "%%%s.%dl%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4675 return usprintf(buf
, fmt
, x
);
/* Helper for the %c conversion: stores one character in buf[0].
   Accepted inputs, in the order they are tested: a unicode object, an
   8-bit string (both are checked for a size of exactly 1), or a value
   convertible with PyInt_AsLong().  The failure path sets TypeError
   with the message "%c requires int or char".
   NOTE(review): the statements executed when a size check fails, the
   store for the integer case and the return values are not visible in
   this part of the file -- confirm before relying on details. */
4679 formatchar(Py_UNICODE
*buf
,
4683 /* presume that the buffer is at least 2 characters long */
4684 if (PyUnicode_Check(v
)) {
4685 if (PyUnicode_GET_SIZE(v
) != 1)
4687 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
4690 else if (PyString_Check(v
)) {
4691 if (PyString_GET_SIZE(v
) != 1)
4693 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
4697 /* Integer input truncated to a character */
4699 x
= PyInt_AsLong(v
);
4700 if (x
== -1 && PyErr_Occurred())
4708 PyErr_SetString(PyExc_TypeError
,
4709 "%c requires int or char");
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in characters, of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does its own bounds checking to
   ensure no overflow; a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
/* Public API implementing the unicode "format % args" operation.

   Visible behaviour:
   - A NULL format or args is reported with PyErr_BadInternalCall().
   - format is coerced with PyUnicode_FromObject(); the result object
     starts at (format length + 100) characters, is grown with
     _PyUnicode_Resize() when a conversion needs more room, and is
     trimmed to (reslen - rescnt) characters before it is returned.
   - A "(key)" prefix in a specifier requires a mapping argument; the
     key text is encoded with PyUnicode_EncodeUTF8() and looked up with
     PyObject_GetItem().
   - The flag characters '-', '+', ' ', '#' and '0' set the F_* bits.
     Width and precision come either from the next argument, which must
     be an int, or from digits in the format, with a multiplication
     overflow check.
   - Numbers and single characters are rendered into formatbuf by
     formatint(), formatfloat() and formatchar().
   - Unused arguments raise TypeError "not all arguments converted".

   NOTE(review): large parts of the control flow (case labels, error
   exits, cleanup) are not visible in this part of the file; check the
   complete function before changing it. */
4723 PyObject
*PyUnicode_Format(PyObject
*format
,
4726 Py_UNICODE
*fmt
, *res
;
4727 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
4729 PyUnicodeObject
*result
= NULL
;
4730 PyObject
*dict
= NULL
;
4733 if (format
== NULL
|| args
== NULL
) {
4734 PyErr_BadInternalCall();
4737 uformat
= PyUnicode_FromObject(format
);
4738 if (uformat
== NULL
)
4740 fmt
= PyUnicode_AS_UNICODE(uformat
);
4741 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
4743 reslen
= rescnt
= fmtcnt
+ 100;
4744 result
= _PyUnicode_New(reslen
);
4747 res
= PyUnicode_AS_UNICODE(result
);
4749 if (PyTuple_Check(args
)) {
4750 arglen
= PyTuple_Size(args
);
4757 if (args
->ob_type
->tp_as_mapping
)
4760 while (--fmtcnt
>= 0) {
4763 rescnt
= fmtcnt
+ 100;
4765 if (_PyUnicode_Resize(result
, reslen
) < 0)
4767 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
4773 /* Got a format specifier */
4778 Py_UNICODE c
= '\0';
4781 PyObject
*temp
= NULL
;
4785 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
4789 Py_UNICODE
*keystart
;
4795 PyErr_SetString(PyExc_TypeError
,
4796 "format requires a mapping");
4802 /* Skip over balanced parentheses */
4803 while (pcount
> 0 && --fmtcnt
>= 0) {
4806 else if (*fmt
== '(')
4810 keylen
= fmt
- keystart
- 1;
4811 if (fmtcnt
< 0 || pcount
> 0) {
4812 PyErr_SetString(PyExc_ValueError
,
4813 "incomplete format key");
4816 /* keys are converted to strings using UTF-8 and
4817 then looked up since Python uses strings to hold
4818 variables names etc. in its namespaces and we
4819 wouldn't want to break common idioms. */
4820 key
= PyUnicode_EncodeUTF8(keystart
,
4829 args
= PyObject_GetItem(dict
, key
);
4838 while (--fmtcnt
>= 0) {
4839 switch (c
= *fmt
++) {
4840 case '-': flags
|= F_LJUST
; continue;
4841 case '+': flags
|= F_SIGN
; continue;
4842 case ' ': flags
|= F_BLANK
; continue;
4843 case '#': flags
|= F_ALT
; continue;
4844 case '0': flags
|= F_ZERO
; continue;
4849 v
= getnextarg(args
, arglen
, &argidx
);
4852 if (!PyInt_Check(v
)) {
4853 PyErr_SetString(PyExc_TypeError
,
4857 width
= PyInt_AsLong(v
);
4865 else if (c
>= '0' && c
<= '9') {
4867 while (--fmtcnt
>= 0) {
4869 if (c
< '0' || c
> '9')
4871 if ((width
*10) / 10 != width
) {
4872 PyErr_SetString(PyExc_ValueError
,
4876 width
= width
*10 + (c
- '0');
4884 v
= getnextarg(args
, arglen
, &argidx
);
4887 if (!PyInt_Check(v
)) {
4888 PyErr_SetString(PyExc_TypeError
,
4892 prec
= PyInt_AsLong(v
);
4898 else if (c
>= '0' && c
<= '9') {
4900 while (--fmtcnt
>= 0) {
4901 c
= Py_CHARMASK(*fmt
++);
4902 if (c
< '0' || c
> '9')
4904 if ((prec
*10) / 10 != prec
) {
4905 PyErr_SetString(PyExc_ValueError
,
4909 prec
= prec
*10 + (c
- '0');
4914 if (c
== 'h' || c
== 'l' || c
== 'L') {
4921 PyErr_SetString(PyExc_ValueError
,
4922 "incomplete format");
4926 v
= getnextarg(args
, arglen
, &argidx
);
4936 /* presume that buffer length is at least 1 */
4943 if (PyUnicode_Check(v
) && c
== 's') {
4950 temp
= PyObject_Str(v
);
4952 temp
= PyObject_Repr(v
);
4955 if (!PyString_Check(temp
)) {
4956 /* XXX Note: this should never happen, since
4957 PyObject_Repr() and PyObject_Str() assure
4960 PyErr_SetString(PyExc_TypeError
,
4961 "%s argument has non-string str()");
4964 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
4965 PyString_GET_SIZE(temp
),
4973 pbuf
= PyUnicode_AS_UNICODE(temp
);
4974 len
= PyUnicode_GET_SIZE(temp
);
4975 if (prec
>= 0 && len
> prec
)
4988 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
4993 if (flags
& F_ZERO
) {
4995 if ((flags
&F_ALT
) &&
4996 (c
== 'x' || c
== 'X') &&
4997 pbuf
[0] == '0' && pbuf
[1] == c
) {
5015 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5026 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5032 PyErr_Format(PyExc_ValueError
,
5033 "unsupported format character '%c' (0x%x)",
5038 if (*pbuf
== '-' || *pbuf
== '+') {
5042 else if (flags
& F_SIGN
)
5044 else if (flags
& F_BLANK
)
5051 if (rescnt
< width
+ (sign
!= 0)) {
5053 rescnt
= width
+ fmtcnt
+ 100;
5055 if (_PyUnicode_Resize(result
, reslen
) < 0)
5057 res
= PyUnicode_AS_UNICODE(result
)
5067 if (width
> len
&& !(flags
& F_LJUST
)) {
5071 } while (--width
> len
);
5073 if (sign
&& fill
== ' ')
5075 memcpy(res
, pbuf
, len
* sizeof(Py_UNICODE
));
5078 while (--width
>= len
) {
5082 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5083 PyErr_SetString(PyExc_TypeError
,
5084 "not all arguments converted");
5090 if (argidx
< arglen
&& !dict
) {
5091 PyErr_SetString(PyExc_TypeError
,
5092 "not all arguments converted");
5100 if (_PyUnicode_Resize(result
, reslen
- rescnt
))
5102 return (PyObject
*)result
;
5113 static PyBufferProcs unicode_as_buffer
= {
5114 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5115 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5116 (getsegcountproc
) unicode_buffer_getsegcount
,
5117 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5120 PyTypeObject PyUnicode_Type
= {
5121 PyObject_HEAD_INIT(&PyType_Type
)
5123 "unicode", /* tp_name */
5124 sizeof(PyUnicodeObject
), /* tp_size */
5125 0, /* tp_itemsize */
5127 (destructor
)_PyUnicode_Free
, /* tp_dealloc */
5129 (getattrfunc
)unicode_getattr
, /* tp_getattr */
5131 (cmpfunc
) unicode_compare
, /* tp_compare */
5132 (reprfunc
) unicode_repr
, /* tp_repr */
5133 0, /* tp_as_number */
5134 &unicode_as_sequence
, /* tp_as_sequence */
5135 0, /* tp_as_mapping */
5136 (hashfunc
) unicode_hash
, /* tp_hash*/
5138 (reprfunc
) unicode_str
, /* tp_str */
5139 (getattrofunc
) NULL
, /* tp_getattro */
5140 (setattrofunc
) NULL
, /* tp_setattro */
5141 &unicode_as_buffer
, /* tp_as_buffer */
5142 Py_TPFLAGS_DEFAULT
, /* tp_flags */
5145 /* Initialize the Unicode implementation */
5147 void _PyUnicode_Init(void)
5149 /* Doublecheck the configuration... */
5150 if (sizeof(Py_UNICODE
) != 2)
5151 Py_FatalError("Unicode configuration error: "
5152 "sizeof(Py_UNICODE) != 2 bytes");
5154 /* Init the implementation */
5155 unicode_freelist
= NULL
;
5156 unicode_freelist_size
= 0;
5157 unicode_empty
= _PyUnicode_New(0);
5158 strcpy(unicode_default_encoding
, "ascii");
/* Finalize the Unicode implementation */
/* Walks the free list starting at unicode_freelist: the first word of
   each cached object is read as the pointer to the next one, and each
   entry's defenc reference is dropped.  Afterwards the free list and
   its counter are reset, and the shared empty string is released and
   cleared.
   NOTE(review): the loop header and the statements that release each
   cached object's storage are not visible in this part of the file --
   confirm. */
5164 _PyUnicode_Fini(void)
5166 PyUnicodeObject
*u
= unicode_freelist
;
5169 PyUnicodeObject
*v
= u
;
5170 u
= *(PyUnicodeObject
**)u
;
5173 Py_XDECREF(v
->defenc
);
5176 unicode_freelist
= NULL
;
5177 unicode_freelist_size
= 0;
5178 Py_XDECREF(unicode_empty
);
5179 unicode_empty
= NULL
;