3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
95 static PyUnicodeObject
*unicode_latin1
[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding
[100];
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 unicode
->str
[0] < 256 &&
136 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
137 PyErr_SetString(PyExc_SystemError
,
138 "can't resize shared unicode objects");
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr
= unicode
->str
;
145 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
147 unicode
->str
= oldstr
;
151 unicode
->str
[length
] = 0;
152 unicode
->length
= length
;
155 /* Reset the object caches */
156 if (unicode
->defenc
) {
157 Py_DECREF(unicode
->defenc
);
158 unicode
->defenc
= NULL
;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
174 PyUnicodeObject
*_PyUnicode_New(int length
)
176 register PyUnicodeObject
*unicode
;
178 /* Optimization for empty strings */
179 if (length
== 0 && unicode_empty
!= NULL
) {
180 Py_INCREF(unicode_empty
);
181 return unicode_empty
;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist
) {
186 unicode
= unicode_freelist
;
187 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
188 unicode_freelist_size
--;
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode
->length
< length
) &&
193 unicode_resize(unicode
, length
)) {
194 PyMem_DEL(unicode
->str
);
199 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
201 PyObject_INIT(unicode
, &PyUnicode_Type
);
204 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
207 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
214 unicode
->str
[length
] = 0;
215 unicode
->length
= length
;
217 unicode
->defenc
= NULL
;
221 _Py_ForgetReference((PyObject
*)unicode
);
222 PyObject_DEL(unicode
);
227 void _PyUnicode_Free(register PyUnicodeObject
*unicode
)
229 if (unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
230 /* Keep-Alive optimization */
231 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
232 PyMem_DEL(unicode
->str
);
236 if (unicode
->defenc
) {
237 Py_DECREF(unicode
->defenc
);
238 unicode
->defenc
= NULL
;
240 /* Add to free list */
241 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
242 unicode_freelist
= unicode
;
243 unicode_freelist_size
++;
246 PyMem_DEL(unicode
->str
);
247 Py_XDECREF(unicode
->defenc
);
248 PyObject_DEL(unicode
);
252 int PyUnicode_Resize(PyObject
**unicode
,
255 register PyUnicodeObject
*v
;
257 /* Argument checks */
258 if (unicode
== NULL
) {
259 PyErr_BadInternalCall();
262 v
= (PyUnicodeObject
*)*unicode
;
263 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
264 PyErr_BadInternalCall();
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v
->length
!= length
&&
272 (v
== unicode_empty
|| v
->length
== 1)) {
273 PyUnicodeObject
*w
= _PyUnicode_New(length
);
276 Py_UNICODE_COPY(w
->str
, v
->str
,
277 length
< v
->length
? length
: v
->length
);
278 *unicode
= (PyObject
*)w
;
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v
, length
);
287 /* Internal API for use in unicodeobject.c only ! */
288 #define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
294 PyUnicodeObject
*unicode
;
296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
300 /* Optimization for empty strings */
301 if (size
== 0 && unicode_empty
!= NULL
) {
302 Py_INCREF(unicode_empty
);
303 return (PyObject
*)unicode_empty
;
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size
== 1 && *u
< 256) {
309 unicode
= unicode_latin1
[*u
];
311 unicode
= _PyUnicode_New(1);
314 unicode
->str
[0] = *u
;
315 unicode_latin1
[*u
] = unicode
;
318 return (PyObject
*)unicode
;
322 unicode
= _PyUnicode_New(size
);
326 /* Copy the Unicode data into the new object */
328 Py_UNICODE_COPY(unicode
->str
, u
, size
);
330 return (PyObject
*)unicode
;
335 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
338 PyUnicodeObject
*unicode
;
341 PyErr_BadInternalCall();
345 unicode
= _PyUnicode_New(size
);
349 /* Copy the wchar_t data into the new object */
350 #ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
354 register Py_UNICODE
*u
;
356 u
= PyUnicode_AS_UNICODE(unicode
);
357 for (i
= size
; i
>= 0; i
--)
362 return (PyObject
*)unicode
;
365 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
369 if (unicode
== NULL
) {
370 PyErr_BadInternalCall();
373 if (size
> PyUnicode_GET_SIZE(unicode
))
374 size
= PyUnicode_GET_SIZE(unicode
);
375 #ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
379 register Py_UNICODE
*u
;
381 u
= PyUnicode_AS_UNICODE(unicode
);
382 for (i
= size
; i
>= 0; i
--)
392 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
394 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
397 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
398 const char *encoding
,
407 PyErr_BadInternalCall();
412 if (PyInstance_Check(obj
)) {
414 func
= PyObject_GetAttrString(obj
, "__str__");
416 PyErr_SetString(PyExc_TypeError
,
417 "coercing to Unicode: instance doesn't define __str__");
420 obj
= PyEval_CallObject(func
, NULL
);
426 if (PyUnicode_Check(obj
)) {
430 PyErr_SetString(PyExc_TypeError
,
431 "decoding Unicode is not supported");
436 else if (PyString_Check(obj
)) {
437 s
= PyString_AS_STRING(obj
);
438 len
= PyString_GET_SIZE(obj
);
440 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError
))
444 PyErr_Format(PyExc_TypeError
,
445 "coercing to Unicode: need string or buffer, "
447 obj
->ob_type
->tp_name
);
451 /* Convert to Unicode */
453 Py_INCREF(unicode_empty
);
454 v
= (PyObject
*)unicode_empty
;
457 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
472 PyObject
*PyUnicode_Decode(const char *s
,
474 const char *encoding
,
477 PyObject
*buffer
= NULL
, *unicode
;
479 if (encoding
== NULL
)
480 encoding
= PyUnicode_GetDefaultEncoding();
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding
, "utf-8") == 0)
484 return PyUnicode_DecodeUTF8(s
, size
, errors
);
485 else if (strcmp(encoding
, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s
, size
, errors
);
487 else if (strcmp(encoding
, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s
, size
, errors
);
490 /* Decode via the codec registry */
491 buffer
= PyBuffer_FromMemory((void *)s
, size
);
494 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
497 if (!PyUnicode_Check(unicode
)) {
498 PyErr_Format(PyExc_TypeError
,
499 "decoder did not return an unicode object (type=%.400s)",
500 unicode
->ob_type
->tp_name
);
512 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
514 const char *encoding
,
517 PyObject
*v
, *unicode
;
519 unicode
= PyUnicode_FromUnicode(s
, size
);
522 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
527 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
528 const char *encoding
,
533 if (!PyUnicode_Check(unicode
)) {
538 if (encoding
== NULL
)
539 encoding
= PyUnicode_GetDefaultEncoding();
541 /* Shortcuts for common default encodings */
542 if (errors
== NULL
) {
543 if (strcmp(encoding
, "utf-8") == 0)
544 return PyUnicode_AsUTF8String(unicode
);
545 else if (strcmp(encoding
, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode
);
547 else if (strcmp(encoding
, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode
);
551 /* Encode via the codec registry */
552 v
= PyCodec_Encode(unicode
, encoding
, errors
);
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v
)) {
557 PyErr_Format(PyExc_TypeError
,
558 "encoder did not return a string object (type=%.400s)",
559 v
->ob_type
->tp_name
);
569 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
572 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
576 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
577 if (v
&& errors
== NULL
)
578 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
582 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
584 if (!PyUnicode_Check(unicode
)) {
588 return PyUnicode_AS_UNICODE(unicode
);
594 int PyUnicode_GetSize(PyObject
*unicode
)
596 if (!PyUnicode_Check(unicode
)) {
600 return PyUnicode_GET_SIZE(unicode
);
606 const char *PyUnicode_GetDefaultEncoding(void)
608 return unicode_default_encoding
;
611 int PyUnicode_SetDefaultEncoding(const char *encoding
)
615 /* Make sure the encoding is valid. As side effect, this also
616 loads the encoding into the codec registry cache. */
617 v
= _PyCodec_Lookup(encoding
);
621 strncpy(unicode_default_encoding
,
623 sizeof(unicode_default_encoding
));
630 /* --- UTF-8 Codec -------------------------------------------------------- */
633 char utf8_code_length
[256] = {
634 /* Map UTF-8 encoded prefix byte to sequence length. zero means
635 illegal prefix. see RFC 2279 for details */
636 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
637 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
638 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
639 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
640 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
641 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
642 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
643 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
644 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
645 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
646 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
647 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
648 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
649 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
650 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
651 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
655 int utf8_decoding_error(const char **source
,
660 if ((errors
== NULL
) ||
661 (strcmp(errors
,"strict") == 0)) {
662 PyErr_Format(PyExc_UnicodeError
,
663 "UTF-8 decoding error: %.400s",
667 else if (strcmp(errors
,"ignore") == 0) {
671 else if (strcmp(errors
,"replace") == 0) {
673 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
678 PyErr_Format(PyExc_ValueError
,
679 "UTF-8 decoding error; unknown error handling code: %.400s",
685 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
691 PyUnicodeObject
*unicode
;
693 const char *errmsg
= "";
695 /* Note: size will always be longer than the resulting Unicode
697 unicode
= _PyUnicode_New(size
);
701 return (PyObject
*)unicode
;
703 /* Unpack UTF-8 encoded data */
708 Py_UCS4 ch
= (unsigned char)*s
;
711 *p
++ = (Py_UNICODE
)ch
;
716 n
= utf8_code_length
[ch
];
719 errmsg
= "unexpected end of data";
726 errmsg
= "unexpected code byte";
730 errmsg
= "internal error";
734 if ((s
[1] & 0xc0) != 0x80) {
735 errmsg
= "invalid data";
738 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
740 errmsg
= "illegal encoding";
744 *p
++ = (Py_UNICODE
)ch
;
748 if ((s
[1] & 0xc0) != 0x80 ||
749 (s
[2] & 0xc0) != 0x80) {
750 errmsg
= "invalid data";
753 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
754 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
755 errmsg
= "illegal encoding";
759 *p
++ = (Py_UNICODE
)ch
;
763 if ((s
[1] & 0xc0) != 0x80 ||
764 (s
[2] & 0xc0) != 0x80 ||
765 (s
[3] & 0xc0) != 0x80) {
766 errmsg
= "invalid data";
769 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
770 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
771 /* validate and convert to UTF-16 */
772 if ((ch
< 0x10000) /* minimum value allowed for 4
774 || (ch
> 0x10ffff)) /* maximum value allowed for
777 errmsg
= "illegal encoding";
780 #ifdef Py_UNICODE_WIDE
781 *p
++ = (Py_UNICODE
)ch
;
783 /* compute and append the two surrogates: */
785 /* translate from 10000..10FFFF to 0..FFFF */
788 /* high surrogate = top 10 bits added to D800 */
789 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
791 /* low surrogate = bottom 10 bits added to DC00 */
792 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
797 /* Other sizes are only needed for UCS-4 */
798 errmsg
= "unsupported Unicode code range";
805 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
810 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
813 return (PyObject
*)unicode
;
820 /* Not used anymore, now that the encoder supports UTF-16
824 int utf8_encoding_error(const Py_UNICODE
**source
,
829 if ((errors
== NULL
) ||
830 (strcmp(errors
,"strict") == 0)) {
831 PyErr_Format(PyExc_UnicodeError
,
832 "UTF-8 encoding error: %.400s",
836 else if (strcmp(errors
,"ignore") == 0) {
839 else if (strcmp(errors
,"replace") == 0) {
845 PyErr_Format(PyExc_ValueError
,
846 "UTF-8 encoding error; "
847 "unknown error handling code: %.400s",
854 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
862 unsigned int cbAllocated
= 3 * size
;
863 unsigned int cbWritten
= 0;
866 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
872 p
= q
= PyString_AS_STRING(v
);
879 else if (ch
< 0x0800) {
880 *p
++ = 0xc0 | (ch
>> 6);
881 *p
++ = 0x80 | (ch
& 0x3f);
884 else if (ch
< 0x10000) {
885 /* Check for high surrogate */
886 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
889 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
891 if (cbWritten
>= (cbAllocated
- 4)) {
892 /* Provide enough room for some more
895 if (_PyString_Resize(&v
, cbAllocated
))
899 /* combine the two values */
900 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
902 *p
++ = (char)((ch
>> 18) | 0xf0);
903 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
910 *p
++ = (char)(0xe0 | (ch
>> 12));
913 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
914 *p
++ = (char)(0x80 | (ch
& 0x3f));
916 *p
++ = 0xf0 | (ch
>>18);
917 *p
++ = 0x80 | ((ch
>>12) & 0x3f);
918 *p
++ = 0x80 | ((ch
>>6) & 0x3f);
919 *p
++ = 0x80 | (ch
& 0x3f);
924 if (_PyString_Resize(&v
, p
- q
))
933 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
935 if (!PyUnicode_Check(unicode
)) {
939 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
940 PyUnicode_GET_SIZE(unicode
),
944 /* --- UTF-16 Codec ------------------------------------------------------- */
947 int utf16_decoding_error(Py_UNICODE
**dest
,
951 if ((errors
== NULL
) ||
952 (strcmp(errors
,"strict") == 0)) {
953 PyErr_Format(PyExc_UnicodeError
,
954 "UTF-16 decoding error: %.400s",
958 else if (strcmp(errors
,"ignore") == 0) {
961 else if (strcmp(errors
,"replace") == 0) {
963 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
969 PyErr_Format(PyExc_ValueError
,
970 "UTF-16 decoding error; "
971 "unknown error handling code: %.400s",
978 PyUnicode_DecodeUTF16(const char *s
,
983 PyUnicodeObject
*unicode
;
985 const unsigned char *q
, *e
;
986 int bo
= 0; /* assume native ordering by default */
987 const char *errmsg
= "";
988 /* Offsets from q for retrieving byte pairs in the right order. */
989 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
990 int ihi
= 1, ilo
= 0;
992 int ihi
= 0, ilo
= 1;
995 /* size should be an even number */
997 if (utf16_decoding_error(NULL
, errors
, "truncated data"))
999 --size
; /* else ignore the oddball byte */
1002 /* Note: size will always be longer than the resulting Unicode
1004 unicode
= _PyUnicode_New(size
);
1008 return (PyObject
*)unicode
;
1010 /* Unpack UTF-16 encoded data */
1012 q
= (unsigned char *)s
;
1018 /* Check for BOM marks (U+FEFF) in the input and adjust current
1019 byte order setting accordingly. In native mode, the leading BOM
1020 mark is skipped, in all other modes, it is copied to the output
1021 stream as-is (giving a ZWNBSP character). */
1023 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1024 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1025 if (bom
== 0xFEFF) {
1029 else if (bom
== 0xFFFE) {
1034 if (bom
== 0xFEFF) {
1038 else if (bom
== 0xFFFE) {
1057 Py_UNICODE ch
= (q
[ihi
] << 8) | q
[ilo
];
1060 if (ch
< 0xD800 || ch
> 0xDFFF) {
1065 /* UTF-16 code pair: */
1067 errmsg
= "unexpected end of data";
1070 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1071 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1073 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1074 #ifndef Py_UNICODE_WIDE
1078 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1083 errmsg
= "illegal UTF-16 surrogate";
1088 errmsg
= "illegal encoding";
1089 /* Fall through to report the error */
1092 if (utf16_decoding_error(&p
, errors
, errmsg
))
1100 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1103 return (PyObject
*)unicode
;
1111 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1119 /* Offsets from p for storing byte pairs in the right order. */
1120 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1121 int ihi
= 1, ilo
= 0;
1123 int ihi
= 0, ilo
= 1;
1126 #define STORECHAR(CH) \
1128 p[ihi] = ((CH) >> 8) & 0xff; \
1129 p[ilo] = (CH) & 0xff; \
1133 for (i
= pairs
= 0; i
< size
; i
++)
1134 if (s
[i
] >= 0x10000)
1136 v
= PyString_FromStringAndSize(NULL
,
1137 2 * (size
+ pairs
+ (byteorder
== 0)));
1141 p
= (unsigned char *)PyString_AS_STRING(v
);
1147 if (byteorder
== -1) {
1152 else if (byteorder
== 1) {
1158 while (size
-- > 0) {
1159 Py_UNICODE ch
= *s
++;
1161 if (ch
>= 0x10000) {
1162 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1163 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1173 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1175 if (!PyUnicode_Check(unicode
)) {
1176 PyErr_BadArgument();
1179 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1180 PyUnicode_GET_SIZE(unicode
),
1185 /* --- Unicode Escape Codec ----------------------------------------------- */
1188 int unicodeescape_decoding_error(const char **source
,
1191 const char *details
)
1193 if ((errors
== NULL
) ||
1194 (strcmp(errors
,"strict") == 0)) {
1195 PyErr_Format(PyExc_UnicodeError
,
1196 "Unicode-Escape decoding error: %.400s",
1200 else if (strcmp(errors
,"ignore") == 0) {
1203 else if (strcmp(errors
,"replace") == 0) {
1204 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1208 PyErr_Format(PyExc_ValueError
,
1209 "Unicode-Escape decoding error; "
1210 "unknown error handling code: %.400s",
1216 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1218 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1223 Py_UNICODE
*p
, *buf
;
1226 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1228 /* Escaped strings will always be longer than the resulting
1229 Unicode string, so we start with size here and then reduce the
1230 length after conversion to the true value. */
1231 v
= _PyUnicode_New(size
);
1235 return (PyObject
*)v
;
1237 p
= buf
= PyUnicode_AS_UNICODE(v
);
1245 /* Non-escape characters are interpreted as Unicode ordinals */
1247 *p
++ = (unsigned char) *s
++;
1257 case '\\': *p
++ = '\\'; break;
1258 case '\'': *p
++ = '\''; break;
1259 case '\"': *p
++ = '\"'; break;
1260 case 'b': *p
++ = '\b'; break;
1261 case 'f': *p
++ = '\014'; break; /* FF */
1262 case 't': *p
++ = '\t'; break;
1263 case 'n': *p
++ = '\n'; break;
1264 case 'r': *p
++ = '\r'; break;
1265 case 'v': *p
++ = '\013'; break; /* VT */
1266 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1268 /* \OOO (octal) escapes */
1269 case '0': case '1': case '2': case '3':
1270 case '4': case '5': case '6': case '7':
1272 if ('0' <= *s
&& *s
<= '7') {
1273 x
= (x
<<3) + *s
++ - '0';
1274 if ('0' <= *s
&& *s
<= '7')
1275 x
= (x
<<3) + *s
++ - '0';
1284 message
= "truncated \\xXX escape";
1290 message
= "truncated \\uXXXX escape";
1296 message
= "truncated \\UXXXXXXXX escape";
1299 for (i
= 0; i
< digits
; i
++) {
1300 c
= (unsigned char) s
[i
];
1302 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1308 chr
= (chr
<<4) & ~0xF;
1309 if (c
>= '0' && c
<= '9')
1311 else if (c
>= 'a' && c
<= 'f')
1312 chr
+= 10 + c
- 'a';
1314 chr
+= 10 + c
- 'A';
1318 /* when we get here, chr is a 32-bit unicode character */
1320 /* UCS-2 character */
1321 *p
++ = (Py_UNICODE
) chr
;
1322 else if (chr
<= 0x10ffff) {
1323 /* UCS-4 character. Either store directly, or as
1325 #ifdef Py_UNICODE_WIDE
1329 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1330 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1333 if (unicodeescape_decoding_error(
1335 "illegal Unicode character")
1338 *p
++ = x
; /* store replacement character */
1344 message
= "malformed \\N character escape";
1345 if (ucnhash_CAPI
== NULL
) {
1346 /* load the unicode data module */
1348 m
= PyImport_ImportModule("unicodedata");
1351 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1355 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1357 if (ucnhash_CAPI
== NULL
)
1361 const char *start
= s
+1;
1362 /* look for the closing brace */
1363 while (*s
!= '}' && s
< end
)
1365 if (s
> start
&& s
< end
&& *s
== '}') {
1366 /* found a name. look it up in the unicode database */
1367 message
= "unknown Unicode character name";
1369 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1373 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1380 *p
++ = (unsigned char)s
[-1];
1384 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1386 return (PyObject
*)v
;
1391 "\\N escapes not supported (can't load unicodedata module)"
1400 /* Return a Unicode-Escape string version of the Unicode object.
1402 If quotes is true, the string is enclosed in u"" or u'' quotes as
1407 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1412 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1419 static const char *hexdigit
= "0123456789abcdef";
1421 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1425 p
= PyString_AS_STRING(repr
);
1429 *p
++ = (findchar(s
, size
, '\'') &&
1430 !findchar(s
, size
, '"')) ? '"' : '\'';
1432 while (size
-- > 0) {
1433 Py_UNICODE ch
= *s
++;
1437 (ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1] || ch
== '\\')) {
1442 #ifdef Py_UNICODE_WIDE
1443 /* Map 21-bit characters to '\U00xxxxxx' */
1444 else if (ch
>= 0x10000) {
1445 int offset
= p
- PyString_AS_STRING(repr
);
1447 /* Resize the string if necessary */
1448 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
1449 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
1451 p
= PyString_AS_STRING(repr
) + offset
;
1456 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
1457 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
1458 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
1459 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
1460 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
1461 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
1462 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
1463 *p
++ = hexdigit
[ch
& 0x0000000F];
1467 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1468 else if (ch
>= 0xD800 && ch
< 0xDC00) {
1474 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
1475 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
1478 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
1479 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
1480 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
1481 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
1482 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
1483 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
1484 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
1485 *p
++ = hexdigit
[ucs
& 0x0000000F];
1488 /* Fall through: isolated surrogates are copied as-is */
1493 /* Map 16-bit characters to '\uxxxx' */
1497 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
1498 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
1499 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1500 *p
++ = hexdigit
[ch
& 0x000F];
1503 /* Map special whitespace to '\t', \n', '\r' */
1504 else if (ch
== '\t') {
1508 else if (ch
== '\n') {
1512 else if (ch
== '\r') {
1517 /* Map non-printable US ASCII to '\xhh' */
1518 else if (ch
< ' ' || ch
>= 128) {
1521 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1522 *p
++ = hexdigit
[ch
& 0x000F];
1525 /* Copy everything else as-is */
1530 *p
++ = PyString_AS_STRING(repr
)[1];
1533 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
1543 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1546 return unicodeescape_string(s
, size
, 0);
1549 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1551 if (!PyUnicode_Check(unicode
)) {
1552 PyErr_BadArgument();
1555 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1556 PyUnicode_GET_SIZE(unicode
));
1559 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1561 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1566 Py_UNICODE
*p
, *buf
;
1570 /* Escaped strings will always be longer than the resulting
1571 Unicode string, so we start with size here and then reduce the
1572 length after conversion to the true value. */
1573 v
= _PyUnicode_New(size
);
1577 return (PyObject
*)v
;
1578 p
= buf
= PyUnicode_AS_UNICODE(v
);
1585 /* Non-escape characters are interpreted as Unicode ordinals */
1587 *p
++ = (unsigned char)*s
++;
1591 /* \u-escapes are only interpreted iff the number of leading
1592 backslashes if odd */
1597 *p
++ = (unsigned char)*s
++;
1599 if (((s
- bs
) & 1) == 0 ||
1607 /* \uXXXX with 4 hex digits */
1608 for (x
= 0, i
= 0; i
< 4; i
++) {
1609 c
= (unsigned char)s
[i
];
1611 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1612 "truncated \\uXXXX"))
1618 if (c
>= '0' && c
<= '9')
1620 else if (c
>= 'a' && c
<= 'f')
1628 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1630 return (PyObject
*)v
;
1637 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1644 static const char *hexdigit
= "0123456789abcdef";
1646 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1652 p
= q
= PyString_AS_STRING(repr
);
1653 while (size
-- > 0) {
1654 Py_UNICODE ch
= *s
++;
1655 /* Map 16-bit characters to '\uxxxx' */
1659 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1660 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1661 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1662 *p
++ = hexdigit
[ch
& 15];
1664 /* Copy everything else as-is */
1669 if (_PyString_Resize(&repr
, p
- q
))
1679 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
1681 if (!PyUnicode_Check(unicode
)) {
1682 PyErr_BadArgument();
1685 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1686 PyUnicode_GET_SIZE(unicode
));
1689 /* --- Latin-1 Codec ------------------------------------------------------ */
1691 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
1698 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1699 if (size
== 1 && *(unsigned char*)s
< 256) {
1700 Py_UNICODE r
= *(unsigned char*)s
;
1701 return PyUnicode_FromUnicode(&r
, 1);
1704 v
= _PyUnicode_New(size
);
1708 return (PyObject
*)v
;
1709 p
= PyUnicode_AS_UNICODE(v
);
1711 *p
++ = (unsigned char)*s
++;
1712 return (PyObject
*)v
;
1720 int latin1_encoding_error(const Py_UNICODE
**source
,
1723 const char *details
)
1725 if ((errors
== NULL
) ||
1726 (strcmp(errors
,"strict") == 0)) {
1727 PyErr_Format(PyExc_UnicodeError
,
1728 "Latin-1 encoding error: %.400s",
1732 else if (strcmp(errors
,"ignore") == 0) {
1735 else if (strcmp(errors
,"replace") == 0) {
1741 PyErr_Format(PyExc_ValueError
,
1742 "Latin-1 encoding error; "
1743 "unknown error handling code: %.400s",
1749 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
1756 repr
= PyString_FromStringAndSize(NULL
, size
);
1762 s
= PyString_AS_STRING(repr
);
1764 while (size
-- > 0) {
1765 Py_UNICODE ch
= *p
++;
1767 if (latin1_encoding_error(&p
, &s
, errors
,
1768 "ordinal not in range(256)"))
1774 /* Resize if error handling skipped some characters */
1775 if (s
- start
< PyString_GET_SIZE(repr
))
1776 if (_PyString_Resize(&repr
, s
- start
))
1785 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
1787 if (!PyUnicode_Check(unicode
)) {
1788 PyErr_BadArgument();
1791 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
1792 PyUnicode_GET_SIZE(unicode
),
1796 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1799 int ascii_decoding_error(const char **source
,
1802 const char *details
)
1804 if ((errors
== NULL
) ||
1805 (strcmp(errors
,"strict") == 0)) {
1806 PyErr_Format(PyExc_UnicodeError
,
1807 "ASCII decoding error: %.400s",
1811 else if (strcmp(errors
,"ignore") == 0) {
1814 else if (strcmp(errors
,"replace") == 0) {
1815 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1820 PyErr_Format(PyExc_ValueError
,
1821 "ASCII decoding error; "
1822 "unknown error handling code: %.400s",
1828 PyObject
*PyUnicode_DecodeASCII(const char *s
,
1835 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1836 if (size
== 1 && *(unsigned char*)s
< 128) {
1837 Py_UNICODE r
= *(unsigned char*)s
;
1838 return PyUnicode_FromUnicode(&r
, 1);
1841 v
= _PyUnicode_New(size
);
1845 return (PyObject
*)v
;
1846 p
= PyUnicode_AS_UNICODE(v
);
1847 while (size
-- > 0) {
1848 register unsigned char c
;
1850 c
= (unsigned char)*s
++;
1853 else if (ascii_decoding_error(&s
, &p
, errors
,
1854 "ordinal not in range(128)"))
1857 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
1858 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1860 return (PyObject
*)v
;
1868 int ascii_encoding_error(const Py_UNICODE
**source
,
1871 const char *details
)
1873 if ((errors
== NULL
) ||
1874 (strcmp(errors
,"strict") == 0)) {
1875 PyErr_Format(PyExc_UnicodeError
,
1876 "ASCII encoding error: %.400s",
1880 else if (strcmp(errors
,"ignore") == 0) {
1883 else if (strcmp(errors
,"replace") == 0) {
1889 PyErr_Format(PyExc_ValueError
,
1890 "ASCII encoding error; "
1891 "unknown error handling code: %.400s",
1897 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
1904 repr
= PyString_FromStringAndSize(NULL
, size
);
1910 s
= PyString_AS_STRING(repr
);
1912 while (size
-- > 0) {
1913 Py_UNICODE ch
= *p
++;
1915 if (ascii_encoding_error(&p
, &s
, errors
,
1916 "ordinal not in range(128)"))
1922 /* Resize if error handling skipped some characters */
1923 if (s
- start
< PyString_GET_SIZE(repr
))
1924 if (_PyString_Resize(&repr
, s
- start
))
1933 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
1935 if (!PyUnicode_Check(unicode
)) {
1936 PyErr_BadArgument();
1939 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
1940 PyUnicode_GET_SIZE(unicode
),
1944 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
1946 /* --- MBCS codecs for Windows -------------------------------------------- */
1948 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
1955 /* First get the size of the result */
1956 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
1957 if (size
> 0 && usize
==0)
1958 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1960 v
= _PyUnicode_New(usize
);
1964 return (PyObject
*)v
;
1965 p
= PyUnicode_AS_UNICODE(v
);
1966 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
1968 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1971 return (PyObject
*)v
;
1974 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
1982 /* If there are no characters, bail now! */
1984 return PyString_FromString("");
1986 /* First get the size of the result */
1987 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
1989 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
1991 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
1997 /* Do the conversion */
1998 s
= PyString_AS_STRING(repr
);
1999 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2001 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2006 #endif /* MS_WIN32 */
2008 /* --- Character Mapping Codec -------------------------------------------- */
2011 int charmap_decoding_error(const char **source
,
2014 const char *details
)
2016 if ((errors
== NULL
) ||
2017 (strcmp(errors
,"strict") == 0)) {
2018 PyErr_Format(PyExc_UnicodeError
,
2019 "charmap decoding error: %.400s",
2023 else if (strcmp(errors
,"ignore") == 0) {
2026 else if (strcmp(errors
,"replace") == 0) {
2027 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2032 PyErr_Format(PyExc_ValueError
,
2033 "charmap decoding error; "
2034 "unknown error handling code: %.400s",
2040 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2049 /* Default to Latin-1 */
2050 if (mapping
== NULL
)
2051 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2053 v
= _PyUnicode_New(size
);
2057 return (PyObject
*)v
;
2058 p
= PyUnicode_AS_UNICODE(v
);
2059 while (size
-- > 0) {
2060 unsigned char ch
= *s
++;
2063 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2064 w
= PyInt_FromLong((long)ch
);
2067 x
= PyObject_GetItem(mapping
, w
);
2070 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2071 /* No mapping found means: mapping is undefined. */
2080 if (PyInt_Check(x
)) {
2081 long value
= PyInt_AS_LONG(x
);
2082 if (value
< 0 || value
> 65535) {
2083 PyErr_SetString(PyExc_TypeError
,
2084 "character mapping must be in range(65536)");
2088 *p
++ = (Py_UNICODE
)value
;
2090 else if (x
== Py_None
) {
2091 /* undefined mapping */
2092 if (charmap_decoding_error(&s
, &p
, errors
,
2093 "character maps to <undefined>")) {
2098 else if (PyUnicode_Check(x
)) {
2099 int targetsize
= PyUnicode_GET_SIZE(x
);
2101 if (targetsize
== 1)
2103 *p
++ = *PyUnicode_AS_UNICODE(x
);
2105 else if (targetsize
> 1) {
2107 if (targetsize
> extrachars
) {
2109 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2110 int needed
= (targetsize
- extrachars
) + \
2112 extrachars
+= needed
;
2113 if (_PyUnicode_Resize(&v
,
2114 PyUnicode_GET_SIZE(v
) + needed
)) {
2118 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2121 PyUnicode_AS_UNICODE(x
),
2124 extrachars
-= targetsize
;
2126 /* 1-0 mapping: skip the character */
2129 /* wrong return value */
2130 PyErr_SetString(PyExc_TypeError
,
2131 "character mapping must return integer, None or unicode");
2137 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2138 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2140 return (PyObject
*)v
;
2148 int charmap_encoding_error(const Py_UNICODE
**source
,
2151 const char *details
)
2153 if ((errors
== NULL
) ||
2154 (strcmp(errors
,"strict") == 0)) {
2155 PyErr_Format(PyExc_UnicodeError
,
2156 "charmap encoding error: %.400s",
2160 else if (strcmp(errors
,"ignore") == 0) {
2163 else if (strcmp(errors
,"replace") == 0) {
2169 PyErr_Format(PyExc_ValueError
,
2170 "charmap encoding error; "
2171 "unknown error handling code: %.400s",
2177 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2186 /* Default to Latin-1 */
2187 if (mapping
== NULL
)
2188 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2190 v
= PyString_FromStringAndSize(NULL
, size
);
2195 s
= PyString_AS_STRING(v
);
2196 while (size
-- > 0) {
2197 Py_UNICODE ch
= *p
++;
2200 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2201 w
= PyInt_FromLong((long)ch
);
2204 x
= PyObject_GetItem(mapping
, w
);
2207 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2208 /* No mapping found means: mapping is undefined. */
2217 if (PyInt_Check(x
)) {
2218 long value
= PyInt_AS_LONG(x
);
2219 if (value
< 0 || value
> 255) {
2220 PyErr_SetString(PyExc_TypeError
,
2221 "character mapping must be in range(256)");
2227 else if (x
== Py_None
) {
2228 /* undefined mapping */
2229 if (charmap_encoding_error(&p
, &s
, errors
,
2230 "character maps to <undefined>")) {
2235 else if (PyString_Check(x
)) {
2236 int targetsize
= PyString_GET_SIZE(x
);
2238 if (targetsize
== 1)
2240 *s
++ = *PyString_AS_STRING(x
);
2242 else if (targetsize
> 1) {
2244 if (targetsize
> extrachars
) {
2246 int oldpos
= (int)(s
- PyString_AS_STRING(v
));
2247 int needed
= (targetsize
- extrachars
) + \
2249 extrachars
+= needed
;
2250 if (_PyString_Resize(&v
, PyString_GET_SIZE(v
) + needed
)) {
2254 s
= PyString_AS_STRING(v
) + oldpos
;
2256 memcpy(s
, PyString_AS_STRING(x
), targetsize
);
2258 extrachars
-= targetsize
;
2260 /* 1-0 mapping: skip the character */
2263 /* wrong return value */
2264 PyErr_SetString(PyExc_TypeError
,
2265 "character mapping must return integer, None or unicode");
2271 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2272 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2281 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2284 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2285 PyErr_BadArgument();
2288 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2289 PyUnicode_GET_SIZE(unicode
),
2295 int translate_error(const Py_UNICODE
**source
,
2298 const char *details
)
2300 if ((errors
== NULL
) ||
2301 (strcmp(errors
,"strict") == 0)) {
2302 PyErr_Format(PyExc_UnicodeError
,
2303 "translate error: %.400s",
2307 else if (strcmp(errors
,"ignore") == 0) {
2310 else if (strcmp(errors
,"replace") == 0) {
2316 PyErr_Format(PyExc_ValueError
,
2318 "unknown error handling code: %.400s",
2324 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2332 if (mapping
== NULL
) {
2333 PyErr_BadArgument();
2337 /* Output will never be longer than input */
2338 v
= _PyUnicode_New(size
);
2343 p
= PyUnicode_AS_UNICODE(v
);
2344 while (size
-- > 0) {
2345 Py_UNICODE ch
= *s
++;
2349 w
= PyInt_FromLong(ch
);
2352 x
= PyObject_GetItem(mapping
, w
);
2355 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2356 /* No mapping found: default to 1-1 mapping */
2366 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2367 else if (x
== Py_None
) {
2368 /* undefined mapping */
2369 if (translate_error(&s
, &p
, errors
,
2370 "character maps to <undefined>")) {
2375 else if (PyUnicode_Check(x
)) {
2376 if (PyUnicode_GET_SIZE(x
) != 1) {
2378 PyErr_SetString(PyExc_NotImplementedError
,
2379 "1-n mappings are currently not implemented");
2383 *p
++ = *PyUnicode_AS_UNICODE(x
);
2386 /* wrong return value */
2387 PyErr_SetString(PyExc_TypeError
,
2388 "translate mapping must return integer, None or unicode");
2394 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2395 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2399 return (PyObject
*)v
;
2406 PyObject
*PyUnicode_Translate(PyObject
*str
,
2412 str
= PyUnicode_FromObject(str
);
2415 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2416 PyUnicode_GET_SIZE(str
),
2427 /* --- Decimal Encoder ---------------------------------------------------- */
2429 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2434 Py_UNICODE
*p
, *end
;
2436 if (output
== NULL
) {
2437 PyErr_BadArgument();
2444 register Py_UNICODE ch
= *p
++;
2447 if (Py_UNICODE_ISSPACE(ch
)) {
2451 decimal
= Py_UNICODE_TODECIMAL(ch
);
2453 *output
++ = '0' + decimal
;
2456 if (0 < ch
&& ch
< 256) {
2457 *output
++ = (char)ch
;
2460 /* All other characters are considered invalid */
2461 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2462 PyErr_SetString(PyExc_ValueError
,
2463 "invalid decimal Unicode string");
2466 else if (strcmp(errors
, "ignore") == 0)
2468 else if (strcmp(errors
, "replace") == 0) {
2473 /* 0-terminate the output string */
2481 /* --- Helpers ------------------------------------------------------------ */
2484 int count(PyUnicodeObject
*self
,
2487 PyUnicodeObject
*substring
)
2492 start
+= self
->length
;
2495 if (end
> self
->length
)
2498 end
+= self
->length
;
2502 if (substring
->length
== 0)
2503 return (end
- start
+ 1);
2505 end
-= substring
->length
;
2507 while (start
<= end
)
2508 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2510 start
+= substring
->length
;
2517 int PyUnicode_Count(PyObject
*str
,
2524 str
= PyUnicode_FromObject(str
);
2527 substr
= PyUnicode_FromObject(substr
);
2528 if (substr
== NULL
) {
2533 result
= count((PyUnicodeObject
*)str
,
2535 (PyUnicodeObject
*)substr
);
2543 int findstring(PyUnicodeObject
*self
,
2544 PyUnicodeObject
*substring
,
2550 start
+= self
->length
;
2554 if (substring
->length
== 0)
2557 if (end
> self
->length
)
2560 end
+= self
->length
;
2564 end
-= substring
->length
;
2566 if (direction
< 0) {
2567 for (; end
>= start
; end
--)
2568 if (Py_UNICODE_MATCH(self
, end
, substring
))
2571 for (; start
<= end
; start
++)
2572 if (Py_UNICODE_MATCH(self
, start
, substring
))
2579 int PyUnicode_Find(PyObject
*str
,
2587 str
= PyUnicode_FromObject(str
);
2590 substr
= PyUnicode_FromObject(substr
);
2591 if (substr
== NULL
) {
2596 result
= findstring((PyUnicodeObject
*)str
,
2597 (PyUnicodeObject
*)substr
,
2598 start
, end
, direction
);
2605 int tailmatch(PyUnicodeObject
*self
,
2606 PyUnicodeObject
*substring
,
2612 start
+= self
->length
;
2616 if (substring
->length
== 0)
2619 if (end
> self
->length
)
2622 end
+= self
->length
;
2626 end
-= substring
->length
;
2630 if (direction
> 0) {
2631 if (Py_UNICODE_MATCH(self
, end
, substring
))
2634 if (Py_UNICODE_MATCH(self
, start
, substring
))
2641 int PyUnicode_Tailmatch(PyObject
*str
,
2649 str
= PyUnicode_FromObject(str
);
2652 substr
= PyUnicode_FromObject(substr
);
2653 if (substr
== NULL
) {
2658 result
= tailmatch((PyUnicodeObject
*)str
,
2659 (PyUnicodeObject
*)substr
,
2660 start
, end
, direction
);
2667 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2671 /* like wcschr, but doesn't stop at NULL characters */
2673 while (size
-- > 0) {
2682 /* Apply fixfct filter to the Unicode object self and return a
2683 reference to the modified object */
2686 PyObject
*fixup(PyUnicodeObject
*self
,
2687 int (*fixfct
)(PyUnicodeObject
*s
))
2692 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
2696 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
2699 /* fixfct should return TRUE if it modified the buffer. If
2700 FALSE, return a reference to the original buffer instead
2701 (to save space, not time) */
2704 return (PyObject
*) self
;
2706 return (PyObject
*) u
;
2710 int fixupper(PyUnicodeObject
*self
)
2712 int len
= self
->length
;
2713 Py_UNICODE
*s
= self
->str
;
2717 register Py_UNICODE ch
;
2719 ch
= Py_UNICODE_TOUPPER(*s
);
2731 int fixlower(PyUnicodeObject
*self
)
2733 int len
= self
->length
;
2734 Py_UNICODE
*s
= self
->str
;
2738 register Py_UNICODE ch
;
2740 ch
= Py_UNICODE_TOLOWER(*s
);
2752 int fixswapcase(PyUnicodeObject
*self
)
2754 int len
= self
->length
;
2755 Py_UNICODE
*s
= self
->str
;
2759 if (Py_UNICODE_ISUPPER(*s
)) {
2760 *s
= Py_UNICODE_TOLOWER(*s
);
2762 } else if (Py_UNICODE_ISLOWER(*s
)) {
2763 *s
= Py_UNICODE_TOUPPER(*s
);
2773 int fixcapitalize(PyUnicodeObject
*self
)
2775 int len
= self
->length
;
2776 Py_UNICODE
*s
= self
->str
;
2781 if (Py_UNICODE_ISLOWER(*s
)) {
2782 *s
= Py_UNICODE_TOUPPER(*s
);
2787 if (Py_UNICODE_ISUPPER(*s
)) {
2788 *s
= Py_UNICODE_TOLOWER(*s
);
2797 int fixtitle(PyUnicodeObject
*self
)
2799 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
2800 register Py_UNICODE
*e
;
2801 int previous_is_cased
;
2803 /* Shortcut for single character strings */
2804 if (PyUnicode_GET_SIZE(self
) == 1) {
2805 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
2814 e
= p
+ PyUnicode_GET_SIZE(self
);
2815 previous_is_cased
= 0;
2816 for (; p
< e
; p
++) {
2817 register const Py_UNICODE ch
= *p
;
2819 if (previous_is_cased
)
2820 *p
= Py_UNICODE_TOLOWER(ch
);
2822 *p
= Py_UNICODE_TOTITLE(ch
);
2824 if (Py_UNICODE_ISLOWER(ch
) ||
2825 Py_UNICODE_ISUPPER(ch
) ||
2826 Py_UNICODE_ISTITLE(ch
))
2827 previous_is_cased
= 1;
2829 previous_is_cased
= 0;
2834 PyObject
*PyUnicode_Join(PyObject
*separator
,
2839 PyUnicodeObject
*res
= NULL
;
2846 it
= PyObject_GetIter(seq
);
2850 if (separator
== NULL
) {
2851 Py_UNICODE blank
= ' ';
2856 separator
= PyUnicode_FromObject(separator
);
2857 if (separator
== NULL
)
2859 sep
= PyUnicode_AS_UNICODE(separator
);
2860 seplen
= PyUnicode_GET_SIZE(separator
);
2863 res
= _PyUnicode_New(sz
);
2866 p
= PyUnicode_AS_UNICODE(res
);
2869 for (i
= 0; ; ++i
) {
2871 PyObject
*item
= PyIter_Next(it
);
2873 if (PyErr_Occurred())
2877 if (!PyUnicode_Check(item
)) {
2879 v
= PyUnicode_FromObject(item
);
2885 itemlen
= PyUnicode_GET_SIZE(item
);
2886 while (reslen
+ itemlen
+ seplen
>= sz
) {
2887 if (_PyUnicode_Resize(&res
, sz
*2))
2890 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
2893 Py_UNICODE_COPY(p
, sep
, seplen
);
2897 Py_UNICODE_COPY(p
, PyUnicode_AS_UNICODE(item
), itemlen
);
2902 if (_PyUnicode_Resize(&res
, reslen
))
2905 Py_XDECREF(separator
);
2907 return (PyObject
*)res
;
2910 Py_XDECREF(separator
);
2917 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
2929 if (left
== 0 && right
== 0) {
2934 u
= _PyUnicode_New(left
+ self
->length
+ right
);
2937 Py_UNICODE_FILL(u
->str
, fill
, left
);
2938 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
2940 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
2946 #define SPLIT_APPEND(data, left, right) \
2947 str = PyUnicode_FromUnicode(data + left, right - left); \
2950 if (PyList_Append(list, str)) { \
2958 PyObject
*split_whitespace(PyUnicodeObject
*self
,
2964 int len
= self
->length
;
2967 for (i
= j
= 0; i
< len
; ) {
2969 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2972 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
2975 if (maxcount
-- <= 0)
2977 SPLIT_APPEND(self
->str
, j
, i
);
2978 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
2984 SPLIT_APPEND(self
->str
, j
, len
);
2993 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
3003 string
= PyUnicode_FromObject(string
);
3006 data
= PyUnicode_AS_UNICODE(string
);
3007 len
= PyUnicode_GET_SIZE(string
);
3009 list
= PyList_New(0);
3013 for (i
= j
= 0; i
< len
; ) {
3016 /* Find a line and append it */
3017 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
3020 /* Skip the line break reading CRLF as one line break */
3023 if (data
[i
] == '\r' && i
+ 1 < len
&&
3031 SPLIT_APPEND(data
, j
, eol
);
3035 SPLIT_APPEND(data
, j
, len
);
3048 PyObject
*split_char(PyUnicodeObject
*self
,
3055 int len
= self
->length
;
3058 for (i
= j
= 0; i
< len
; ) {
3059 if (self
->str
[i
] == ch
) {
3060 if (maxcount
-- <= 0)
3062 SPLIT_APPEND(self
->str
, j
, i
);
3068 SPLIT_APPEND(self
->str
, j
, len
);
3078 PyObject
*split_substring(PyUnicodeObject
*self
,
3080 PyUnicodeObject
*substring
,
3085 int len
= self
->length
;
3086 int sublen
= substring
->length
;
3089 for (i
= j
= 0; i
<= len
- sublen
; ) {
3090 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
3091 if (maxcount
-- <= 0)
3093 SPLIT_APPEND(self
->str
, j
, i
);
3099 SPLIT_APPEND(self
->str
, j
, len
);
3111 PyObject
*split(PyUnicodeObject
*self
,
3112 PyUnicodeObject
*substring
,
3120 list
= PyList_New(0);
3124 if (substring
== NULL
)
3125 return split_whitespace(self
,list
,maxcount
);
3127 else if (substring
->length
== 1)
3128 return split_char(self
,list
,substring
->str
[0],maxcount
);
3130 else if (substring
->length
== 0) {
3132 PyErr_SetString(PyExc_ValueError
, "empty separator");
3136 return split_substring(self
,list
,substring
,maxcount
);
3140 PyObject
*strip(PyUnicodeObject
*self
,
3144 Py_UNICODE
*p
= self
->str
;
3146 int end
= self
->length
;
3149 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
3153 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
3156 if (start
== 0 && end
== self
->length
) {
3157 /* couldn't strip anything off, return original string */
3159 return (PyObject
*) self
;
3162 return (PyObject
*) PyUnicode_FromUnicode(
3169 PyObject
*replace(PyUnicodeObject
*self
,
3170 PyUnicodeObject
*str1
,
3171 PyUnicodeObject
*str2
,
3179 if (str1
->length
== 1 && str2
->length
== 1) {
3182 /* replace characters */
3183 if (!findchar(self
->str
, self
->length
, str1
->str
[0])) {
3184 /* nothing to replace, return original string */
3188 Py_UNICODE u1
= str1
->str
[0];
3189 Py_UNICODE u2
= str2
->str
[0];
3191 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3196 Py_UNICODE_COPY(u
->str
, self
->str
,
3198 for (i
= 0; i
< u
->length
; i
++)
3199 if (u
->str
[i
] == u1
) {
3211 /* replace strings */
3212 n
= count(self
, 0, self
->length
, str1
);
3216 /* nothing to replace, return original string */
3221 self
->length
+ n
* (str2
->length
- str1
->length
));
3225 while (i
<= self
->length
- str1
->length
)
3226 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3227 /* replace string segment */
3228 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3232 /* copy remaining part */
3233 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3237 *p
++ = self
->str
[i
++];
3242 return (PyObject
*) u
;
3245 /* --- Unicode Object Methods --------------------------------------------- */
3247 static char title__doc__
[] =
3248 "S.title() -> unicode\n\
3250 Return a titlecased version of S, i.e. words start with title case\n\
3251 characters, all remaining cased characters have lower case.";
3254 unicode_title(PyUnicodeObject
*self
, PyObject
*args
)
3256 if (!PyArg_NoArgs(args
))
3258 return fixup(self
, fixtitle
);
3261 static char capitalize__doc__
[] =
3262 "S.capitalize() -> unicode\n\
3264 Return a capitalized version of S, i.e. make the first character\n\
3268 unicode_capitalize(PyUnicodeObject
*self
, PyObject
*args
)
3270 if (!PyArg_NoArgs(args
))
3272 return fixup(self
, fixcapitalize
);
3276 static char capwords__doc__
[] =
3277 "S.capwords() -> unicode\n\
3279 Apply .capitalize() to all words in S and return the result with\n\
3280 normalized whitespace (all whitespace strings are replaced by ' ').";
3283 unicode_capwords(PyUnicodeObject
*self
, PyObject
*args
)
3289 if (!PyArg_NoArgs(args
))
3292 /* Split into words */
3293 list
= split(self
, NULL
, -1);
3297 /* Capitalize each word */
3298 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3299 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3303 Py_DECREF(PyList_GET_ITEM(list
, i
));
3304 PyList_SET_ITEM(list
, i
, item
);
3307 /* Join the words to form a new string */
3308 item
= PyUnicode_Join(NULL
, list
);
3312 return (PyObject
*)item
;
3316 static char center__doc__
[] =
3317 "S.center(width) -> unicode\n\
3319 Return S centered in a Unicode string of length width. Padding is done\n\
3323 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3328 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3331 if (self
->length
>= width
) {
3333 return (PyObject
*) self
;
3336 marg
= width
- self
->length
;
3337 left
= marg
/ 2 + (marg
& width
& 1);
3339 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3344 /* This code should go into some future Unicode collation support
3345 module. The basic comparison should compare ordinals on a naive
3346 basis (this is what Java does and thus JPython too). */
3348 /* speedy UTF-16 code point order comparison */
3350 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3352 static short utf16Fixup
[32] =
3354 0, 0, 0, 0, 0, 0, 0, 0,
3355 0, 0, 0, 0, 0, 0, 0, 0,
3356 0, 0, 0, 0, 0, 0, 0, 0,
3357 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3361 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3365 Py_UNICODE
*s1
= str1
->str
;
3366 Py_UNICODE
*s2
= str2
->str
;
3368 len1
= str1
->length
;
3369 len2
= str2
->length
;
3371 while (len1
> 0 && len2
> 0) {
3377 if (c1
> (1<<11) * 26)
3378 c1
+= utf16Fixup
[c1
>>11];
3379 if (c2
> (1<<11) * 26)
3380 c2
+= utf16Fixup
[c2
>>11];
3381 /* now c1 and c2 are in UTF-32-compatible order */
3384 return (c1
< c2
) ? -1 : 1;
3389 return (len1
< len2
) ? -1 : (len1
!= len2
);
3395 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3397 register int len1
, len2
;
3399 Py_UNICODE
*s1
= str1
->str
;
3400 Py_UNICODE
*s2
= str2
->str
;
3402 len1
= str1
->length
;
3403 len2
= str2
->length
;
3405 while (len1
> 0 && len2
> 0) {
3412 return (c1
< c2
) ? -1 : 1;
3417 return (len1
< len2
) ? -1 : (len1
!= len2
);
3422 int PyUnicode_Compare(PyObject
*left
,
3425 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3428 /* Coerce the two arguments */
3429 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3432 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3436 /* Shortcut for empty or interned objects */
3443 result
= unicode_compare(u
, v
);
3455 int PyUnicode_Contains(PyObject
*container
,
3458 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3460 register const Py_UNICODE
*p
, *e
;
3461 register Py_UNICODE ch
;
3463 /* Coerce the two arguments */
3464 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3466 PyErr_SetString(PyExc_TypeError
,
3467 "'in <string>' requires character as left operand");
3470 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3477 if (PyUnicode_GET_SIZE(v
) != 1) {
3478 PyErr_SetString(PyExc_TypeError
,
3479 "'in <string>' requires character as left operand");
3482 ch
= *PyUnicode_AS_UNICODE(v
);
3483 p
= PyUnicode_AS_UNICODE(u
);
3484 e
= p
+ PyUnicode_GET_SIZE(u
);
3503 /* Concat to string or Unicode object giving a new Unicode object. */
3505 PyObject
*PyUnicode_Concat(PyObject
*left
,
3508 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3510 /* Coerce the two arguments */
3511 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3514 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3519 if (v
== unicode_empty
) {
3521 return (PyObject
*)u
;
3523 if (u
== unicode_empty
) {
3525 return (PyObject
*)v
;
3528 /* Concat the two Unicode strings */
3529 w
= _PyUnicode_New(u
->length
+ v
->length
);
3532 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3533 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3537 return (PyObject
*)w
;
3545 static char count__doc__
[] =
3546 "S.count(sub[, start[, end]]) -> int\n\
3548 Return the number of occurrences of substring sub in Unicode string\n\
3549 S[start:end]. Optional arguments start and end are\n\
3550 interpreted as in slice notation.";
3553 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3555 PyUnicodeObject
*substring
;
3560 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3561 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3564 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3565 (PyObject
*)substring
);
3566 if (substring
== NULL
)
3570 start
+= self
->length
;
3573 if (end
> self
->length
)
3576 end
+= self
->length
;
3580 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3582 Py_DECREF(substring
);
3586 static char encode__doc__
[] =
3587 "S.encode([encoding[,errors]]) -> string\n\
3589 Return an encoded string version of S. Default encoding is the current\n\
3590 default string encoding. errors may be given to set a different error\n\
3591 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3592 a ValueError. Other possible values are 'ignore' and 'replace'.";
3595 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3597 char *encoding
= NULL
;
3598 char *errors
= NULL
;
3599 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3601 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3604 static char expandtabs__doc__
[] =
3605 "S.expandtabs([tabsize]) -> unicode\n\
3607 Return a copy of S where all tab characters are expanded using spaces.\n\
3608 If tabsize is not given, a tab size of 8 characters is assumed.";
3611 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3620 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3623 /* First pass: determine size of output string */
3625 e
= self
->str
+ self
->length
;
3626 for (p
= self
->str
; p
< e
; p
++)
3629 j
+= tabsize
- (j
% tabsize
);
3633 if (*p
== '\n' || *p
== '\r') {
3639 /* Second pass: create output string and fill it */
3640 u
= _PyUnicode_New(i
+ j
);
3647 for (p
= self
->str
; p
< e
; p
++)
3650 i
= tabsize
- (j
% tabsize
);
3659 if (*p
== '\n' || *p
== '\r')
3663 return (PyObject
*) u
;
3666 static char find__doc__
[] =
3667 "S.find(sub [,start [,end]]) -> int\n\
3669 Return the lowest index in S where substring sub is found,\n\
3670 such that sub is contained within s[start,end]. Optional\n\
3671 arguments start and end are interpreted as in slice notation.\n\
3673 Return -1 on failure.";
3676 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
3678 PyUnicodeObject
*substring
;
3683 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
3684 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3686 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3687 (PyObject
*)substring
);
3688 if (substring
== NULL
)
3691 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
3693 Py_DECREF(substring
);
3698 unicode_getitem(PyUnicodeObject
*self
, int index
)
3700 if (index
< 0 || index
>= self
->length
) {
3701 PyErr_SetString(PyExc_IndexError
, "string index out of range");
3705 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
3709 unicode_hash(PyUnicodeObject
*self
)
3711 /* Since Unicode objects compare equal to their ASCII string
3712 counterparts, they should use the individual character values
3713 as basis for their hash value. This is needed to assure that
3714 strings and Unicode objects behave in the same way as
3718 register Py_UNICODE
*p
;
3721 if (self
->hash
!= -1)
3723 len
= PyUnicode_GET_SIZE(self
);
3724 p
= PyUnicode_AS_UNICODE(self
);
3727 x
= (1000003*x
) ^ *p
++;
3728 x
^= PyUnicode_GET_SIZE(self
);
3735 static char index__doc__
[] =
3736 "S.index(sub [,start [,end]]) -> int\n\
3738 Like S.find() but raise ValueError when the substring is not found.";
3741 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
3744 PyUnicodeObject
*substring
;
3748 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
3749 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3752 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3753 (PyObject
*)substring
);
3754 if (substring
== NULL
)
3757 result
= findstring(self
, substring
, start
, end
, 1);
3759 Py_DECREF(substring
);
3761 PyErr_SetString(PyExc_ValueError
, "substring not found");
3764 return PyInt_FromLong(result
);
3767 static char islower__doc__
[] =
3768 "S.islower() -> int\n\
3770 Return 1 if all cased characters in S are lowercase and there is\n\
3771 at least one cased character in S, 0 otherwise.";
3774 unicode_islower(PyUnicodeObject
*self
, PyObject
*args
)
3776 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3777 register const Py_UNICODE
*e
;
3780 if (!PyArg_NoArgs(args
))
3783 /* Shortcut for single character strings */
3784 if (PyUnicode_GET_SIZE(self
) == 1)
3785 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
3787 /* Special case for empty strings */
3788 if (PyString_GET_SIZE(self
) == 0)
3789 return PyInt_FromLong(0);
3791 e
= p
+ PyUnicode_GET_SIZE(self
);
3793 for (; p
< e
; p
++) {
3794 register const Py_UNICODE ch
= *p
;
3796 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
3797 return PyInt_FromLong(0);
3798 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
3801 return PyInt_FromLong(cased
);
3804 static char isupper__doc__
[] =
3805 "S.isupper() -> int\n\
3807 Return 1 if all cased characters in S are uppercase and there is\n\
3808 at least one cased character in S, 0 otherwise.";
3811 unicode_isupper(PyUnicodeObject
*self
, PyObject
*args
)
3813 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3814 register const Py_UNICODE
*e
;
3817 if (!PyArg_NoArgs(args
))
3820 /* Shortcut for single character strings */
3821 if (PyUnicode_GET_SIZE(self
) == 1)
3822 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
3824 /* Special case for empty strings */
3825 if (PyString_GET_SIZE(self
) == 0)
3826 return PyInt_FromLong(0);
3828 e
= p
+ PyUnicode_GET_SIZE(self
);
3830 for (; p
< e
; p
++) {
3831 register const Py_UNICODE ch
= *p
;
3833 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
3834 return PyInt_FromLong(0);
3835 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
3838 return PyInt_FromLong(cased
);
3841 static char istitle__doc__
[] =
3842 "S.istitle() -> int\n\
3844 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3845 may only follow uncased characters and lowercase characters only cased\n\
3846 ones. Return 0 otherwise.";
3849 unicode_istitle(PyUnicodeObject
*self
, PyObject
*args
)
3851 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3852 register const Py_UNICODE
*e
;
3853 int cased
, previous_is_cased
;
3855 if (!PyArg_NoArgs(args
))
3858 /* Shortcut for single character strings */
3859 if (PyUnicode_GET_SIZE(self
) == 1)
3860 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
3861 (Py_UNICODE_ISUPPER(*p
) != 0));
3863 /* Special case for empty strings */
3864 if (PyString_GET_SIZE(self
) == 0)
3865 return PyInt_FromLong(0);
3867 e
= p
+ PyUnicode_GET_SIZE(self
);
3869 previous_is_cased
= 0;
3870 for (; p
< e
; p
++) {
3871 register const Py_UNICODE ch
= *p
;
3873 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
3874 if (previous_is_cased
)
3875 return PyInt_FromLong(0);
3876 previous_is_cased
= 1;
3879 else if (Py_UNICODE_ISLOWER(ch
)) {
3880 if (!previous_is_cased
)
3881 return PyInt_FromLong(0);
3882 previous_is_cased
= 1;
3886 previous_is_cased
= 0;
3888 return PyInt_FromLong(cased
);
3891 static char isspace__doc__
[] =
3892 "S.isspace() -> int\n\
3894 Return 1 if there are only whitespace characters in S,\n\
3898 unicode_isspace(PyUnicodeObject
*self
, PyObject
*args
)
3900 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3901 register const Py_UNICODE
*e
;
3903 if (!PyArg_NoArgs(args
))
3906 /* Shortcut for single character strings */
3907 if (PyUnicode_GET_SIZE(self
) == 1 &&
3908 Py_UNICODE_ISSPACE(*p
))
3909 return PyInt_FromLong(1);
3911 /* Special case for empty strings */
3912 if (PyString_GET_SIZE(self
) == 0)
3913 return PyInt_FromLong(0);
3915 e
= p
+ PyUnicode_GET_SIZE(self
);
3916 for (; p
< e
; p
++) {
3917 if (!Py_UNICODE_ISSPACE(*p
))
3918 return PyInt_FromLong(0);
3920 return PyInt_FromLong(1);
3923 static char isalpha__doc__
[] =
3924 "S.isalpha() -> int\n\
3926 Return 1 if all characters in S are alphabetic\n\
3927 and there is at least one character in S, 0 otherwise.";
3930 unicode_isalpha(PyUnicodeObject
*self
, PyObject
*args
)
3932 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3933 register const Py_UNICODE
*e
;
3935 if (!PyArg_NoArgs(args
))
3938 /* Shortcut for single character strings */
3939 if (PyUnicode_GET_SIZE(self
) == 1 &&
3940 Py_UNICODE_ISALPHA(*p
))
3941 return PyInt_FromLong(1);
3943 /* Special case for empty strings */
3944 if (PyString_GET_SIZE(self
) == 0)
3945 return PyInt_FromLong(0);
3947 e
= p
+ PyUnicode_GET_SIZE(self
);
3948 for (; p
< e
; p
++) {
3949 if (!Py_UNICODE_ISALPHA(*p
))
3950 return PyInt_FromLong(0);
3952 return PyInt_FromLong(1);
3955 static char isalnum__doc__
[] =
3956 "S.isalnum() -> int\n\
3958 Return 1 if all characters in S are alphanumeric\n\
3959 and there is at least one character in S, 0 otherwise.";
3962 unicode_isalnum(PyUnicodeObject
*self
, PyObject
*args
)
3964 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3965 register const Py_UNICODE
*e
;
3967 if (!PyArg_NoArgs(args
))
3970 /* Shortcut for single character strings */
3971 if (PyUnicode_GET_SIZE(self
) == 1 &&
3972 Py_UNICODE_ISALNUM(*p
))
3973 return PyInt_FromLong(1);
3975 /* Special case for empty strings */
3976 if (PyString_GET_SIZE(self
) == 0)
3977 return PyInt_FromLong(0);
3979 e
= p
+ PyUnicode_GET_SIZE(self
);
3980 for (; p
< e
; p
++) {
3981 if (!Py_UNICODE_ISALNUM(*p
))
3982 return PyInt_FromLong(0);
3984 return PyInt_FromLong(1);
3987 static char isdecimal__doc__
[] =
3988 "S.isdecimal() -> int\n\
3990 Return 1 if there are only decimal characters in S,\n\
3994 unicode_isdecimal(PyUnicodeObject
*self
, PyObject
*args
)
3996 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3997 register const Py_UNICODE
*e
;
3999 if (!PyArg_NoArgs(args
))
4002 /* Shortcut for single character strings */
4003 if (PyUnicode_GET_SIZE(self
) == 1 &&
4004 Py_UNICODE_ISDECIMAL(*p
))
4005 return PyInt_FromLong(1);
4007 /* Special case for empty strings */
4008 if (PyString_GET_SIZE(self
) == 0)
4009 return PyInt_FromLong(0);
4011 e
= p
+ PyUnicode_GET_SIZE(self
);
4012 for (; p
< e
; p
++) {
4013 if (!Py_UNICODE_ISDECIMAL(*p
))
4014 return PyInt_FromLong(0);
4016 return PyInt_FromLong(1);
4019 static char isdigit__doc__
[] =
4020 "S.isdigit() -> int\n\
4022 Return 1 if there are only digit characters in S,\n\
4026 unicode_isdigit(PyUnicodeObject
*self
, PyObject
*args
)
4028 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4029 register const Py_UNICODE
*e
;
4031 if (!PyArg_NoArgs(args
))
4034 /* Shortcut for single character strings */
4035 if (PyUnicode_GET_SIZE(self
) == 1 &&
4036 Py_UNICODE_ISDIGIT(*p
))
4037 return PyInt_FromLong(1);
4039 /* Special case for empty strings */
4040 if (PyString_GET_SIZE(self
) == 0)
4041 return PyInt_FromLong(0);
4043 e
= p
+ PyUnicode_GET_SIZE(self
);
4044 for (; p
< e
; p
++) {
4045 if (!Py_UNICODE_ISDIGIT(*p
))
4046 return PyInt_FromLong(0);
4048 return PyInt_FromLong(1);
4051 static char isnumeric__doc__
[] =
4052 "S.isnumeric() -> int\n\
4054 Return 1 if there are only numeric characters in S,\n\
4058 unicode_isnumeric(PyUnicodeObject
*self
, PyObject
*args
)
4060 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4061 register const Py_UNICODE
*e
;
4063 if (!PyArg_NoArgs(args
))
4066 /* Shortcut for single character strings */
4067 if (PyUnicode_GET_SIZE(self
) == 1 &&
4068 Py_UNICODE_ISNUMERIC(*p
))
4069 return PyInt_FromLong(1);
4071 /* Special case for empty strings */
4072 if (PyString_GET_SIZE(self
) == 0)
4073 return PyInt_FromLong(0);
4075 e
= p
+ PyUnicode_GET_SIZE(self
);
4076 for (; p
< e
; p
++) {
4077 if (!Py_UNICODE_ISNUMERIC(*p
))
4078 return PyInt_FromLong(0);
4080 return PyInt_FromLong(1);
4083 static char join__doc__
[] =
4084 "S.join(sequence) -> unicode\n\
4086 Return a string which is the concatenation of the strings in the\n\
4087 sequence. The separator between elements is S.";
4090 unicode_join(PyUnicodeObject
*self
, PyObject
*args
)
4093 if (!PyArg_ParseTuple(args
, "O:join", &data
))
4096 return PyUnicode_Join((PyObject
*)self
, data
);
4100 unicode_length(PyUnicodeObject
*self
)
4102 return self
->length
;
4105 static char ljust__doc__
[] =
4106 "S.ljust(width) -> unicode\n\
4108 Return S left justified in a Unicode string of length width. Padding is\n\
4109 done using spaces.";
4112 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
4115 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
4118 if (self
->length
>= width
) {
4120 return (PyObject
*) self
;
4123 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
4126 static char lower__doc__
[] =
4127 "S.lower() -> unicode\n\
4129 Return a copy of the string S converted to lowercase.";
4132 unicode_lower(PyUnicodeObject
*self
, PyObject
*args
)
4134 if (!PyArg_NoArgs(args
))
4136 return fixup(self
, fixlower
);
4139 static char lstrip__doc__
[] =
4140 "S.lstrip() -> unicode\n\
4142 Return a copy of the string S with leading whitespace removed.";
4145 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
4147 if (!PyArg_NoArgs(args
))
4149 return strip(self
, 1, 0);
4153 unicode_repeat(PyUnicodeObject
*str
, int len
)
4164 /* no repeat, return original string */
4166 return (PyObject
*) str
;
4169 /* ensure # of chars needed doesn't overflow int and # of bytes
4170 * needed doesn't overflow size_t
4172 nchars
= len
* str
->length
;
4173 if (len
&& nchars
/ len
!= str
->length
) {
4174 PyErr_SetString(PyExc_OverflowError
,
4175 "repeated string is too long");
4178 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
4179 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
4180 PyErr_SetString(PyExc_OverflowError
,
4181 "repeated string is too long");
4184 u
= _PyUnicode_New(nchars
);
4191 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4195 return (PyObject
*) u
;
4198 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4208 self
= PyUnicode_FromObject(obj
);
4211 str1
= PyUnicode_FromObject(subobj
);
4216 str2
= PyUnicode_FromObject(replobj
);
4222 result
= replace((PyUnicodeObject
*)self
,
4223 (PyUnicodeObject
*)str1
,
4224 (PyUnicodeObject
*)str2
,
4232 static char replace__doc__
[] =
4233 "S.replace (old, new[, maxsplit]) -> unicode\n\
4235 Return a copy of S with all occurrences of substring\n\
4236 old replaced by new. If the optional argument maxsplit is\n\
4237 given, only the first maxsplit occurrences are replaced.";
4240 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4242 PyUnicodeObject
*str1
;
4243 PyUnicodeObject
*str2
;
4247 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4249 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4252 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4256 result
= replace(self
, str1
, str2
, maxcount
);
4264 PyObject
*unicode_repr(PyObject
*unicode
)
4266 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4267 PyUnicode_GET_SIZE(unicode
),
4271 static char rfind__doc__
[] =
4272 "S.rfind(sub [,start [,end]]) -> int\n\
4274 Return the highest index in S where substring sub is found,\n\
4275 such that sub is contained within s[start,end]. Optional\n\
4276 arguments start and end are interpreted as in slice notation.\n\
4278 Return -1 on failure.";
4281 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4283 PyUnicodeObject
*substring
;
4288 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4289 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4291 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4292 (PyObject
*)substring
);
4293 if (substring
== NULL
)
4296 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4298 Py_DECREF(substring
);
4302 static char rindex__doc__
[] =
4303 "S.rindex(sub [,start [,end]]) -> int\n\
4305 Like S.rfind() but raise ValueError when the substring is not found.";
4308 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4311 PyUnicodeObject
*substring
;
4315 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4316 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4318 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4319 (PyObject
*)substring
);
4320 if (substring
== NULL
)
4323 result
= findstring(self
, substring
, start
, end
, -1);
4325 Py_DECREF(substring
);
4327 PyErr_SetString(PyExc_ValueError
, "substring not found");
4330 return PyInt_FromLong(result
);
4333 static char rjust__doc__
[] =
4334 "S.rjust(width) -> unicode\n\
4336 Return S right justified in a Unicode string of length width. Padding is\n\
4337 done using spaces.";
4340 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4343 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4346 if (self
->length
>= width
) {
4348 return (PyObject
*) self
;
4351 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4354 static char rstrip__doc__
[] =
4355 "S.rstrip() -> unicode\n\
4357 Return a copy of the string S with trailing whitespace removed.";
4360 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
4362 if (!PyArg_NoArgs(args
))
4364 return strip(self
, 0, 1);
4368 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4370 /* standard clamping */
4375 if (end
> self
->length
)
4377 if (start
== 0 && end
== self
->length
) {
4378 /* full slice, return original string */
4380 return (PyObject
*) self
;
4385 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4389 PyObject
*PyUnicode_Split(PyObject
*s
,
4395 s
= PyUnicode_FromObject(s
);
4399 sep
= PyUnicode_FromObject(sep
);
4406 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4413 static char split__doc__
[] =
4414 "S.split([sep [,maxsplit]]) -> list of strings\n\
4416 Return a list of the words in S, using sep as the\n\
4417 delimiter string. If maxsplit is given, at most maxsplit\n\
4418 splits are done. If sep is not specified, any whitespace string\n\
4422 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4424 PyObject
*substring
= Py_None
;
4427 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4430 if (substring
== Py_None
)
4431 return split(self
, NULL
, maxcount
);
4432 else if (PyUnicode_Check(substring
))
4433 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4435 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
4438 static char splitlines__doc__
[] =
4439 "S.splitlines([keepends]]) -> list of strings\n\
4441 Return a list of the lines in S, breaking at line boundaries.\n\
4442 Line breaks are not included in the resulting list unless keepends\n\
4443 is given and true.";
4446 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4450 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4453 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4457 PyObject
*unicode_str(PyUnicodeObject
*self
)
4459 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4462 static char strip__doc__
[] =
4463 "S.strip() -> unicode\n\
4465 Return a copy of S with leading and trailing whitespace removed.";
4468 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
4470 if (!PyArg_NoArgs(args
))
4472 return strip(self
, 1, 1);
4475 static char swapcase__doc__
[] =
4476 "S.swapcase() -> unicode\n\
4478 Return a copy of S with uppercase characters converted to lowercase\n\
4482 unicode_swapcase(PyUnicodeObject
*self
, PyObject
*args
)
4484 if (!PyArg_NoArgs(args
))
4486 return fixup(self
, fixswapcase
);
4489 static char translate__doc__
[] =
4490 "S.translate(table) -> unicode\n\
4492 Return a copy of the string S, where all characters have been mapped\n\
4493 through the given translation table, which must be a mapping of\n\
4494 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4495 are left untouched. Characters mapped to None are deleted.";
4498 unicode_translate(PyUnicodeObject
*self
, PyObject
*args
)
4502 if (!PyArg_ParseTuple(args
, "O:translate", &table
))
4504 return PyUnicode_TranslateCharmap(self
->str
,
4510 static char upper__doc__
[] =
4511 "S.upper() -> unicode\n\
4513 Return a copy of S converted to uppercase.";
4516 unicode_upper(PyUnicodeObject
*self
, PyObject
*args
)
4518 if (!PyArg_NoArgs(args
))
4520 return fixup(self
, fixupper
);
4524 static char zfill__doc__
[] =
4525 "S.zfill(width) -> unicode\n\
4527 Pad a numeric string x with zeros on the left, to fill a field\n\
4528 of the specified width. The string x is never truncated.";
4531 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4537 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4540 if (self
->length
>= width
) {
4542 return (PyObject
*) self
;
4545 fill
= width
- self
->length
;
4547 u
= pad(self
, fill
, 0, '0');
4549 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4550 /* move sign to beginning of string */
4551 u
->str
[0] = u
->str
[fill
];
4555 return (PyObject
*) u
;
4561 unicode_freelistsize(PyUnicodeObject
*self
, PyObject
*args
)
4563 if (!PyArg_NoArgs(args
))
4565 return PyInt_FromLong(unicode_freelist_size
);
4569 static char startswith__doc__
[] =
4570 "S.startswith(prefix[, start[, end]]) -> int\n\
4572 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4573 optional start, test S beginning at that position. With optional end, stop\n\
4574 comparing S at that position.";
4577 unicode_startswith(PyUnicodeObject
*self
,
4580 PyUnicodeObject
*substring
;
4585 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4586 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4588 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4589 (PyObject
*)substring
);
4590 if (substring
== NULL
)
4593 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4595 Py_DECREF(substring
);
4600 static char endswith__doc__
[] =
4601 "S.endswith(suffix[, start[, end]]) -> int\n\
4603 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4604 optional start, test S beginning at that position. With optional end, stop\n\
4605 comparing S at that position.";
4608 unicode_endswith(PyUnicodeObject
*self
,
4611 PyUnicodeObject
*substring
;
4616 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4617 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4619 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4620 (PyObject
*)substring
);
4621 if (substring
== NULL
)
4624 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4626 Py_DECREF(substring
);
4631 static PyMethodDef unicode_methods
[] = {
4633 /* Order is according to common usage: often used methods should
4634 appear first, since lookup is done sequentially. */
4636 {"encode", (PyCFunction
) unicode_encode
, 1, encode__doc__
},
4637 {"replace", (PyCFunction
) unicode_replace
, 1, replace__doc__
},
4638 {"split", (PyCFunction
) unicode_split
, 1, split__doc__
},
4639 {"join", (PyCFunction
) unicode_join
, 1, join__doc__
},
4640 {"capitalize", (PyCFunction
) unicode_capitalize
, 0, capitalize__doc__
},
4641 {"title", (PyCFunction
) unicode_title
, 0, title__doc__
},
4642 {"center", (PyCFunction
) unicode_center
, 1, center__doc__
},
4643 {"count", (PyCFunction
) unicode_count
, 1, count__doc__
},
4644 {"expandtabs", (PyCFunction
) unicode_expandtabs
, 1, expandtabs__doc__
},
4645 {"find", (PyCFunction
) unicode_find
, 1, find__doc__
},
4646 {"index", (PyCFunction
) unicode_index
, 1, index__doc__
},
4647 {"ljust", (PyCFunction
) unicode_ljust
, 1, ljust__doc__
},
4648 {"lower", (PyCFunction
) unicode_lower
, 0, lower__doc__
},
4649 {"lstrip", (PyCFunction
) unicode_lstrip
, 0, lstrip__doc__
},
4650 /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4651 {"rfind", (PyCFunction
) unicode_rfind
, 1, rfind__doc__
},
4652 {"rindex", (PyCFunction
) unicode_rindex
, 1, rindex__doc__
},
4653 {"rjust", (PyCFunction
) unicode_rjust
, 1, rjust__doc__
},
4654 {"rstrip", (PyCFunction
) unicode_rstrip
, 0, rstrip__doc__
},
4655 {"splitlines", (PyCFunction
) unicode_splitlines
, 1, splitlines__doc__
},
4656 {"strip", (PyCFunction
) unicode_strip
, 0, strip__doc__
},
4657 {"swapcase", (PyCFunction
) unicode_swapcase
, 0, swapcase__doc__
},
4658 {"translate", (PyCFunction
) unicode_translate
, 1, translate__doc__
},
4659 {"upper", (PyCFunction
) unicode_upper
, 0, upper__doc__
},
4660 {"startswith", (PyCFunction
) unicode_startswith
, 1, startswith__doc__
},
4661 {"endswith", (PyCFunction
) unicode_endswith
, 1, endswith__doc__
},
4662 {"islower", (PyCFunction
) unicode_islower
, 0, islower__doc__
},
4663 {"isupper", (PyCFunction
) unicode_isupper
, 0, isupper__doc__
},
4664 {"istitle", (PyCFunction
) unicode_istitle
, 0, istitle__doc__
},
4665 {"isspace", (PyCFunction
) unicode_isspace
, 0, isspace__doc__
},
4666 {"isdecimal", (PyCFunction
) unicode_isdecimal
, 0, isdecimal__doc__
},
4667 {"isdigit", (PyCFunction
) unicode_isdigit
, 0, isdigit__doc__
},
4668 {"isnumeric", (PyCFunction
) unicode_isnumeric
, 0, isnumeric__doc__
},
4669 {"isalpha", (PyCFunction
) unicode_isalpha
, 0, isalpha__doc__
},
4670 {"isalnum", (PyCFunction
) unicode_isalnum
, 0, isalnum__doc__
},
4672 {"zfill", (PyCFunction
) unicode_zfill
, 1, zfill__doc__
},
4673 {"capwords", (PyCFunction
) unicode_capwords
, 0, capwords__doc__
},
4677 /* This one is just used for debugging the implementation. */
4678 {"freelistsize", (PyCFunction
) unicode_freelistsize
, 0},
4684 static PySequenceMethods unicode_as_sequence
= {
4685 (inquiry
) unicode_length
, /* sq_length */
4686 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4687 (intargfunc
) unicode_repeat
, /* sq_repeat */
4688 (intargfunc
) unicode_getitem
, /* sq_item */
4689 (intintargfunc
) unicode_slice
, /* sq_slice */
4690 0, /* sq_ass_item */
4691 0, /* sq_ass_slice */
4692 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4696 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4701 PyErr_SetString(PyExc_SystemError
,
4702 "accessing non-existent unicode segment");
4705 *ptr
= (void *) self
->str
;
4706 return PyUnicode_GET_DATA_SIZE(self
);
4710 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4713 PyErr_SetString(PyExc_TypeError
,
4714 "cannot use unicode as modifyable buffer");
4719 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
4723 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
4728 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
4735 PyErr_SetString(PyExc_SystemError
,
4736 "accessing non-existent unicode segment");
4739 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
4742 *ptr
= (void *) PyString_AS_STRING(str
);
4743 return PyString_GET_SIZE(str
);
4746 /* Helpers for PyUnicode_Format() */
4749 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
4751 int argidx
= *p_argidx
;
4752 if (argidx
< arglen
) {
4757 return PyTuple_GetItem(args
, argidx
);
4759 PyErr_SetString(PyExc_TypeError
,
4760 "not enough arguments for format string");
4764 #define F_LJUST (1<<0)
4765 #define F_SIGN (1<<1)
4766 #define F_BLANK (1<<2)
4767 #define F_ALT (1<<3)
4768 #define F_ZERO (1<<4)
4771 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
4777 va_start(va
, format
);
4779 /* First, format the string as char array, then expand to Py_UNICODE
4781 charbuffer
= (char *)buffer
;
4782 len
= vsprintf(charbuffer
, format
, va
);
4783 for (i
= len
- 1; i
>= 0; i
--)
4784 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
4791 formatfloat(Py_UNICODE
*buf
,
4798 /* fmt = '%#.' + `prec` + `type`
4799 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4803 x
= PyFloat_AsDouble(v
);
4804 if (x
== -1.0 && PyErr_Occurred())
4808 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
4810 sprintf(fmt
, "%%%s.%d%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4811 /* worst case length calc to ensure no buffer overrun:
4813 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4814 for any double rep.)
4815 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4816 If prec=0 the effective precision is 1 (the leading digit is
4817 always given), therefore increase by one to 10+prec. */
4818 if (buflen
<= (size_t)10 + (size_t)prec
) {
4819 PyErr_SetString(PyExc_OverflowError
,
4820 "formatted float is too long (precision too long?)");
4823 return usprintf(buf
, fmt
, x
);
4827 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
4831 PyObject
*str
; /* temporary string object. */
4832 PyUnicodeObject
*result
;
4834 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
4837 result
= _PyUnicode_New(len
);
4838 for (i
= 0; i
< len
; i
++)
4839 result
->str
[i
] = buf
[i
];
4840 result
->str
[len
] = 0;
4842 return (PyObject
*)result
;
4846 formatint(Py_UNICODE
*buf
,
4853 /* fmt = '%#.' + `prec` + 'l' + `type`
4854 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4856 char fmt
[64]; /* plenty big enough! */
4858 int use_native_c_format
= 1;
4860 x
= PyInt_AsLong(v
);
4861 if (x
== -1 && PyErr_Occurred())
4865 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4866 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4867 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
4868 PyErr_SetString(PyExc_OverflowError
,
4869 "formatted integer is too long (precision too long?)");
4872 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4873 * but we want it (for consistency with other %#x conversions, and
4874 * for consistency with Python's hex() function).
4875 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4876 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4877 * So add it only if the platform doesn't already.
4879 if (x
== 0 && (flags
& F_ALT
) && (type
== 'x' || type
== 'X')) {
4880 /* Only way to know what the platform does is to try it. */
4881 sprintf(fmt
, type
== 'x' ? "%#x" : "%#X", 0);
4882 if (fmt
[1] != (char)type
) {
4883 /* Supply our own leading 0x/0X -- needed under std C */
4884 use_native_c_format
= 0;
4885 sprintf(fmt
, "0%c%%#.%dl%c", type
, prec
, type
);
4888 if (use_native_c_format
)
4889 sprintf(fmt
, "%%%s.%dl%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
4890 return usprintf(buf
, fmt
, x
);
4894 formatchar(Py_UNICODE
*buf
,
4898 /* presume that the buffer is at least 2 characters long */
4899 if (PyUnicode_Check(v
)) {
4900 if (PyUnicode_GET_SIZE(v
) != 1)
4902 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
4905 else if (PyString_Check(v
)) {
4906 if (PyString_GET_SIZE(v
) != 1)
4908 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
4912 /* Integer input truncated to a character */
4914 x
= PyInt_AsLong(v
);
4915 if (x
== -1 && PyErr_Occurred())
4923 PyErr_SetString(PyExc_TypeError
,
4924 "%c requires int or char");
4928 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4930 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4931 chars are formatted. XXX This is a magic number. Each formatting
4932 routine does bounds checking to ensure no overflow, but a better
4933 solution may be to malloc a buffer of appropriate size for each
4934 format. For now, the current solution is sufficient.
4936 #define FORMATBUFLEN (size_t)120
4938 PyObject
*PyUnicode_Format(PyObject
*format
,
4941 Py_UNICODE
*fmt
, *res
;
4942 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
4944 PyUnicodeObject
*result
= NULL
;
4945 PyObject
*dict
= NULL
;
4948 if (format
== NULL
|| args
== NULL
) {
4949 PyErr_BadInternalCall();
4952 uformat
= PyUnicode_FromObject(format
);
4953 if (uformat
== NULL
)
4955 fmt
= PyUnicode_AS_UNICODE(uformat
);
4956 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
4958 reslen
= rescnt
= fmtcnt
+ 100;
4959 result
= _PyUnicode_New(reslen
);
4962 res
= PyUnicode_AS_UNICODE(result
);
4964 if (PyTuple_Check(args
)) {
4965 arglen
= PyTuple_Size(args
);
4972 if (args
->ob_type
->tp_as_mapping
)
4975 while (--fmtcnt
>= 0) {
4978 rescnt
= fmtcnt
+ 100;
4980 if (_PyUnicode_Resize(&result
, reslen
) < 0)
4982 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
4988 /* Got a format specifier */
4992 Py_UNICODE c
= '\0';
4995 PyObject
*temp
= NULL
;
4999 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
5003 Py_UNICODE
*keystart
;
5009 PyErr_SetString(PyExc_TypeError
,
5010 "format requires a mapping");
5016 /* Skip over balanced parentheses */
5017 while (pcount
> 0 && --fmtcnt
>= 0) {
5020 else if (*fmt
== '(')
5024 keylen
= fmt
- keystart
- 1;
5025 if (fmtcnt
< 0 || pcount
> 0) {
5026 PyErr_SetString(PyExc_ValueError
,
5027 "incomplete format key");
5030 /* keys are converted to strings using UTF-8 and
5031 then looked up since Python uses strings to hold
5032 variables names etc. in its namespaces and we
5033 wouldn't want to break common idioms. */
5034 key
= PyUnicode_EncodeUTF8(keystart
,
5043 args
= PyObject_GetItem(dict
, key
);
5052 while (--fmtcnt
>= 0) {
5053 switch (c
= *fmt
++) {
5054 case '-': flags
|= F_LJUST
; continue;
5055 case '+': flags
|= F_SIGN
; continue;
5056 case ' ': flags
|= F_BLANK
; continue;
5057 case '#': flags
|= F_ALT
; continue;
5058 case '0': flags
|= F_ZERO
; continue;
5063 v
= getnextarg(args
, arglen
, &argidx
);
5066 if (!PyInt_Check(v
)) {
5067 PyErr_SetString(PyExc_TypeError
,
5071 width
= PyInt_AsLong(v
);
5079 else if (c
>= '0' && c
<= '9') {
5081 while (--fmtcnt
>= 0) {
5083 if (c
< '0' || c
> '9')
5085 if ((width
*10) / 10 != width
) {
5086 PyErr_SetString(PyExc_ValueError
,
5090 width
= width
*10 + (c
- '0');
5098 v
= getnextarg(args
, arglen
, &argidx
);
5101 if (!PyInt_Check(v
)) {
5102 PyErr_SetString(PyExc_TypeError
,
5106 prec
= PyInt_AsLong(v
);
5112 else if (c
>= '0' && c
<= '9') {
5114 while (--fmtcnt
>= 0) {
5115 c
= Py_CHARMASK(*fmt
++);
5116 if (c
< '0' || c
> '9')
5118 if ((prec
*10) / 10 != prec
) {
5119 PyErr_SetString(PyExc_ValueError
,
5123 prec
= prec
*10 + (c
- '0');
5128 if (c
== 'h' || c
== 'l' || c
== 'L') {
5134 PyErr_SetString(PyExc_ValueError
,
5135 "incomplete format");
5139 v
= getnextarg(args
, arglen
, &argidx
);
5149 /* presume that buffer length is at least 1 */
5156 if (PyUnicode_Check(v
) && c
== 's') {
5163 temp
= PyObject_Str(v
);
5165 temp
= PyObject_Repr(v
);
5168 if (!PyString_Check(temp
)) {
5169 /* XXX Note: this should never happen, since
5170 PyObject_Repr() and PyObject_Str() assure
5173 PyErr_SetString(PyExc_TypeError
,
5174 "%s argument has non-string str()");
5177 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
5178 PyString_GET_SIZE(temp
),
5186 pbuf
= PyUnicode_AS_UNICODE(temp
);
5187 len
= PyUnicode_GET_SIZE(temp
);
5188 if (prec
>= 0 && len
> prec
)
5200 if (PyLong_Check(v
)) {
5201 temp
= formatlong(v
, flags
, prec
, c
);
5204 pbuf
= PyUnicode_AS_UNICODE(temp
);
5205 len
= PyUnicode_GET_SIZE(temp
);
5206 /* unbounded ints can always produce
5207 a sign character! */
5212 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5216 /* only d conversion is signed */
5229 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5240 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5246 PyErr_Format(PyExc_ValueError
,
5247 "unsupported format character '%c' (0x%x) "
5249 (31<=c
&& c
<=126) ? c
: '?',
5250 c
, fmt
-1 - PyUnicode_AS_UNICODE(uformat
));
5254 if (*pbuf
== '-' || *pbuf
== '+') {
5258 else if (flags
& F_SIGN
)
5260 else if (flags
& F_BLANK
)
5267 if (rescnt
< width
+ (sign
!= 0)) {
5269 rescnt
= width
+ fmtcnt
+ 100;
5271 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5273 res
= PyUnicode_AS_UNICODE(result
)
5283 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5284 assert(pbuf
[0] == '0');
5285 assert(pbuf
[1] == c
);
5296 if (width
> len
&& !(flags
& F_LJUST
)) {
5300 } while (--width
> len
);
5305 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5306 assert(pbuf
[0] == '0');
5307 assert(pbuf
[1] == c
);
5312 Py_UNICODE_COPY(res
, pbuf
, len
);
5315 while (--width
>= len
) {
5319 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5320 PyErr_SetString(PyExc_TypeError
,
5321 "not all arguments converted");
5327 if (argidx
< arglen
&& !dict
) {
5328 PyErr_SetString(PyExc_TypeError
,
5329 "not all arguments converted");
5337 if (_PyUnicode_Resize(&result
, reslen
- rescnt
))
5339 return (PyObject
*)result
;
5350 static PyBufferProcs unicode_as_buffer
= {
5351 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5352 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5353 (getsegcountproc
) unicode_buffer_getsegcount
,
5354 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5358 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5361 static char *kwlist
[] = {"string", "encoding", "errors", 0};
5362 char *encoding
= NULL
;
5363 char *errors
= NULL
;
5365 assert(type
== &PyUnicode_Type
);
5366 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
5367 kwlist
, &x
, &encoding
, &errors
))
5370 return (PyObject
*)_PyUnicode_New(0);
5371 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
5374 static char unicode_doc
[] =
5375 "unicode(string [, encoding[, errors]]) -> object\n\
5377 Create a new Unicode object from the given encoded string.\n\
5378 encoding defaults to the current default string encoding and \n\
5379 errors, defining the error handling, to 'strict'.";
5381 PyTypeObject PyUnicode_Type
= {
5382 PyObject_HEAD_INIT(&PyType_Type
)
5384 "unicode", /* tp_name */
5385 sizeof(PyUnicodeObject
), /* tp_size */
5386 0, /* tp_itemsize */
5388 (destructor
)_PyUnicode_Free
, /* tp_dealloc */
5392 (cmpfunc
) unicode_compare
, /* tp_compare */
5393 (reprfunc
) unicode_repr
, /* tp_repr */
5394 0, /* tp_as_number */
5395 &unicode_as_sequence
, /* tp_as_sequence */
5396 0, /* tp_as_mapping */
5397 (hashfunc
) unicode_hash
, /* tp_hash*/
5399 (reprfunc
) unicode_str
, /* tp_str */
5400 PyObject_GenericGetAttr
, /* tp_getattro */
5401 0, /* tp_setattro */
5402 &unicode_as_buffer
, /* tp_as_buffer */
5403 Py_TPFLAGS_DEFAULT
, /* tp_flags */
5404 unicode_doc
, /* tp_doc */
5405 0, /* tp_traverse */
5407 0, /* tp_richcompare */
5408 0, /* tp_weaklistoffset */
5410 0, /* tp_iternext */
5411 unicode_methods
, /* tp_methods */
5416 0, /* tp_descr_get */
5417 0, /* tp_descr_set */
5418 0, /* tp_dictoffset */
5421 unicode_new
, /* tp_new */
5424 /* Initialize the Unicode implementation */
5426 void _PyUnicode_Init(void)
5430 /* Init the implementation */
5431 unicode_freelist
= NULL
;
5432 unicode_freelist_size
= 0;
5433 unicode_empty
= _PyUnicode_New(0);
5434 strcpy(unicode_default_encoding
, "ascii");
5435 for (i
= 0; i
< 256; i
++)
5436 unicode_latin1
[i
] = NULL
;
5439 /* Finalize the Unicode implementation */
5442 _PyUnicode_Fini(void)
5447 Py_XDECREF(unicode_empty
);
5448 unicode_empty
= NULL
;
5450 for (i
= 0; i
< 256; i
++) {
5451 if (unicode_latin1
[i
]) {
5452 Py_DECREF(unicode_latin1
[i
]);
5453 unicode_latin1
[i
] = NULL
;
5457 for (u
= unicode_freelist
; u
!= NULL
;) {
5458 PyUnicodeObject
*v
= u
;
5459 u
= *(PyUnicodeObject
**)u
;
5462 Py_XDECREF(v
->defenc
);
5465 unicode_freelist
= NULL
;
5466 unicode_freelist_size
= 0;