3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior permission.
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
/* Limit for the Unicode object free list */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
/* Endianness switches; defaults to little endian.  Exactly one of the
   two BYTEORDER_IS_* macros is defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
95 static PyUnicodeObject
*unicode_latin1
[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding
[100];
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 unicode
->str
[0] < 256 &&
136 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
137 PyErr_SetString(PyExc_SystemError
,
138 "can't resize shared unicode objects");
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr
= unicode
->str
;
145 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
147 unicode
->str
= oldstr
;
151 unicode
->str
[length
] = 0;
152 unicode
->length
= length
;
155 /* Reset the object caches */
156 if (unicode
->defenc
) {
157 Py_DECREF(unicode
->defenc
);
158 unicode
->defenc
= NULL
;
165 /* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
174 PyUnicodeObject
*_PyUnicode_New(int length
)
176 register PyUnicodeObject
*unicode
;
178 /* Optimization for empty strings */
179 if (length
== 0 && unicode_empty
!= NULL
) {
180 Py_INCREF(unicode_empty
);
181 return unicode_empty
;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist
) {
186 unicode
= unicode_freelist
;
187 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
188 unicode_freelist_size
--;
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode
->length
< length
) &&
193 unicode_resize(unicode
, length
)) {
194 PyMem_DEL(unicode
->str
);
199 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
201 PyObject_INIT(unicode
, &PyUnicode_Type
);
204 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
207 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
214 unicode
->str
[length
] = 0;
215 unicode
->length
= length
;
217 unicode
->defenc
= NULL
;
221 _Py_ForgetReference((PyObject
*)unicode
);
222 PyObject_Del(unicode
);
227 void unicode_dealloc(register PyUnicodeObject
*unicode
)
229 if (PyUnicode_CheckExact(unicode
) &&
230 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
231 /* Keep-Alive optimization */
232 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
233 PyMem_DEL(unicode
->str
);
237 if (unicode
->defenc
) {
238 Py_DECREF(unicode
->defenc
);
239 unicode
->defenc
= NULL
;
241 /* Add to free list */
242 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
243 unicode_freelist
= unicode
;
244 unicode_freelist_size
++;
247 PyMem_DEL(unicode
->str
);
248 Py_XDECREF(unicode
->defenc
);
249 unicode
->ob_type
->tp_free((PyObject
*)unicode
);
253 int PyUnicode_Resize(PyObject
**unicode
,
256 register PyUnicodeObject
*v
;
258 /* Argument checks */
259 if (unicode
== NULL
) {
260 PyErr_BadInternalCall();
263 v
= (PyUnicodeObject
*)*unicode
;
264 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1 || length
< 0) {
265 PyErr_BadInternalCall();
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v
->length
!= length
&&
273 (v
== unicode_empty
|| v
->length
== 1)) {
274 PyUnicodeObject
*w
= _PyUnicode_New(length
);
277 Py_UNICODE_COPY(w
->str
, v
->str
,
278 length
< v
->length
? length
: v
->length
);
280 *unicode
= (PyObject
*)w
;
284 /* Note that we don't have to modify *unicode for unshared Unicode
285 objects, since we can modify them in-place. */
286 return unicode_resize(v
, length
);
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting the address of a
   PyUnicodeObject* variable to the PyObject** the public API expects. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
293 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
296 PyUnicodeObject
*unicode
;
298 /* If the Unicode data is known at construction time, we can apply
299 some optimizations which share commonly used objects. */
302 /* Optimization for empty strings */
303 if (size
== 0 && unicode_empty
!= NULL
) {
304 Py_INCREF(unicode_empty
);
305 return (PyObject
*)unicode_empty
;
308 /* Single character Unicode objects in the Latin-1 range are
309 shared when using this constructor */
310 if (size
== 1 && *u
< 256) {
311 unicode
= unicode_latin1
[*u
];
313 unicode
= _PyUnicode_New(1);
316 unicode
->str
[0] = *u
;
317 unicode_latin1
[*u
] = unicode
;
320 return (PyObject
*)unicode
;
324 unicode
= _PyUnicode_New(size
);
328 /* Copy the Unicode data into the new object */
330 Py_UNICODE_COPY(unicode
->str
, u
, size
);
332 return (PyObject
*)unicode
;
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the `size` wchar_t elements at `w`.

   Returns a new reference, or NULL with an exception set when `w` is
   NULL or the allocation fails.  Exactly `size` elements are read from
   `w`; the terminating 0 written by _PyUnicode_New() is left intact. */
PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
                                 int size)
{
    PyUnicodeObject *unicode;

    if (w == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;

    /* Copy the wchar_t data into the new object */
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(unicode->str, w, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register int i;
        u = PyUnicode_AS_UNICODE(unicode);
        /* Copy exactly `size` elements; counting down to 0 inclusive
           would read w[size], one past the caller's data. */
        for (i = size; i > 0; i--)
            *u++ = *w++;
    }
#endif

    return (PyObject *)unicode;
}

/* Copy at most `size` characters of `unicode` into the buffer `w`.

   `size` is clamped to the length of the object.  Returns the number
   of elements copied, or -1 with an exception set when `unicode` is
   NULL.  No terminator is written, so a buffer of exactly `size`
   elements is sufficient. */
int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                         register wchar_t *w,
                         int size)
{
    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }
    if (size > PyUnicode_GET_SIZE(unicode))
        size = PyUnicode_GET_SIZE(unicode);
#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        register Py_UNICODE *u;
        register int i;
        u = PyUnicode_AS_UNICODE(unicode);
        /* Copy exactly `size` elements; counting down to 0 inclusive
           would write w[size], overflowing the caller's buffer. */
        for (i = size; i > 0; i--)
            *w++ = *u++;
    }
#endif

    return size;
}

#endif
394 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
398 #ifdef Py_UNICODE_WIDE
399 if (ordinal
< 0 || ordinal
> 0x10ffff) {
400 PyErr_SetString(PyExc_ValueError
,
401 "unichr() arg not in range(0x110000) "
402 "(wide Python build)");
406 if (ordinal
< 0 || ordinal
> 0xffff) {
407 PyErr_SetString(PyExc_ValueError
,
408 "unichr() arg not in range(0x10000) "
409 "(narrow Python build)");
414 if (ordinal
<= 0xffff) {
415 /* UCS-2 character */
416 s
[0] = (Py_UNICODE
) ordinal
;
417 return PyUnicode_FromUnicode(s
, 1);
420 #ifndef Py_UNICODE_WIDE
421 /* UCS-4 character. store as two surrogate characters */
423 s
[0] = 0xD800 + (Py_UNICODE
) (ordinal
>> 10);
424 s
[1] = 0xDC00 + (Py_UNICODE
) (ordinal
& 0x03FF);
425 return PyUnicode_FromUnicode(s
, 2);
427 s
[0] = (Py_UNICODE
)ordinal
;
428 return PyUnicode_FromUnicode(s
, 1);
433 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
435 /* XXX Perhaps we should make this API an alias of
436 PyObject_Unicode() instead ?! */
437 if (PyUnicode_CheckExact(obj
)) {
441 if (PyUnicode_Check(obj
)) {
442 /* For a Unicode subtype that's not a Unicode object,
443 return a true Unicode object with the same data. */
444 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
445 PyUnicode_GET_SIZE(obj
));
447 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
450 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
451 const char *encoding
,
454 const char *s
= NULL
;
459 PyErr_BadInternalCall();
464 /* For b/w compatibility we also accept Unicode objects provided
465 that no encodings is given and then redirect to
466 PyObject_Unicode() which then applies the additional logic for
469 NOTE: This API should really only be used for object which
470 represent *encoded* Unicode !
473 if (PyUnicode_Check(obj
)) {
475 PyErr_SetString(PyExc_TypeError
,
476 "decoding Unicode is not supported");
479 return PyObject_Unicode(obj
);
482 if (PyUnicode_Check(obj
)) {
483 PyErr_SetString(PyExc_TypeError
,
484 "decoding Unicode is not supported");
490 if (PyString_Check(obj
)) {
491 s
= PyString_AS_STRING(obj
);
492 len
= PyString_GET_SIZE(obj
);
494 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
495 /* Overwrite the error message with something more useful in
496 case of a TypeError. */
497 if (PyErr_ExceptionMatches(PyExc_TypeError
))
498 PyErr_Format(PyExc_TypeError
,
499 "coercing to Unicode: need string or buffer, "
501 obj
->ob_type
->tp_name
);
505 /* Convert to Unicode */
507 Py_INCREF(unicode_empty
);
508 v
= (PyObject
*)unicode_empty
;
511 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
519 PyObject
*PyUnicode_Decode(const char *s
,
521 const char *encoding
,
524 PyObject
*buffer
= NULL
, *unicode
;
526 if (encoding
== NULL
)
527 encoding
= PyUnicode_GetDefaultEncoding();
529 /* Shortcuts for common default encodings */
530 if (strcmp(encoding
, "utf-8") == 0)
531 return PyUnicode_DecodeUTF8(s
, size
, errors
);
532 else if (strcmp(encoding
, "latin-1") == 0)
533 return PyUnicode_DecodeLatin1(s
, size
, errors
);
534 else if (strcmp(encoding
, "ascii") == 0)
535 return PyUnicode_DecodeASCII(s
, size
, errors
);
537 /* Decode via the codec registry */
538 buffer
= PyBuffer_FromMemory((void *)s
, size
);
541 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
544 if (!PyUnicode_Check(unicode
)) {
545 PyErr_Format(PyExc_TypeError
,
546 "decoder did not return an unicode object (type=%.400s)",
547 unicode
->ob_type
->tp_name
);
559 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
561 const char *encoding
,
564 PyObject
*v
, *unicode
;
566 unicode
= PyUnicode_FromUnicode(s
, size
);
569 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
574 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
575 const char *encoding
,
580 if (!PyUnicode_Check(unicode
)) {
585 if (encoding
== NULL
)
586 encoding
= PyUnicode_GetDefaultEncoding();
588 /* Shortcuts for common default encodings */
589 if (errors
== NULL
) {
590 if (strcmp(encoding
, "utf-8") == 0)
591 return PyUnicode_AsUTF8String(unicode
);
592 else if (strcmp(encoding
, "latin-1") == 0)
593 return PyUnicode_AsLatin1String(unicode
);
594 else if (strcmp(encoding
, "ascii") == 0)
595 return PyUnicode_AsASCIIString(unicode
);
598 /* Encode via the codec registry */
599 v
= PyCodec_Encode(unicode
, encoding
, errors
);
602 /* XXX Should we really enforce this ? */
603 if (!PyString_Check(v
)) {
604 PyErr_Format(PyExc_TypeError
,
605 "encoder did not return a string object (type=%.400s)",
606 v
->ob_type
->tp_name
);
616 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
619 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
623 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
624 if (v
&& errors
== NULL
)
625 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
629 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
631 if (!PyUnicode_Check(unicode
)) {
635 return PyUnicode_AS_UNICODE(unicode
);
641 int PyUnicode_GetSize(PyObject
*unicode
)
643 if (!PyUnicode_Check(unicode
)) {
647 return PyUnicode_GET_SIZE(unicode
);
653 const char *PyUnicode_GetDefaultEncoding(void)
655 return unicode_default_encoding
;
658 int PyUnicode_SetDefaultEncoding(const char *encoding
)
662 /* Make sure the encoding is valid. As side effect, this also
663 loads the encoding into the codec registry cache. */
664 v
= _PyCodec_Lookup(encoding
);
668 strncpy(unicode_default_encoding
,
670 sizeof(unicode_default_encoding
));
677 /* error handling callback helper:
678 build arguments, call the callback and check the arguments,
679 if no exception occured, copy the replacement to the output
680 and adjust various state variables.
681 return 0 on success, -1 on error
685 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
686 const char *encoding
, const char *reason
,
687 const char *input
, int insize
, int *startinpos
, int *endinpos
, PyObject
**exceptionObject
, const char **inptr
,
688 PyObject
**output
, int *outpos
, Py_UNICODE
**outptr
)
690 static char *argparse
= "O!i;decoding error handler must return (unicode, int) tuple";
692 PyObject
*restuple
= NULL
;
693 PyObject
*repunicode
= NULL
;
694 int outsize
= PyUnicode_GET_SIZE(*output
);
701 if (*errorHandler
== NULL
) {
702 *errorHandler
= PyCodec_LookupError(errors
);
703 if (*errorHandler
== NULL
)
707 if (*exceptionObject
== NULL
) {
708 *exceptionObject
= PyUnicodeDecodeError_Create(
709 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
710 if (*exceptionObject
== NULL
)
714 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
716 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
718 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
722 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
723 if (restuple
== NULL
)
725 if (!PyTuple_Check(restuple
)) {
726 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
729 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
732 newpos
= insize
+newpos
;
733 if (newpos
<0 || newpos
>insize
) {
734 PyErr_Format(PyExc_IndexError
, "position %d from error handler out of bounds", newpos
);
738 /* need more space? (at least enough for what we
739 have+the replacement+the rest of the string (starting
740 at the new input position), so we won't have to check space
741 when there are no errors in the rest of the string) */
742 repptr
= PyUnicode_AS_UNICODE(repunicode
);
743 repsize
= PyUnicode_GET_SIZE(repunicode
);
744 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
745 if (requiredsize
> outsize
) {
746 if (requiredsize
<2*outsize
)
747 requiredsize
= 2*outsize
;
748 if (PyUnicode_Resize(output
, requiredsize
))
750 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
753 *inptr
= input
+ newpos
;
754 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
761 Py_XDECREF(restuple
);
765 /* --- UTF-7 Codec -------------------------------------------------------- */
767 /* see RFC2152 for details */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True when character c must be base64-encoded: it is above 127, is
   always special, or falls in an optional class (whitespace, Set O)
   that the caller asked to have encoded. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
         (encodeO && (utf7_special[(c)] == 3)))

/* Map a 6-bit value to its base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True when c is a base64 alphabet character.  Spelled out as ASCII
   ranges rather than isalnum(): c can be any Py_UNICODE value, which is
   outside the domain of the <ctype.h> functions, and the result must
   not depend on the current locale. */
#define B64CHAR(c) \
        (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
         ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Map a base64 character to its 6-bit value (c must satisfy B64CHAR). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters for every complete 6-bit group held in the
   low `bits` bits of `ch`, advancing `out` and decreasing `bits`. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit every complete 16-bit unit held in the low `bits` bits of `ch`.
   Uses `errmsg` and the label `utf7Error` of the expanding function. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
823 PyObject *PyUnicode_DecodeUTF7(const char *s,
827 const char *starts
= s
;
832 PyUnicodeObject
*unicode
;
834 const char *errmsg
= "";
836 unsigned int bitsleft
= 0;
837 unsigned long charsleft
= 0;
839 PyObject
*errorHandler
= NULL
;
840 PyObject
*exc
= NULL
;
842 unicode
= _PyUnicode_New(size
);
846 return (PyObject
*)unicode
;
857 if ((ch
== '-') || !B64CHAR(ch
)) {
861 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
863 /* The shift sequence has a partial character in it. If
864 bitsleft < 6 then we could just classify it as padding
865 but that is not the case here */
867 errmsg
= "partial character in shift sequence";
870 /* According to RFC2152 the remaining bits should be zero. We
871 choose to signal an error/insert a replacement character
872 here so indicate the potential of a misencoded character. */
874 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
875 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
876 errmsg
= "non-zero padding bits in shift sequence";
881 if ((s
< e
) && (*(s
) == '-')) {
885 } else if (SPECIAL(ch
,0,0)) {
886 errmsg
= "unexpected special character";
892 charsleft
= (charsleft
<< 6) | UB64(ch
);
895 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
898 else if ( ch
== '+' ) {
899 startinpos
= s
-starts
;
901 if (s
< e
&& *s
== '-') {
910 else if (SPECIAL(ch
,0,0)) {
911 errmsg
= "unexpected special character";
921 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
923 if (unicode_decode_call_errorhandler(
924 errors
, &errorHandler
,
926 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
927 (PyObject
**)&unicode
, &outpos
, &p
))
932 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
934 if (unicode_decode_call_errorhandler(
935 errors
, &errorHandler
,
936 "utf7", "unterminated shift sequence",
937 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
938 (PyObject
**)&unicode
, &outpos
, &p
))
944 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)))
947 Py_XDECREF(errorHandler
);
949 return (PyObject
*)unicode
;
952 Py_XDECREF(errorHandler
);
959 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
962 int encodeWhiteSpace
,
966 /* It might be possible to tighten this worst case */
967 unsigned int cbAllocated
= 5 * size
;
970 unsigned int bitsleft
= 0;
971 unsigned long charsleft
= 0;
976 return PyString_FromStringAndSize(NULL
, 0);
978 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
982 start
= out
= PyString_AS_STRING(v
);
983 for (;i
< size
; ++i
) {
984 Py_UNICODE ch
= s
[i
];
990 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
994 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
995 inShift
= bitsleft
> 0;
1000 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
1001 *out
++ = B64(charsleft
<< (6-bitsleft
));
1004 /* Characters not in the BASE64 set implicitly unshift the sequence
1005 so no '-' is required, except if the character is itself a '-' */
1006 if (B64CHAR(ch
) || ch
== '-') {
1013 charsleft
= (charsleft
<< 16) | ch
;
1014 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
1016 /* If the next character is special then we don't need to terminate
1017 the shift sequence. If the next character is not a BASE64 character
1018 or '-' then the shift sequence will be terminated implicitly and we
1019 don't have to insert a '-'. */
1021 if (bitsleft
== 0) {
1023 Py_UNICODE ch2
= s
[i
+1];
1025 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
1027 } else if (B64CHAR(ch2
) || ch2
== '-') {
1044 *out
++= B64(charsleft
<< (6-bitsleft
) );
1048 _PyString_Resize(&v
, out
- start
);
1059 /* --- UTF-8 Codec -------------------------------------------------------- */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1083 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1087 const char *starts
= s
;
1093 PyUnicodeObject
*unicode
;
1095 const char *errmsg
= "";
1096 PyObject
*errorHandler
= NULL
;
1097 PyObject
*exc
= NULL
;
1099 /* Note: size will always be longer than the resulting Unicode
1101 unicode
= _PyUnicode_New(size
);
1105 return (PyObject
*)unicode
;
1107 /* Unpack UTF-8 encoded data */
1112 Py_UCS4 ch
= (unsigned char)*s
;
1115 *p
++ = (Py_UNICODE
)ch
;
1120 n
= utf8_code_length
[ch
];
1123 errmsg
= "unexpected end of data";
1124 startinpos
= s
-starts
;
1132 errmsg
= "unexpected code byte";
1133 startinpos
= s
-starts
;
1134 endinpos
= startinpos
+1;
1138 errmsg
= "internal error";
1139 startinpos
= s
-starts
;
1140 endinpos
= startinpos
+1;
1144 if ((s
[1] & 0xc0) != 0x80) {
1145 errmsg
= "invalid data";
1146 startinpos
= s
-starts
;
1147 endinpos
= startinpos
+2;
1150 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1152 startinpos
= s
-starts
;
1153 endinpos
= startinpos
+2;
1154 errmsg
= "illegal encoding";
1158 *p
++ = (Py_UNICODE
)ch
;
1162 if ((s
[1] & 0xc0) != 0x80 ||
1163 (s
[2] & 0xc0) != 0x80) {
1164 errmsg
= "invalid data";
1165 startinpos
= s
-starts
;
1166 endinpos
= startinpos
+3;
1169 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1171 /* Note: UTF-8 encodings of surrogates are considered
1172 legal UTF-8 sequences;
1174 XXX For wide builds (UCS-4) we should probably try
1175 to recombine the surrogates into a single code
1178 errmsg
= "illegal encoding";
1179 startinpos
= s
-starts
;
1180 endinpos
= startinpos
+3;
1184 *p
++ = (Py_UNICODE
)ch
;
1188 if ((s
[1] & 0xc0) != 0x80 ||
1189 (s
[2] & 0xc0) != 0x80 ||
1190 (s
[3] & 0xc0) != 0x80) {
1191 errmsg
= "invalid data";
1192 startinpos
= s
-starts
;
1193 endinpos
= startinpos
+4;
1196 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1197 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1198 /* validate and convert to UTF-16 */
1199 if ((ch
< 0x10000) /* minimum value allowed for 4
1201 || (ch
> 0x10ffff)) /* maximum value allowed for
1204 errmsg
= "illegal encoding";
1205 startinpos
= s
-starts
;
1206 endinpos
= startinpos
+4;
1209 #ifdef Py_UNICODE_WIDE
1210 *p
++ = (Py_UNICODE
)ch
;
1212 /* compute and append the two surrogates: */
1214 /* translate from 10000..10FFFF to 0..FFFF */
1217 /* high surrogate = top 10 bits added to D800 */
1218 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1220 /* low surrogate = bottom 10 bits added to DC00 */
1221 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1226 /* Other sizes are only needed for UCS-4 */
1227 errmsg
= "unsupported Unicode code range";
1228 startinpos
= s
-starts
;
1229 endinpos
= startinpos
+n
;
1236 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1237 if (unicode_decode_call_errorhandler(
1238 errors
, &errorHandler
,
1240 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1241 (PyObject
**)&unicode
, &outpos
, &p
))
1246 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1249 Py_XDECREF(errorHandler
);
1251 return (PyObject
*)unicode
;
1254 Py_XDECREF(errorHandler
);
1260 /* Allocation strategy: if the string is short, convert into a stack buffer
1261 and allocate exactly as much space needed at the end. Else allocate the
1262 maximum possible needed (4 result bytes per Unicode character), and return
1263 the excess memory at the end.
1266 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1270 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
1272 int i
; /* index into s of next input byte */
1273 PyObject
*v
; /* result string object */
1274 char *p
; /* next free byte in output buffer */
1275 int nallocated
; /* number of result bytes allocated */
1276 int nneeded
; /* number of result bytes needed */
1277 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
1282 if (size
<= MAX_SHORT_UNICHARS
) {
1283 /* Write into the stack buffer; nallocated can't overflow.
1284 * At the end, we'll allocate exactly as much heap space as it
1285 * turns out we need.
1287 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
1288 v
= NULL
; /* will allocate after we're done */
1292 /* Overallocate on the heap, and give the excess back at the end. */
1293 nallocated
= size
* 4;
1294 if (nallocated
/ 4 != size
) /* overflow! */
1295 return PyErr_NoMemory();
1296 v
= PyString_FromStringAndSize(NULL
, nallocated
);
1299 p
= PyString_AS_STRING(v
);
1302 for (i
= 0; i
< size
;) {
1303 Py_UCS4 ch
= s
[i
++];
1309 else if (ch
< 0x0800) {
1310 /* Encode Latin-1 */
1311 *p
++ = (char)(0xc0 | (ch
>> 6));
1312 *p
++ = (char)(0x80 | (ch
& 0x3f));
1315 /* Encode UCS2 Unicode ordinals */
1317 /* Special case: check for high surrogate */
1318 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
1320 /* Check for low surrogate and combine the two to
1321 form a UCS4 value */
1322 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1323 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
1327 /* Fall through: handles isolated high surrogates */
1329 *p
++ = (char)(0xe0 | (ch
>> 12));
1330 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1331 *p
++ = (char)(0x80 | (ch
& 0x3f));
1335 /* Encode UCS4 Unicode ordinals */
1336 *p
++ = (char)(0xf0 | (ch
>> 18));
1337 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1338 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1339 *p
++ = (char)(0x80 | (ch
& 0x3f));
1344 /* This was stack allocated. */
1345 nneeded
= Py_SAFE_DOWNCAST(p
- stackbuf
, long, int);
1346 assert(nneeded
<= nallocated
);
1347 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
1350 /* Cut back to size actually needed. */
1351 nneeded
= Py_SAFE_DOWNCAST(p
- PyString_AS_STRING(v
), long, int);
1352 assert(nneeded
<= nallocated
);
1353 _PyString_Resize(&v
, nneeded
);
1357 #undef MAX_SHORT_UNICHARS
1360 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1362 if (!PyUnicode_Check(unicode
)) {
1363 PyErr_BadArgument();
1366 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1367 PyUnicode_GET_SIZE(unicode
),
1371 /* --- UTF-16 Codec ------------------------------------------------------- */
1374 PyUnicode_DecodeUTF16(const char *s
,
1379 const char *starts
= s
;
1383 PyUnicodeObject
*unicode
;
1385 const unsigned char *q
, *e
;
1386 int bo
= 0; /* assume native ordering by default */
1387 const char *errmsg
= "";
1388 /* Offsets from q for retrieving byte pairs in the right order. */
1389 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1390 int ihi
= 1, ilo
= 0;
1392 int ihi
= 0, ilo
= 1;
1394 PyObject
*errorHandler
= NULL
;
1395 PyObject
*exc
= NULL
;
1397 /* Note: size will always be longer than the resulting Unicode
1399 unicode
= _PyUnicode_New(size
);
1403 return (PyObject
*)unicode
;
1405 /* Unpack UTF-16 encoded data */
1407 q
= (unsigned char *)s
;
1413 /* Check for BOM marks (U+FEFF) in the input and adjust current
1414 byte order setting accordingly. In native mode, the leading BOM
1415 mark is skipped, in all other modes, it is copied to the output
1416 stream as-is (giving a ZWNBSP character). */
1418 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1419 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1420 if (bom
== 0xFEFF) {
1424 else if (bom
== 0xFFFE) {
1429 if (bom
== 0xFEFF) {
1433 else if (bom
== 0xFFFE) {
1453 /* remaining bytes at the end? (size should be even) */
1455 errmsg
= "truncated data";
1456 startinpos
= ((const char *)q
)-starts
;
1457 endinpos
= ((const char *)e
)-starts
;
1459 /* The remaining input chars are ignored if the callback
1460 chooses to skip the input */
1462 ch
= (q
[ihi
] << 8) | q
[ilo
];
1466 if (ch
< 0xD800 || ch
> 0xDFFF) {
1471 /* UTF-16 code pair: */
1473 errmsg
= "unexpected end of data";
1474 startinpos
= (((const char *)q
)-2)-starts
;
1475 endinpos
= ((const char *)e
)-starts
;
1478 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1479 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1481 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1482 #ifndef Py_UNICODE_WIDE
1486 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1491 errmsg
= "illegal UTF-16 surrogate";
1492 startinpos
= (((const char *)q
)-4)-starts
;
1493 endinpos
= startinpos
+2;
1498 errmsg
= "illegal encoding";
1499 startinpos
= (((const char *)q
)-2)-starts
;
1500 endinpos
= startinpos
+2;
1501 /* Fall through to report the error */
1504 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1505 if (unicode_decode_call_errorhandler(
1506 errors
, &errorHandler
,
1508 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
1509 (PyObject
**)&unicode
, &outpos
, &p
))
1517 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1520 Py_XDECREF(errorHandler
);
1522 return (PyObject
*)unicode
;
1526 Py_XDECREF(errorHandler
);
1532 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1540 /* Offsets from p for storing byte pairs in the right order. */
1541 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542 int ihi
= 1, ilo
= 0;
1544 int ihi
= 0, ilo
= 1;
1547 #define STORECHAR(CH) \
1549 p[ihi] = ((CH) >> 8) & 0xff; \
1550 p[ilo] = (CH) & 0xff; \
1554 for (i
= pairs
= 0; i
< size
; i
++)
1555 if (s
[i
] >= 0x10000)
1557 v
= PyString_FromStringAndSize(NULL
,
1558 2 * (size
+ pairs
+ (byteorder
== 0)));
1562 p
= (unsigned char *)PyString_AS_STRING(v
);
1568 if (byteorder
== -1) {
1573 else if (byteorder
== 1) {
1579 while (size
-- > 0) {
1580 Py_UNICODE ch
= *s
++;
1582 if (ch
>= 0x10000) {
1583 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1584 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1594 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1596 if (!PyUnicode_Check(unicode
)) {
1597 PyErr_BadArgument();
1600 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1601 PyUnicode_GET_SIZE(unicode
),
1606 /* --- Unicode Escape Codec ----------------------------------------------- */
1608 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1610 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1614 const char *starts
= s
;
1623 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1624 PyObject
*errorHandler
= NULL
;
1625 PyObject
*exc
= NULL
;
1627 /* Escaped strings will always be longer than the resulting
1628 Unicode string, so we start with size here and then reduce the
1629 length after conversion to the true value.
1630 (but if the error callback returns a long replacement string
1631 we'll have to allocate more space) */
1632 v
= _PyUnicode_New(size
);
1636 return (PyObject
*)v
;
1638 p
= PyUnicode_AS_UNICODE(v
);
1646 /* Non-escape characters are interpreted as Unicode ordinals */
1648 *p
++ = (unsigned char) *s
++;
1652 startinpos
= s
-starts
;
1659 case '\\': *p
++ = '\\'; break;
1660 case '\'': *p
++ = '\''; break;
1661 case '\"': *p
++ = '\"'; break;
1662 case 'b': *p
++ = '\b'; break;
1663 case 'f': *p
++ = '\014'; break; /* FF */
1664 case 't': *p
++ = '\t'; break;
1665 case 'n': *p
++ = '\n'; break;
1666 case 'r': *p
++ = '\r'; break;
1667 case 'v': *p
++ = '\013'; break; /* VT */
1668 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1670 /* \OOO (octal) escapes */
1671 case '0': case '1': case '2': case '3':
1672 case '4': case '5': case '6': case '7':
1674 if ('0' <= *s
&& *s
<= '7') {
1675 x
= (x
<<3) + *s
++ - '0';
1676 if ('0' <= *s
&& *s
<= '7')
1677 x
= (x
<<3) + *s
++ - '0';
1686 message
= "truncated \\xXX escape";
1692 message
= "truncated \\uXXXX escape";
1698 message
= "truncated \\UXXXXXXXX escape";
1701 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1704 if (unicode_decode_call_errorhandler(
1705 errors
, &errorHandler
,
1706 "unicodeescape", "end of string in escape sequence",
1707 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1708 (PyObject
**)&v
, &outpos
, &p
))
1712 for (i
= 0; i
< digits
; ++i
) {
1713 c
= (unsigned char) s
[i
];
1715 endinpos
= (s
+i
+1)-starts
;
1716 if (unicode_decode_call_errorhandler(
1717 errors
, &errorHandler
,
1718 "unicodeescape", message
,
1719 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1720 (PyObject
**)&v
, &outpos
, &p
))
1724 chr
= (chr
<<4) & ~0xF;
1725 if (c
>= '0' && c
<= '9')
1727 else if (c
>= 'a' && c
<= 'f')
1728 chr
+= 10 + c
- 'a';
1730 chr
+= 10 + c
- 'A';
1733 if (chr
== 0xffffffff)
1734 /* _decoding_error will have already written into the
1738 /* when we get here, chr is a 32-bit unicode character */
1740 /* UCS-2 character */
1741 *p
++ = (Py_UNICODE
) chr
;
1742 else if (chr
<= 0x10ffff) {
1743 /* UCS-4 character. Either store directly, or as
1745 #ifdef Py_UNICODE_WIDE
1749 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1750 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1753 endinpos
= s
-starts
;
1754 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1755 if (unicode_decode_call_errorhandler(
1756 errors
, &errorHandler
,
1757 "unicodeescape", "illegal Unicode character",
1758 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1759 (PyObject
**)&v
, &outpos
, &p
))
1766 message
= "malformed \\N character escape";
1767 if (ucnhash_CAPI
== NULL
) {
1768 /* load the unicode data module */
1770 m
= PyImport_ImportModule("unicodedata");
1773 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1777 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1779 if (ucnhash_CAPI
== NULL
)
1783 const char *start
= s
+1;
1784 /* look for the closing brace */
1785 while (*s
!= '}' && s
< end
)
1787 if (s
> start
&& s
< end
&& *s
== '}') {
1788 /* found a name. look it up in the unicode database */
1789 message
= "unknown Unicode character name";
1791 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1795 endinpos
= s
-starts
;
1796 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1797 if (unicode_decode_call_errorhandler(
1798 errors
, &errorHandler
,
1799 "unicodeescape", message
,
1800 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1801 (PyObject
**)&v
, &outpos
, &p
))
1807 message
= "\\ at end of string";
1809 endinpos
= s
-starts
;
1810 outpos
= p
-PyUnicode_AS_UNICODE(v
);
1811 if (unicode_decode_call_errorhandler(
1812 errors
, &errorHandler
,
1813 "unicodeescape", message
,
1814 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1815 (PyObject
**)&v
, &outpos
, &p
))
1820 *p
++ = (unsigned char)s
[-1];
1827 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
1829 return (PyObject
*)v
;
1834 "\\N escapes not supported (can't load unicodedata module)"
1836 Py_XDECREF(errorHandler
);
1842 Py_XDECREF(errorHandler
);
1847 /* Return a Unicode-Escape string version of the Unicode object.
1849 If quotes is true, the string is enclosed in u"" or u'' quotes as
1854 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1859 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1866 static const char *hexdigit
= "0123456789abcdef";
1868 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1872 p
= PyString_AS_STRING(repr
);
1876 *p
++ = (findchar(s
, size
, '\'') &&
1877 !findchar(s
, size
, '"')) ? '"' : '\'';
1879 while (size
-- > 0) {
1880 Py_UNICODE ch
= *s
++;
1884 (ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1] || ch
== '\\')) {
1890 #ifdef Py_UNICODE_WIDE
1891 /* Map 21-bit characters to '\U00xxxxxx' */
1892 else if (ch
>= 0x10000) {
1893 int offset
= p
- PyString_AS_STRING(repr
);
1895 /* Resize the string if necessary */
1896 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
1897 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
1899 p
= PyString_AS_STRING(repr
) + offset
;
1904 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
1905 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
1906 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
1907 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
1908 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
1909 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
1910 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
1911 *p
++ = hexdigit
[ch
& 0x0000000F];
1915 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1916 else if (ch
>= 0xD800 && ch
< 0xDC00) {
1922 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
1923 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
1926 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
1927 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
1928 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
1929 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
1930 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
1931 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
1932 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
1933 *p
++ = hexdigit
[ucs
& 0x0000000F];
1936 /* Fall through: isolated surrogates are copied as-is */
1941 /* Map 16-bit characters to '\uxxxx' */
1945 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
1946 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
1947 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1948 *p
++ = hexdigit
[ch
& 0x000F];
1951 /* Map special whitespace to '\t', '\n', '\r' */
1952 else if (ch
== '\t') {
1956 else if (ch
== '\n') {
1960 else if (ch
== '\r') {
1965 /* Map non-printable US ASCII to '\xhh' */
1966 else if (ch
< ' ' || ch
>= 0x7F) {
1969 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1970 *p
++ = hexdigit
[ch
& 0x000F];
1973 /* Copy everything else as-is */
1978 *p
++ = PyString_AS_STRING(repr
)[1];
1981 _PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
));
1985 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1988 return unicodeescape_string(s
, size
, 0);
1991 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1993 if (!PyUnicode_Check(unicode
)) {
1994 PyErr_BadArgument();
1997 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1998 PyUnicode_GET_SIZE(unicode
));
2001 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2003 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
2007 const char *starts
= s
;
2015 PyObject
*errorHandler
= NULL
;
2016 PyObject
*exc
= NULL
;
2018 /* Escaped strings will always be longer than the resulting
2019 Unicode string, so we start with size here and then reduce the
2020 length after conversion to the true value. (But decoding error
2021 handler might have to resize the string) */
2022 v
= _PyUnicode_New(size
);
2026 return (PyObject
*)v
;
2027 p
= PyUnicode_AS_UNICODE(v
);
2034 /* Non-escape characters are interpreted as Unicode ordinals */
2036 *p
++ = (unsigned char)*s
++;
2039 startinpos
= s
-starts
;
2041 /* \u-escapes are only interpreted iff the number of leading
2042 backslashes is odd */
2047 *p
++ = (unsigned char)*s
++;
2049 if (((s
- bs
) & 1) == 0 ||
2057 /* \uXXXX with 4 hex digits */
2058 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2059 for (x
= 0, i
= 0; i
< 4; ++i
, ++s
) {
2060 c
= (unsigned char)*s
;
2062 endinpos
= s
-starts
;
2063 if (unicode_decode_call_errorhandler(
2064 errors
, &errorHandler
,
2065 "rawunicodeescape", "truncated \\uXXXX",
2066 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2067 (PyObject
**)&v
, &outpos
, &p
))
2072 if (c
>= '0' && c
<= '9')
2074 else if (c
>= 'a' && c
<= 'f')
2083 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2085 Py_XDECREF(errorHandler
);
2087 return (PyObject
*)v
;
2091 Py_XDECREF(errorHandler
);
2096 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
2103 static const char *hexdigit
= "0123456789abcdef";
2105 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
2111 p
= q
= PyString_AS_STRING(repr
);
2112 while (size
-- > 0) {
2113 Py_UNICODE ch
= *s
++;
2114 /* Map 16-bit characters to '\uxxxx' */
2118 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
2119 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
2120 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
2121 *p
++ = hexdigit
[ch
& 15];
2123 /* Copy everything else as-is */
2128 _PyString_Resize(&repr
, p
- q
);
2132 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2134 if (!PyUnicode_Check(unicode
)) {
2135 PyErr_BadArgument();
2138 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2139 PyUnicode_GET_SIZE(unicode
));
2142 /* --- Latin-1 Codec ------------------------------------------------------ */
2144 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2151 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2152 if (size
== 1 && *(unsigned char*)s
< 256) {
2153 Py_UNICODE r
= *(unsigned char*)s
;
2154 return PyUnicode_FromUnicode(&r
, 1);
2157 v
= _PyUnicode_New(size
);
2161 return (PyObject
*)v
;
2162 p
= PyUnicode_AS_UNICODE(v
);
2164 *p
++ = (unsigned char)*s
++;
2165 return (PyObject
*)v
;
2172 /* create or adjust a UnicodeEncodeError */
2173 static void make_encode_exception(PyObject
**exceptionObject
,
2174 const char *encoding
,
2175 const Py_UNICODE
*unicode
, int size
,
2176 int startpos
, int endpos
,
2179 if (*exceptionObject
== NULL
) {
2180 *exceptionObject
= PyUnicodeEncodeError_Create(
2181 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2184 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
2186 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
2188 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
2192 Py_DECREF(*exceptionObject
);
2193 *exceptionObject
= NULL
;
2197 /* raises a UnicodeEncodeError */
2198 static void raise_encode_exception(PyObject
**exceptionObject
,
2199 const char *encoding
,
2200 const Py_UNICODE
*unicode
, int size
,
2201 int startpos
, int endpos
,
2204 make_encode_exception(exceptionObject
,
2205 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2206 if (*exceptionObject
!= NULL
)
2207 PyCodec_StrictErrors(*exceptionObject
);
2210 /* error handling callback helper:
2211 build arguments, call the callback and check the arguments,
2212 put the result into newpos and return the replacement string, which
2213 has to be freed by the caller */
2214 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
2215 PyObject
**errorHandler
,
2216 const char *encoding
, const char *reason
,
2217 const Py_UNICODE
*unicode
, int size
, PyObject
**exceptionObject
,
2218 int startpos
, int endpos
,
2221 static char *argparse
= "O!i;encoding error handler must return (unicode, int) tuple";
2224 PyObject
*resunicode
;
2226 if (*errorHandler
== NULL
) {
2227 *errorHandler
= PyCodec_LookupError(errors
);
2228 if (*errorHandler
== NULL
)
2232 make_encode_exception(exceptionObject
,
2233 encoding
, unicode
, size
, startpos
, endpos
, reason
);
2234 if (*exceptionObject
== NULL
)
2237 restuple
= PyObject_CallFunctionObjArgs(
2238 *errorHandler
, *exceptionObject
, NULL
);
2239 if (restuple
== NULL
)
2241 if (!PyTuple_Check(restuple
)) {
2242 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
2243 Py_DECREF(restuple
);
2246 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
2247 &resunicode
, newpos
)) {
2248 Py_DECREF(restuple
);
2252 *newpos
= size
+*newpos
;
2253 if (*newpos
<0 || *newpos
>size
) {
2254 PyErr_Format(PyExc_IndexError
, "position %d from error handler out of bounds", *newpos
);
2255 Py_DECREF(restuple
);
2258 Py_INCREF(resunicode
);
2259 Py_DECREF(restuple
);
2263 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
2270 /* pointers to the beginning and end+1 of input */
2271 const Py_UNICODE
*startp
= p
;
2272 const Py_UNICODE
*endp
= p
+ size
;
2273 /* pointer to the beginning of the unencodable characters */
2274 /* const Py_UNICODE *badp = NULL; */
2275 /* pointer into the output */
2277 /* current output position */
2280 char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
2281 char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2282 PyObject
*errorHandler
= NULL
;
2283 PyObject
*exc
= NULL
;
2284 /* the following variable is used for caching string comparisons
2285 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2286 int known_errorHandler
= -1;
2288 /* allocate enough for a simple encoding without
2289 replacements, if we need more, we'll resize */
2290 res
= PyString_FromStringAndSize(NULL
, size
);
2295 str
= PyString_AS_STRING(res
);
2301 /* can we encode this? */
2303 /* no overflow check, because we know that the space is enough */
2308 int unicodepos
= p
-startp
;
2310 PyObject
*repunicode
;
2315 /* startpos for collecting unencodable chars */
2316 const Py_UNICODE
*collstart
= p
;
2317 const Py_UNICODE
*collend
= p
;
2318 /* find all unencodable characters */
2319 while ((collend
< endp
) && ((*collend
)>=limit
))
2321 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2322 if (known_errorHandler
==-1) {
2323 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
2324 known_errorHandler
= 1;
2325 else if (!strcmp(errors
, "replace"))
2326 known_errorHandler
= 2;
2327 else if (!strcmp(errors
, "ignore"))
2328 known_errorHandler
= 3;
2329 else if (!strcmp(errors
, "xmlcharrefreplace"))
2330 known_errorHandler
= 4;
2332 known_errorHandler
= 0;
2334 switch (known_errorHandler
) {
2335 case 1: /* strict */
2336 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
2338 case 2: /* replace */
2339 while (collstart
++<collend
)
2340 *str
++ = '?'; /* fall through */
2341 case 3: /* ignore */
2344 case 4: /* xmlcharrefreplace */
2345 respos
= str
-PyString_AS_STRING(res
);
2346 /* determine replacement size (temporarily (mis)uses p) */
2347 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
2358 else if (*p
<1000000)
2363 requiredsize
= respos
+repsize
+(endp
-collend
);
2364 if (requiredsize
> ressize
) {
2365 if (requiredsize
<2*ressize
)
2366 requiredsize
= 2*ressize
;
2367 if (_PyString_Resize(&res
, requiredsize
))
2369 str
= PyString_AS_STRING(res
) + respos
;
2370 ressize
= requiredsize
;
2372 /* generate replacement (temporarily (mis)uses p) */
2373 for (p
= collstart
; p
< collend
; ++p
) {
2374 str
+= sprintf(str
, "&#%d;", (int)*p
);
2379 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
2380 encoding
, reason
, startp
, size
, &exc
,
2381 collstart
-startp
, collend
-startp
, &newpos
);
2382 if (repunicode
== NULL
)
2384 /* need more space? (at least enough for what we
2385 have+the replacement+the rest of the string, so
2386 we won't have to check space for encodable characters) */
2387 respos
= str
-PyString_AS_STRING(res
);
2388 repsize
= PyUnicode_GET_SIZE(repunicode
);
2389 requiredsize
= respos
+repsize
+(endp
-collend
);
2390 if (requiredsize
> ressize
) {
2391 if (requiredsize
<2*ressize
)
2392 requiredsize
= 2*ressize
;
2393 if (_PyString_Resize(&res
, requiredsize
)) {
2394 Py_DECREF(repunicode
);
2397 str
= PyString_AS_STRING(res
) + respos
;
2398 ressize
= requiredsize
;
2400 /* check if there is anything unencodable in the replacement
2401 and copy it to the output */
2402 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
2405 raise_encode_exception(&exc
, encoding
, startp
, size
,
2406 unicodepos
, unicodepos
+1, reason
);
2407 Py_DECREF(repunicode
);
2412 p
= startp
+ newpos
;
2413 Py_DECREF(repunicode
);
2417 /* Resize if we allocated too much */
2418 respos
= str
-PyString_AS_STRING(res
);
2420 /* If this fails, res will be NULL */
2421 _PyString_Resize(&res
, respos
);
2422 Py_XDECREF(errorHandler
);
2428 Py_XDECREF(errorHandler
);
2433 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2437 return unicode_encode_ucs1(p
, size
, errors
, 256);
2440 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2442 if (!PyUnicode_Check(unicode
)) {
2443 PyErr_BadArgument();
2446 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2447 PyUnicode_GET_SIZE(unicode
),
2451 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2453 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2457 const char *starts
= s
;
2464 PyObject
*errorHandler
= NULL
;
2465 PyObject
*exc
= NULL
;
2467 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2468 if (size
== 1 && *(unsigned char*)s
< 128) {
2469 Py_UNICODE r
= *(unsigned char*)s
;
2470 return PyUnicode_FromUnicode(&r
, 1);
2473 v
= _PyUnicode_New(size
);
2477 return (PyObject
*)v
;
2478 p
= PyUnicode_AS_UNICODE(v
);
2481 register unsigned char c
= (unsigned char)*s
;
2487 startinpos
= s
-starts
;
2488 endinpos
= startinpos
+ 1;
2489 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2490 if (unicode_decode_call_errorhandler(
2491 errors
, &errorHandler
,
2492 "ascii", "ordinal not in range(128)",
2493 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2494 (PyObject
**)&v
, &outpos
, &p
))
2498 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2499 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2501 Py_XDECREF(errorHandler
);
2503 return (PyObject
*)v
;
2507 Py_XDECREF(errorHandler
);
2512 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2516 return unicode_encode_ucs1(p
, size
, errors
, 128);
2519 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2521 if (!PyUnicode_Check(unicode
)) {
2522 PyErr_BadArgument();
2525 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2526 PyUnicode_GET_SIZE(unicode
),
2530 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2532 /* --- MBCS codecs for Windows -------------------------------------------- */
2534 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2541 /* First get the size of the result */
2542 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2543 if (size
> 0 && usize
==0)
2544 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2546 v
= _PyUnicode_New(usize
);
2550 return (PyObject
*)v
;
2551 p
= PyUnicode_AS_UNICODE(v
);
2552 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2554 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2557 return (PyObject
*)v
;
2560 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
2568 /* If there are no characters, bail now! */
2570 return PyString_FromString("");
2572 /* First get the size of the result */
2573 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2575 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2577 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2583 /* Do the conversion */
2584 s
= PyString_AS_STRING(repr
);
2585 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2587 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2592 #endif /* MS_WINDOWS */
2594 /* --- Character Mapping Codec -------------------------------------------- */
2596 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2601 const char *starts
= s
;
2609 PyObject
*errorHandler
= NULL
;
2610 PyObject
*exc
= NULL
;
2612 /* Default to Latin-1 */
2613 if (mapping
== NULL
)
2614 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2616 v
= _PyUnicode_New(size
);
2620 return (PyObject
*)v
;
2621 p
= PyUnicode_AS_UNICODE(v
);
2624 unsigned char ch
= *s
;
2627 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2628 w
= PyInt_FromLong((long)ch
);
2631 x
= PyObject_GetItem(mapping
, w
);
2634 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2635 /* No mapping found means: mapping is undefined. */
2644 if (PyInt_Check(x
)) {
2645 long value
= PyInt_AS_LONG(x
);
2646 if (value
< 0 || value
> 65535) {
2647 PyErr_SetString(PyExc_TypeError
,
2648 "character mapping must be in range(65536)");
2652 *p
++ = (Py_UNICODE
)value
;
2654 else if (x
== Py_None
) {
2655 /* undefined mapping */
2656 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2657 startinpos
= s
-starts
;
2658 endinpos
= startinpos
+1;
2659 if (unicode_decode_call_errorhandler(
2660 errors
, &errorHandler
,
2661 "charmap", "character maps to <undefined>",
2662 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2663 (PyObject
**)&v
, &outpos
, &p
)) {
2669 else if (PyUnicode_Check(x
)) {
2670 int targetsize
= PyUnicode_GET_SIZE(x
);
2672 if (targetsize
== 1)
2674 *p
++ = *PyUnicode_AS_UNICODE(x
);
2676 else if (targetsize
> 1) {
2678 if (targetsize
> extrachars
) {
2680 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2681 int needed
= (targetsize
- extrachars
) + \
2683 extrachars
+= needed
;
2684 if (_PyUnicode_Resize(&v
,
2685 PyUnicode_GET_SIZE(v
) + needed
)) {
2689 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2692 PyUnicode_AS_UNICODE(x
),
2695 extrachars
-= targetsize
;
2697 /* 1-0 mapping: skip the character */
2700 /* wrong return value */
2701 PyErr_SetString(PyExc_TypeError
,
2702 "character mapping must return integer, None or unicode");
2709 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2710 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2712 Py_XDECREF(errorHandler
);
2714 return (PyObject
*)v
;
2717 Py_XDECREF(errorHandler
);
2723 /* Lookup the character ch in the mapping. If the character
2724 can't be found, Py_None is returned (or NULL, if another
2726 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
2728 PyObject
*w
= PyInt_FromLong((long)c
);
2733 x
= PyObject_GetItem(mapping
, w
);
2736 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2737 /* No mapping found means: mapping is undefined. */
2745 else if (x
== Py_None
)
2747 else if (PyInt_Check(x
)) {
2748 long value
= PyInt_AS_LONG(x
);
2749 if (value
< 0 || value
> 255) {
2750 PyErr_SetString(PyExc_TypeError
,
2751 "character mapping must be in range(256)");
2757 else if (PyString_Check(x
))
2760 /* wrong return value */
2761 PyErr_SetString(PyExc_TypeError
,
2762 "character mapping must return integer, None or str");
2768 /* lookup the character, put the result in the output string and adjust
2769 various state variables. Reallocate the output string if not enough
2770 space is available. Return a new reference to the object that
2771 was put in the output buffer, or Py_None, if the mapping was undefined
2772 (in which case no character was written) or NULL, if a
2773 reallocation error occurred. The caller must decref the result */
2775 PyObject
*charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
2776 PyObject
**outobj
, int *outpos
)
2778 PyObject
*rep
= charmapencode_lookup(c
, mapping
);
2782 else if (rep
==Py_None
)
2785 char *outstart
= PyString_AS_STRING(*outobj
);
2786 int outsize
= PyString_GET_SIZE(*outobj
);
2787 if (PyInt_Check(rep
)) {
2788 int requiredsize
= *outpos
+1;
2789 if (outsize
<requiredsize
) {
2790 /* exponentially overallocate to minimize reallocations */
2791 if (requiredsize
< 2*outsize
)
2792 requiredsize
= 2*outsize
;
2793 if (_PyString_Resize(outobj
, requiredsize
)) {
2797 outstart
= PyString_AS_STRING(*outobj
);
2799 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
2802 const char *repchars
= PyString_AS_STRING(rep
);
2803 int repsize
= PyString_GET_SIZE(rep
);
2804 int requiredsize
= *outpos
+repsize
;
2805 if (outsize
<requiredsize
) {
2806 /* exponentially overallocate to minimize reallocations */
2807 if (requiredsize
< 2*outsize
)
2808 requiredsize
= 2*outsize
;
2809 if (_PyString_Resize(outobj
, requiredsize
)) {
2813 outstart
= PyString_AS_STRING(*outobj
);
2815 memcpy(outstart
+ *outpos
, repchars
, repsize
);
2822 /* handle an error in PyUnicode_EncodeCharmap
2823 Return 0 on success, -1 on error */
2825 int charmap_encoding_error(
2826 const Py_UNICODE
*p
, int size
, int *inpos
, PyObject
*mapping
,
2827 PyObject
**exceptionObject
,
2828 int *known_errorHandler
, PyObject
*errorHandler
, const char *errors
,
2829 PyObject
**res
, int *respos
)
2831 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
2835 /* startpos for collecting unencodable chars */
2836 int collstartpos
= *inpos
;
2837 int collendpos
= *inpos
+1;
2839 char *encoding
= "charmap";
2840 char *reason
= "character maps to <undefined>";
2843 /* find all unencodable characters */
2844 while (collendpos
< size
) {
2845 x
= charmapencode_lookup(p
[collendpos
], mapping
);
2848 else if (x
!=Py_None
) {
2855 /* cache callback name lookup
2856 * (if not done yet, i.e. it's the first error) */
2857 if (*known_errorHandler
==-1) {
2858 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
2859 *known_errorHandler
= 1;
2860 else if (!strcmp(errors
, "replace"))
2861 *known_errorHandler
= 2;
2862 else if (!strcmp(errors
, "ignore"))
2863 *known_errorHandler
= 3;
2864 else if (!strcmp(errors
, "xmlcharrefreplace"))
2865 *known_errorHandler
= 4;
2867 *known_errorHandler
= 0;
2869 switch (*known_errorHandler
) {
2870 case 1: /* strict */
2871 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
2873 case 2: /* replace */
2874 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
2875 x
= charmapencode_output('?', mapping
, res
, respos
);
2879 else if (x
==Py_None
) {
2881 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
2887 case 3: /* ignore */
2888 *inpos
= collendpos
;
2890 case 4: /* xmlcharrefreplace */
2891 /* generate replacement (temporarily (mis)uses p) */
2892 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
2893 char buffer
[2+29+1+1];
2895 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
2896 for (cp
= buffer
; *cp
; ++cp
) {
2897 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
2900 else if (x
==Py_None
) {
2902 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
2908 *inpos
= collendpos
;
2911 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
2912 encoding
, reason
, p
, size
, exceptionObject
,
2913 collstartpos
, collendpos
, &newpos
);
2914 if (repunicode
== NULL
)
2916 /* generate replacement */
2917 repsize
= PyUnicode_GET_SIZE(repunicode
);
2918 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
2919 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
2921 Py_DECREF(repunicode
);
2924 else if (x
==Py_None
) {
2925 Py_DECREF(repunicode
);
2927 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
2933 Py_DECREF(repunicode
);
2938 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2944 PyObject
*res
= NULL
;
2945 /* current input position */
2947 /* current output position */
2949 PyObject
*errorHandler
= NULL
;
2950 PyObject
*exc
= NULL
;
2951 /* the following variable is used for caching string comparisons
2952 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2953 * 3=ignore, 4=xmlcharrefreplace */
2954 int known_errorHandler
= -1;
2956 /* Default to Latin-1 */
2957 if (mapping
== NULL
)
2958 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2960 /* allocate enough for a simple encoding without
2961 replacements, if we need more, we'll resize */
2962 res
= PyString_FromStringAndSize(NULL
, size
);
2968 while (inpos
<size
) {
2969 /* try to encode it */
2970 PyObject
*x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
2971 if (x
==NULL
) /* error */
2973 if (x
==Py_None
) { /* unencodable character */
2974 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
2976 &known_errorHandler
, errorHandler
, errors
,
2981 /* done with this character => adjust input position */
2986 /* Resize if we allocated too much */
2987 if (respos
<PyString_GET_SIZE(res
)) {
2988 if (_PyString_Resize(&res
, respos
))
2992 Py_XDECREF(errorHandler
);
2998 Py_XDECREF(errorHandler
);
3002 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
3005 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
3006 PyErr_BadArgument();
3009 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
3010 PyUnicode_GET_SIZE(unicode
),
3015 /* create or adjust a UnicodeTranslateError */
3016 static void make_translate_exception(PyObject
**exceptionObject
,
3017 const Py_UNICODE
*unicode
, int size
,
3018 int startpos
, int endpos
,
3021 if (*exceptionObject
== NULL
) {
3022 *exceptionObject
= PyUnicodeTranslateError_Create(
3023 unicode
, size
, startpos
, endpos
, reason
);
3026 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
3028 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
3030 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
3034 Py_DECREF(*exceptionObject
);
3035 *exceptionObject
= NULL
;
3039 /* raises a UnicodeTranslateError */
3040 static void raise_translate_exception(PyObject
**exceptionObject
,
3041 const Py_UNICODE
*unicode
, int size
,
3042 int startpos
, int endpos
,
3045 make_translate_exception(exceptionObject
,
3046 unicode
, size
, startpos
, endpos
, reason
);
3047 if (*exceptionObject
!= NULL
)
3048 PyCodec_StrictErrors(*exceptionObject
);
3051 /* error handling callback helper:
3052 build arguments, call the callback and check the arguments,
3053 put the result into newpos and return the replacement string, which
3054 has to be freed by the caller */
3055 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
3056 PyObject
**errorHandler
,
3058 const Py_UNICODE
*unicode
, int size
, PyObject
**exceptionObject
,
3059 int startpos
, int endpos
,
3062 static char *argparse
= "O!i;translating error handler must return (unicode, int) tuple";
3065 PyObject
*resunicode
;
3067 if (*errorHandler
== NULL
) {
3068 *errorHandler
= PyCodec_LookupError(errors
);
3069 if (*errorHandler
== NULL
)
3073 make_translate_exception(exceptionObject
,
3074 unicode
, size
, startpos
, endpos
, reason
);
3075 if (*exceptionObject
== NULL
)
3078 restuple
= PyObject_CallFunctionObjArgs(
3079 *errorHandler
, *exceptionObject
, NULL
);
3080 if (restuple
== NULL
)
3082 if (!PyTuple_Check(restuple
)) {
3083 PyErr_Format(PyExc_TypeError
, &argparse
[4]);
3084 Py_DECREF(restuple
);
3087 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3088 &resunicode
, newpos
)) {
3089 Py_DECREF(restuple
);
3093 *newpos
= size
+*newpos
;
3094 if (*newpos
<0 || *newpos
>size
) {
3095 PyErr_Format(PyExc_IndexError
, "position %d from error handler out of bounds", *newpos
);
3096 Py_DECREF(restuple
);
3099 Py_INCREF(resunicode
);
3100 Py_DECREF(restuple
);
3104 /* Look up the character c in the mapping and put the result in result,
3105 which must be decrefed by the caller.
3106 Return 0 on success, -1 on error */
3108 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
3110 PyObject
*w
= PyInt_FromLong((long)c
);
3115 x
= PyObject_GetItem(mapping
, w
);
3118 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
3119 /* No mapping found means: use 1:1 mapping. */
3126 else if (x
== Py_None
) {
3130 else if (PyInt_Check(x
)) {
3131 long value
= PyInt_AS_LONG(x
);
3132 long max
= PyUnicode_GetMax();
3133 if (value
< 0 || value
> max
) {
3134 PyErr_Format(PyExc_TypeError
,
3135 "character mapping must be in range(0x%lx)", max
+1);
3142 else if (PyUnicode_Check(x
)) {
3147 /* wrong return value */
3148 PyErr_SetString(PyExc_TypeError
,
3149 "character mapping must return integer, None or unicode");
3153 /* ensure that *outobj is at least requiredsize characters long,
3154 if not reallocate and adjust various state variables.
3155 Return 0 on success, -1 on error */
3157 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
, int *outsize
,
3160 if (requiredsize
> *outsize
) {
3161 /* remember old output position */
3162 int outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
3163 /* exponentially overallocate to minimize reallocations */
3164 if (requiredsize
< 2 * *outsize
)
3165 requiredsize
= 2 * *outsize
;
3166 if (_PyUnicode_Resize(outobj
, requiredsize
))
3168 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
3169 *outsize
= requiredsize
;
3173 /* lookup the character, put the result in the output string and adjust
3174 various state variables. Return a new reference to the object that
3175 was put in the output buffer in *result, or Py_None, if the mapping was
3176 undefined (in which case no character was written).
3177 The caller must decref result.
3178 Return 0 on success, -1 on error. */
3180 int charmaptranslate_output(Py_UNICODE c
, PyObject
*mapping
,
3181 PyObject
**outobj
, int *outsize
, Py_UNICODE
**outp
, PyObject
**res
)
3183 if (charmaptranslate_lookup(c
, mapping
, res
))
3186 /* not found => default to 1:1 mapping */
3187 *(*outp
)++ = (Py_UNICODE
)c
;
3189 else if (*res
==Py_None
)
3191 else if (PyInt_Check(*res
)) {
3192 /* no overflow check, because we know that the space is enough */
3193 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
3195 else if (PyUnicode_Check(*res
)) {
3196 int repsize
= PyUnicode_GET_SIZE(*res
);
3198 /* no overflow check, because we know that the space is enough */
3199 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
3201 else if (repsize
!=0) {
3202 /* more than one character */
3203 int requiredsize
= *outsize
+ repsize
- 1;
3204 if (charmaptranslate_makespace(outobj
, outp
, outsize
, requiredsize
))
3206 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
3215 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
3221 PyObject
*res
= NULL
;
3222 /* pointers to the beginning and end+1 of input */
3223 const Py_UNICODE
*startp
= p
;
3224 const Py_UNICODE
*endp
= p
+ size
;
3225 /* pointer into the output */
3227 /* current output position */
3230 char *reason
= "character maps to <undefined>";
3231 PyObject
*errorHandler
= NULL
;
3232 PyObject
*exc
= NULL
;
3233 /* the following variable is used for caching string comparisons
3234 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3235 * 3=ignore, 4=xmlcharrefreplace */
3236 int known_errorHandler
= -1;
3238 if (mapping
== NULL
) {
3239 PyErr_BadArgument();
3243 /* allocate enough for a simple 1:1 translation without
3244 replacements, if we need more, we'll resize */
3245 res
= PyUnicode_FromUnicode(NULL
, size
);
3250 str
= PyUnicode_AS_UNICODE(res
);
3254 /* try to encode it */
3256 if (charmaptranslate_output(*p
, mapping
, &res
, &ressize
, &str
, &x
)) {
3261 if (x
!=Py_None
) /* it worked => adjust input pointer */
3263 else { /* untranslatable character */
3264 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
3268 /* startpos for collecting untranslatable chars */
3269 const Py_UNICODE
*collstart
= p
;
3270 const Py_UNICODE
*collend
= p
+1;
3271 const Py_UNICODE
*coll
;
3273 /* find all untranslatable characters */
3274 while (collend
< endp
) {
3275 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
3282 /* cache callback name lookup
3283 * (if not done yet, i.e. it's the first error) */
3284 if (known_errorHandler
==-1) {
3285 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3286 known_errorHandler
= 1;
3287 else if (!strcmp(errors
, "replace"))
3288 known_errorHandler
= 2;
3289 else if (!strcmp(errors
, "ignore"))
3290 known_errorHandler
= 3;
3291 else if (!strcmp(errors
, "xmlcharrefreplace"))
3292 known_errorHandler
= 4;
3294 known_errorHandler
= 0;
3296 switch (known_errorHandler
) {
3297 case 1: /* strict */
3298 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3300 case 2: /* replace */
3301 /* No need to check for space, this is a 1:1 replacement */
3302 for (coll
= collstart
; coll
<collend
; ++coll
)
3305 case 3: /* ignore */
3308 case 4: /* xmlcharrefreplace */
3309 /* generate replacement (temporarily (mis)uses p) */
3310 for (p
= collstart
; p
< collend
; ++p
) {
3311 char buffer
[2+29+1+1];
3313 sprintf(buffer
, "&#%d;", (int)*p
);
3314 if (charmaptranslate_makespace(&res
, &str
, &ressize
,
3315 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
3317 for (cp
= buffer
; *cp
; ++cp
)
3323 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
3324 reason
, startp
, size
, &exc
,
3325 collstart
-startp
, collend
-startp
, &newpos
);
3326 if (repunicode
== NULL
)
3328 /* generate replacement */
3329 repsize
= PyUnicode_GET_SIZE(repunicode
);
3330 if (charmaptranslate_makespace(&res
, &str
, &ressize
,
3331 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
3332 Py_DECREF(repunicode
);
3335 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
3337 p
= startp
+ newpos
;
3338 Py_DECREF(repunicode
);
3342 /* Resize if we allocated too much */
3343 respos
= str
-PyUnicode_AS_UNICODE(res
);
3344 if (respos
<ressize
) {
3345 if (_PyUnicode_Resize(&res
, respos
))
3349 Py_XDECREF(errorHandler
);
3355 Py_XDECREF(errorHandler
);
3359 PyObject
*PyUnicode_Translate(PyObject
*str
,
3365 str
= PyUnicode_FromObject(str
);
3368 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
3369 PyUnicode_GET_SIZE(str
),
3380 /* --- Decimal Encoder ---------------------------------------------------- */
3382 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
3387 Py_UNICODE
*p
, *end
;
3388 PyObject
*errorHandler
= NULL
;
3389 PyObject
*exc
= NULL
;
3390 const char *encoding
= "decimal";
3391 const char *reason
= "invalid decimal Unicode string";
3392 /* the following variable is used for caching string comparisons
3393 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3394 int known_errorHandler
= -1;
3396 if (output
== NULL
) {
3397 PyErr_BadArgument();
3404 register Py_UNICODE ch
= *p
;
3406 PyObject
*repunicode
;
3410 Py_UNICODE
*collstart
;
3411 Py_UNICODE
*collend
;
3413 if (Py_UNICODE_ISSPACE(ch
)) {
3418 decimal
= Py_UNICODE_TODECIMAL(ch
);
3420 *output
++ = '0' + decimal
;
3424 if (0 < ch
&& ch
< 256) {
3425 *output
++ = (char)ch
;
3429 /* All other characters are considered unencodable */
3432 while (collend
< end
) {
3433 if ((0 < *collend
&& *collend
< 256) ||
3434 !Py_UNICODE_ISSPACE(*collend
) ||
3435 Py_UNICODE_TODECIMAL(*collend
))
3438 /* cache callback name lookup
3439 * (if not done yet, i.e. it's the first error) */
3440 if (known_errorHandler
==-1) {
3441 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3442 known_errorHandler
= 1;
3443 else if (!strcmp(errors
, "replace"))
3444 known_errorHandler
= 2;
3445 else if (!strcmp(errors
, "ignore"))
3446 known_errorHandler
= 3;
3447 else if (!strcmp(errors
, "xmlcharrefreplace"))
3448 known_errorHandler
= 4;
3450 known_errorHandler
= 0;
3452 switch (known_errorHandler
) {
3453 case 1: /* strict */
3454 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
3456 case 2: /* replace */
3457 for (p
= collstart
; p
< collend
; ++p
)
3460 case 3: /* ignore */
3463 case 4: /* xmlcharrefreplace */
3464 /* generate replacement (temporarily (mis)uses p) */
3465 for (p
= collstart
; p
< collend
; ++p
)
3466 output
+= sprintf(output
, "&#%d;", (int)*p
);
3470 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3471 encoding
, reason
, s
, length
, &exc
,
3472 collstart
-s
, collend
-s
, &newpos
);
3473 if (repunicode
== NULL
)
3475 /* generate replacement */
3476 repsize
= PyUnicode_GET_SIZE(repunicode
);
3477 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
3478 Py_UNICODE ch
= *uni2
;
3479 if (Py_UNICODE_ISSPACE(ch
))
3482 decimal
= Py_UNICODE_TODECIMAL(ch
);
3484 *output
++ = '0' + decimal
;
3485 else if (0 < ch
&& ch
< 256)
3486 *output
++ = (char)ch
;
3488 Py_DECREF(repunicode
);
3489 raise_encode_exception(&exc
, encoding
,
3490 s
, length
, collstart
-s
, collend
-s
, reason
);
3496 Py_DECREF(repunicode
);
3499 /* 0-terminate the output string */
3502 Py_XDECREF(errorHandler
);
3507 Py_XDECREF(errorHandler
);
3511 /* --- Helpers ------------------------------------------------------------ */
3514 int count(PyUnicodeObject
*self
,
3517 PyUnicodeObject
*substring
)
3522 start
+= self
->length
;
3525 if (end
> self
->length
)
3528 end
+= self
->length
;
3532 if (substring
->length
== 0)
3533 return (end
- start
+ 1);
3535 end
-= substring
->length
;
3537 while (start
<= end
)
3538 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
3540 start
+= substring
->length
;
3547 int PyUnicode_Count(PyObject
*str
,
3554 str
= PyUnicode_FromObject(str
);
3557 substr
= PyUnicode_FromObject(substr
);
3558 if (substr
== NULL
) {
3563 result
= count((PyUnicodeObject
*)str
,
3565 (PyUnicodeObject
*)substr
);
3573 int findstring(PyUnicodeObject
*self
,
3574 PyUnicodeObject
*substring
,
3580 start
+= self
->length
;
3584 if (end
> self
->length
)
3587 end
+= self
->length
;
3591 if (substring
->length
== 0)
3592 return (direction
> 0) ? start
: end
;
3594 end
-= substring
->length
;
3596 if (direction
< 0) {
3597 for (; end
>= start
; end
--)
3598 if (Py_UNICODE_MATCH(self
, end
, substring
))
3601 for (; start
<= end
; start
++)
3602 if (Py_UNICODE_MATCH(self
, start
, substring
))
3609 int PyUnicode_Find(PyObject
*str
,
3617 str
= PyUnicode_FromObject(str
);
3620 substr
= PyUnicode_FromObject(substr
);
3621 if (substr
== NULL
) {
3626 result
= findstring((PyUnicodeObject
*)str
,
3627 (PyUnicodeObject
*)substr
,
3628 start
, end
, direction
);
3635 int tailmatch(PyUnicodeObject
*self
,
3636 PyUnicodeObject
*substring
,
3642 start
+= self
->length
;
3646 if (substring
->length
== 0)
3649 if (end
> self
->length
)
3652 end
+= self
->length
;
3656 end
-= substring
->length
;
3660 if (direction
> 0) {
3661 if (Py_UNICODE_MATCH(self
, end
, substring
))
3664 if (Py_UNICODE_MATCH(self
, start
, substring
))
3671 int PyUnicode_Tailmatch(PyObject
*str
,
3679 str
= PyUnicode_FromObject(str
);
3682 substr
= PyUnicode_FromObject(substr
);
3683 if (substr
== NULL
) {
3688 result
= tailmatch((PyUnicodeObject
*)str
,
3689 (PyUnicodeObject
*)substr
,
3690 start
, end
, direction
);
3697 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
3701 /* like wcschr, but doesn't stop at NUL characters */
3703 while (size
-- > 0) {
3712 /* Apply fixfct filter to the Unicode object self and return a
3713 reference to the modified object */
3716 PyObject
*fixup(PyUnicodeObject
*self
,
3717 int (*fixfct
)(PyUnicodeObject
*s
))
3722 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
3726 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
3728 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
3729 /* fixfct should return TRUE if it modified the buffer. If
3730 FALSE, return a reference to the original buffer instead
3731 (to save space, not time) */
3734 return (PyObject
*) self
;
3736 return (PyObject
*) u
;
3740 int fixupper(PyUnicodeObject
*self
)
3742 int len
= self
->length
;
3743 Py_UNICODE
*s
= self
->str
;
3747 register Py_UNICODE ch
;
3749 ch
= Py_UNICODE_TOUPPER(*s
);
3761 int fixlower(PyUnicodeObject
*self
)
3763 int len
= self
->length
;
3764 Py_UNICODE
*s
= self
->str
;
3768 register Py_UNICODE ch
;
3770 ch
= Py_UNICODE_TOLOWER(*s
);
3782 int fixswapcase(PyUnicodeObject
*self
)
3784 int len
= self
->length
;
3785 Py_UNICODE
*s
= self
->str
;
3789 if (Py_UNICODE_ISUPPER(*s
)) {
3790 *s
= Py_UNICODE_TOLOWER(*s
);
3792 } else if (Py_UNICODE_ISLOWER(*s
)) {
3793 *s
= Py_UNICODE_TOUPPER(*s
);
3803 int fixcapitalize(PyUnicodeObject
*self
)
3805 int len
= self
->length
;
3806 Py_UNICODE
*s
= self
->str
;
3811 if (Py_UNICODE_ISLOWER(*s
)) {
3812 *s
= Py_UNICODE_TOUPPER(*s
);
3817 if (Py_UNICODE_ISUPPER(*s
)) {
3818 *s
= Py_UNICODE_TOLOWER(*s
);
3827 int fixtitle(PyUnicodeObject
*self
)
3829 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3830 register Py_UNICODE
*e
;
3831 int previous_is_cased
;
3833 /* Shortcut for single character strings */
3834 if (PyUnicode_GET_SIZE(self
) == 1) {
3835 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
3844 e
= p
+ PyUnicode_GET_SIZE(self
);
3845 previous_is_cased
= 0;
3846 for (; p
< e
; p
++) {
3847 register const Py_UNICODE ch
= *p
;
3849 if (previous_is_cased
)
3850 *p
= Py_UNICODE_TOLOWER(ch
);
3852 *p
= Py_UNICODE_TOTITLE(ch
);
3854 if (Py_UNICODE_ISLOWER(ch
) ||
3855 Py_UNICODE_ISUPPER(ch
) ||
3856 Py_UNICODE_ISTITLE(ch
))
3857 previous_is_cased
= 1;
3859 previous_is_cased
= 0;
3864 PyObject
*PyUnicode_Join(PyObject
*separator
,
3869 PyUnicodeObject
*res
= NULL
;
3876 it
= PyObject_GetIter(seq
);
3880 if (separator
== NULL
) {
3881 Py_UNICODE blank
= ' ';
3886 separator
= PyUnicode_FromObject(separator
);
3887 if (separator
== NULL
)
3889 sep
= PyUnicode_AS_UNICODE(separator
);
3890 seplen
= PyUnicode_GET_SIZE(separator
);
3893 res
= _PyUnicode_New(sz
);
3896 p
= PyUnicode_AS_UNICODE(res
);
3899 for (i
= 0; ; ++i
) {
3901 PyObject
*item
= PyIter_Next(it
);
3903 if (PyErr_Occurred())
3907 if (!PyUnicode_Check(item
)) {
3909 if (!PyString_Check(item
)) {
3910 PyErr_Format(PyExc_TypeError
,
3911 "sequence item %i: expected string or Unicode,"
3913 i
, item
->ob_type
->tp_name
);
3917 v
= PyUnicode_FromObject(item
);
3923 itemlen
= PyUnicode_GET_SIZE(item
);
3924 while (reslen
+ itemlen
+ seplen
>= sz
) {
3925 if (_PyUnicode_Resize(&res
, sz
*2)) {
3930 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
3933 Py_UNICODE_COPY(p
, sep
, seplen
);
3937 Py_UNICODE_COPY(p
, PyUnicode_AS_UNICODE(item
), itemlen
);
3942 if (_PyUnicode_Resize(&res
, reslen
))
3945 Py_XDECREF(separator
);
3947 return (PyObject
*)res
;
3950 Py_XDECREF(separator
);
3957 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
3969 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
3974 u
= _PyUnicode_New(left
+ self
->length
+ right
);
3977 Py_UNICODE_FILL(u
->str
, fill
, left
);
3978 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
3980 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
3986 #define SPLIT_APPEND(data, left, right) \
3987 str = PyUnicode_FromUnicode(data + left, right - left); \
3990 if (PyList_Append(list, str)) { \
3998 PyObject
*split_whitespace(PyUnicodeObject
*self
,
4004 int len
= self
->length
;
4007 for (i
= j
= 0; i
< len
; ) {
4009 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4012 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
4015 if (maxcount
-- <= 0)
4017 SPLIT_APPEND(self
->str
, j
, i
);
4018 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
4024 SPLIT_APPEND(self
->str
, j
, len
);
4033 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
4043 string
= PyUnicode_FromObject(string
);
4046 data
= PyUnicode_AS_UNICODE(string
);
4047 len
= PyUnicode_GET_SIZE(string
);
4049 list
= PyList_New(0);
4053 for (i
= j
= 0; i
< len
; ) {
4056 /* Find a line and append it */
4057 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
4060 /* Skip the line break reading CRLF as one line break */
4063 if (data
[i
] == '\r' && i
+ 1 < len
&&
4071 SPLIT_APPEND(data
, j
, eol
);
4075 SPLIT_APPEND(data
, j
, len
);
4088 PyObject
*split_char(PyUnicodeObject
*self
,
4095 int len
= self
->length
;
4098 for (i
= j
= 0; i
< len
; ) {
4099 if (self
->str
[i
] == ch
) {
4100 if (maxcount
-- <= 0)
4102 SPLIT_APPEND(self
->str
, j
, i
);
4108 SPLIT_APPEND(self
->str
, j
, len
);
4118 PyObject
*split_substring(PyUnicodeObject
*self
,
4120 PyUnicodeObject
*substring
,
4125 int len
= self
->length
;
4126 int sublen
= substring
->length
;
4129 for (i
= j
= 0; i
<= len
- sublen
; ) {
4130 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
4131 if (maxcount
-- <= 0)
4133 SPLIT_APPEND(self
->str
, j
, i
);
4139 SPLIT_APPEND(self
->str
, j
, len
);
4151 PyObject
*split(PyUnicodeObject
*self
,
4152 PyUnicodeObject
*substring
,
4160 list
= PyList_New(0);
4164 if (substring
== NULL
)
4165 return split_whitespace(self
,list
,maxcount
);
4167 else if (substring
->length
== 1)
4168 return split_char(self
,list
,substring
->str
[0],maxcount
);
4170 else if (substring
->length
== 0) {
4172 PyErr_SetString(PyExc_ValueError
, "empty separator");
4176 return split_substring(self
,list
,substring
,maxcount
);
4180 PyObject
*replace(PyUnicodeObject
*self
,
4181 PyUnicodeObject
*str1
,
4182 PyUnicodeObject
*str2
,
4190 if (str1
->length
== 1 && str2
->length
== 1) {
4193 /* replace characters */
4194 if (!findchar(self
->str
, self
->length
, str1
->str
[0]) &&
4195 PyUnicode_CheckExact(self
)) {
4196 /* nothing to replace, return original string */
4200 Py_UNICODE u1
= str1
->str
[0];
4201 Py_UNICODE u2
= str2
->str
[0];
4203 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
4208 Py_UNICODE_COPY(u
->str
, self
->str
,
4210 for (i
= 0; i
< u
->length
; i
++)
4211 if (u
->str
[i
] == u1
) {
4223 /* replace strings */
4224 n
= count(self
, 0, self
->length
, str1
);
4228 /* nothing to replace, return original string */
4229 if (PyUnicode_CheckExact(self
)) {
4234 u
= (PyUnicodeObject
*)
4235 PyUnicode_FromUnicode(self
->str
, self
->length
);
4239 self
->length
+ n
* (str2
->length
- str1
->length
));
4243 if (str1
->length
> 0) {
4244 while (i
<= self
->length
- str1
->length
)
4245 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
4246 /* replace string segment */
4247 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
4251 /* copy remaining part */
4252 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
4256 *p
++ = self
->str
[i
++];
4259 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
4263 *p
++ = self
->str
[i
++];
4265 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
4271 return (PyObject
*) u
;
4274 /* --- Unicode Object Methods --------------------------------------------- */
4276 PyDoc_STRVAR(title__doc__
,
4277 "S.title() -> unicode\n\
4279 Return a titlecased version of S, i.e. words start with title case\n\
4280 characters, all remaining cased characters have lower case.");
4283 unicode_title(PyUnicodeObject
*self
)
4285 return fixup(self
, fixtitle
);
4288 PyDoc_STRVAR(capitalize__doc__
,
4289 "S.capitalize() -> unicode\n\
4291 Return a capitalized version of S, i.e. make the first character\n\
4295 unicode_capitalize(PyUnicodeObject
*self
)
4297 return fixup(self
, fixcapitalize
);
4301 PyDoc_STRVAR(capwords__doc__
,
4302 "S.capwords() -> unicode\n\
4304 Apply .capitalize() to all words in S and return the result with\n\
4305 normalized whitespace (all whitespace strings are replaced by ' ').");
4308 unicode_capwords(PyUnicodeObject
*self
)
4314 /* Split into words */
4315 list
= split(self
, NULL
, -1);
4319 /* Capitalize each word */
4320 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
4321 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
4325 Py_DECREF(PyList_GET_ITEM(list
, i
));
4326 PyList_SET_ITEM(list
, i
, item
);
4329 /* Join the words to form a new string */
4330 item
= PyUnicode_Join(NULL
, list
);
4334 return (PyObject
*)item
;
4338 PyDoc_STRVAR(center__doc__
,
4339 "S.center(width) -> unicode\n\
4341 Return S centered in a Unicode string of length width. Padding is done\n\
4345 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
4350 if (!PyArg_ParseTuple(args
, "i:center", &width
))
4353 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4355 return (PyObject
*) self
;
4358 marg
= width
- self
->length
;
4359 left
= marg
/ 2 + (marg
& width
& 1);
4361 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
4366 /* This code should go into some future Unicode collation support
4367 module. The basic comparison should compare ordinals on a naive
4368 basis (this is what Java does and thus JPython too). */
4370 /* speedy UTF-16 code point order comparison */
4372 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4374 static short utf16Fixup
[32] =
4376 0, 0, 0, 0, 0, 0, 0, 0,
4377 0, 0, 0, 0, 0, 0, 0, 0,
4378 0, 0, 0, 0, 0, 0, 0, 0,
4379 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4383 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
4387 Py_UNICODE
*s1
= str1
->str
;
4388 Py_UNICODE
*s2
= str2
->str
;
4390 len1
= str1
->length
;
4391 len2
= str2
->length
;
4393 while (len1
> 0 && len2
> 0) {
4399 if (c1
> (1<<11) * 26)
4400 c1
+= utf16Fixup
[c1
>>11];
4401 if (c2
> (1<<11) * 26)
4402 c2
+= utf16Fixup
[c2
>>11];
4403 /* now c1 and c2 are in UTF-32-compatible order */
4406 return (c1
< c2
) ? -1 : 1;
4411 return (len1
< len2
) ? -1 : (len1
!= len2
);
4417 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
4419 register int len1
, len2
;
4421 Py_UNICODE
*s1
= str1
->str
;
4422 Py_UNICODE
*s2
= str2
->str
;
4424 len1
= str1
->length
;
4425 len2
= str2
->length
;
4427 while (len1
> 0 && len2
> 0) {
4434 return (c1
< c2
) ? -1 : 1;
4439 return (len1
< len2
) ? -1 : (len1
!= len2
);
4444 int PyUnicode_Compare(PyObject
*left
,
4447 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
4450 /* Coerce the two arguments */
4451 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
4454 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
4458 /* Shortcut for empty or interned objects */
4465 result
= unicode_compare(u
, v
);
4477 int PyUnicode_Contains(PyObject
*container
,
4480 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
4482 register const Py_UNICODE
*lhs
, *end
, *rhs
;
4484 /* Coerce the two arguments */
4485 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
4487 PyErr_SetString(PyExc_TypeError
,
4488 "'in <string>' requires string as left operand");
4491 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
4495 size
= PyUnicode_GET_SIZE(v
);
4496 rhs
= PyUnicode_AS_UNICODE(v
);
4497 lhs
= PyUnicode_AS_UNICODE(u
);
4501 end
= lhs
+ PyUnicode_GET_SIZE(u
);
4503 if (*lhs
++ == *rhs
) {
4510 end
= lhs
+ (PyUnicode_GET_SIZE(u
) - size
);
4511 while (lhs
<= end
) {
4512 if (memcmp(lhs
++, rhs
, size
* sizeof(Py_UNICODE
)) == 0) {
4529 /* Concat to string or Unicode object giving a new Unicode object. */
4531 PyObject
*PyUnicode_Concat(PyObject
*left
,
4534 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
4536 /* Coerce the two arguments */
4537 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
4540 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
4545 if (v
== unicode_empty
) {
4547 return (PyObject
*)u
;
4549 if (u
== unicode_empty
) {
4551 return (PyObject
*)v
;
4554 /* Concat the two Unicode strings */
4555 w
= _PyUnicode_New(u
->length
+ v
->length
);
4558 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
4559 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
4563 return (PyObject
*)w
;
4571 PyDoc_STRVAR(count__doc__
,
4572 "S.count(sub[, start[, end]]) -> int\n\
4574 Return the number of occurrences of substring sub in Unicode string\n\
4575 S[start:end]. Optional arguments start and end are\n\
4576 interpreted as in slice notation.");
4579 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
4581 PyUnicodeObject
*substring
;
4586 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
4587 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4590 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4591 (PyObject
*)substring
);
4592 if (substring
== NULL
)
4596 start
+= self
->length
;
4599 if (end
> self
->length
)
4602 end
+= self
->length
;
4606 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
4608 Py_DECREF(substring
);
4612 PyDoc_STRVAR(encode__doc__
,
4613 "S.encode([encoding[,errors]]) -> string\n\
4615 Return an encoded string version of S. Default encoding is the current\n\
4616 default string encoding. errors may be given to set a different error\n\
4617 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4618 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4619 'xmlcharrefreplace' as well as any other name registered with\n\
4620 codecs.register_error that can handle UnicodeEncodeErrors.");
4623 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
4625 char *encoding
= NULL
;
4626 char *errors
= NULL
;
4627 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
4629 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
4632 PyDoc_STRVAR(expandtabs__doc__
,
4633 "S.expandtabs([tabsize]) -> unicode\n\
4635 Return a copy of S where all tab characters are expanded using spaces.\n\
4636 If tabsize is not given, a tab size of 8 characters is assumed.");
4639 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
4648 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
4651 /* First pass: determine size of output string */
4653 e
= self
->str
+ self
->length
;
4654 for (p
= self
->str
; p
< e
; p
++)
4657 j
+= tabsize
- (j
% tabsize
);
4661 if (*p
== '\n' || *p
== '\r') {
4667 /* Second pass: create output string and fill it */
4668 u
= _PyUnicode_New(i
+ j
);
4675 for (p
= self
->str
; p
< e
; p
++)
4678 i
= tabsize
- (j
% tabsize
);
4687 if (*p
== '\n' || *p
== '\r')
4691 return (PyObject
*) u
;
4694 PyDoc_STRVAR(find__doc__
,
4695 "S.find(sub [,start [,end]]) -> int\n\
4697 Return the lowest index in S where substring sub is found,\n\
4698 such that sub is contained within s[start,end]. Optional\n\
4699 arguments start and end are interpreted as in slice notation.\n\
4701 Return -1 on failure.");
4704 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
4706 PyUnicodeObject
*substring
;
4711 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
4712 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4714 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4715 (PyObject
*)substring
);
4716 if (substring
== NULL
)
4719 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
4721 Py_DECREF(substring
);
4726 unicode_getitem(PyUnicodeObject
*self
, int index
)
4728 if (index
< 0 || index
>= self
->length
) {
4729 PyErr_SetString(PyExc_IndexError
, "string index out of range");
4733 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
4737 unicode_hash(PyUnicodeObject
*self
)
4739 /* Since Unicode objects compare equal to their ASCII string
4740 counterparts, they should use the individual character values
4741 as basis for their hash value. This is needed to assure that
4742 strings and Unicode objects behave in the same way as
4746 register Py_UNICODE
*p
;
4749 if (self
->hash
!= -1)
4751 len
= PyUnicode_GET_SIZE(self
);
4752 p
= PyUnicode_AS_UNICODE(self
);
4755 x
= (1000003*x
) ^ *p
++;
4756 x
^= PyUnicode_GET_SIZE(self
);
4763 PyDoc_STRVAR(index__doc__
,
4764 "S.index(sub [,start [,end]]) -> int\n\
4766 Like S.find() but raise ValueError when the substring is not found.");
4769 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
4772 PyUnicodeObject
*substring
;
4776 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
4777 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4780 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4781 (PyObject
*)substring
);
4782 if (substring
== NULL
)
4785 result
= findstring(self
, substring
, start
, end
, 1);
4787 Py_DECREF(substring
);
4789 PyErr_SetString(PyExc_ValueError
, "substring not found");
4792 return PyInt_FromLong(result
);
4795 PyDoc_STRVAR(islower__doc__
,
4796 "S.islower() -> bool\n\
4798 Return True if all cased characters in S are lowercase and there is\n\
4799 at least one cased character in S, False otherwise.");
4802 unicode_islower(PyUnicodeObject
*self
)
4804 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4805 register const Py_UNICODE
*e
;
4808 /* Shortcut for single character strings */
4809 if (PyUnicode_GET_SIZE(self
) == 1)
4810 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
4812 /* Special case for empty strings */
4813 if (PyString_GET_SIZE(self
) == 0)
4814 return PyBool_FromLong(0);
4816 e
= p
+ PyUnicode_GET_SIZE(self
);
4818 for (; p
< e
; p
++) {
4819 register const Py_UNICODE ch
= *p
;
4821 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
4822 return PyBool_FromLong(0);
4823 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
4826 return PyBool_FromLong(cased
);
4829 PyDoc_STRVAR(isupper__doc__
,
4830 "S.isupper() -> bool\n\
4832 Return True if all cased characters in S are uppercase and there is\n\
4833 at least one cased character in S, False otherwise.");
4836 unicode_isupper(PyUnicodeObject
*self
)
4838 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4839 register const Py_UNICODE
*e
;
4842 /* Shortcut for single character strings */
4843 if (PyUnicode_GET_SIZE(self
) == 1)
4844 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
4846 /* Special case for empty strings */
4847 if (PyString_GET_SIZE(self
) == 0)
4848 return PyBool_FromLong(0);
4850 e
= p
+ PyUnicode_GET_SIZE(self
);
4852 for (; p
< e
; p
++) {
4853 register const Py_UNICODE ch
= *p
;
4855 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
4856 return PyBool_FromLong(0);
4857 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
4860 return PyBool_FromLong(cased
);
4863 PyDoc_STRVAR(istitle__doc__
,
4864 "S.istitle() -> bool\n\
4866 Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4867 characters may only follow uncased characters and lowercase characters\n\
4868 only cased ones. Return False otherwise.");
4871 unicode_istitle(PyUnicodeObject
*self
)
4873 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4874 register const Py_UNICODE
*e
;
4875 int cased
, previous_is_cased
;
4877 /* Shortcut for single character strings */
4878 if (PyUnicode_GET_SIZE(self
) == 1)
4879 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
4880 (Py_UNICODE_ISUPPER(*p
) != 0));
4882 /* Special case for empty strings */
4883 if (PyString_GET_SIZE(self
) == 0)
4884 return PyBool_FromLong(0);
4886 e
= p
+ PyUnicode_GET_SIZE(self
);
4888 previous_is_cased
= 0;
4889 for (; p
< e
; p
++) {
4890 register const Py_UNICODE ch
= *p
;
4892 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
4893 if (previous_is_cased
)
4894 return PyBool_FromLong(0);
4895 previous_is_cased
= 1;
4898 else if (Py_UNICODE_ISLOWER(ch
)) {
4899 if (!previous_is_cased
)
4900 return PyBool_FromLong(0);
4901 previous_is_cased
= 1;
4905 previous_is_cased
= 0;
4907 return PyBool_FromLong(cased
);
4910 PyDoc_STRVAR(isspace__doc__
,
4911 "S.isspace() -> bool\n\
4913 Return True if there are only whitespace characters in S,\n\
4917 unicode_isspace(PyUnicodeObject
*self
)
4919 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4920 register const Py_UNICODE
*e
;
4922 /* Shortcut for single character strings */
4923 if (PyUnicode_GET_SIZE(self
) == 1 &&
4924 Py_UNICODE_ISSPACE(*p
))
4925 return PyBool_FromLong(1);
4927 /* Special case for empty strings */
4928 if (PyString_GET_SIZE(self
) == 0)
4929 return PyBool_FromLong(0);
4931 e
= p
+ PyUnicode_GET_SIZE(self
);
4932 for (; p
< e
; p
++) {
4933 if (!Py_UNICODE_ISSPACE(*p
))
4934 return PyBool_FromLong(0);
4936 return PyBool_FromLong(1);
4939 PyDoc_STRVAR(isalpha__doc__
,
4940 "S.isalpha() -> bool\n\
4942 Return True if all characters in S are alphabetic\n\
4943 and there is at least one character in S, False otherwise.");
4946 unicode_isalpha(PyUnicodeObject
*self
)
4948 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4949 register const Py_UNICODE
*e
;
4951 /* Shortcut for single character strings */
4952 if (PyUnicode_GET_SIZE(self
) == 1 &&
4953 Py_UNICODE_ISALPHA(*p
))
4954 return PyBool_FromLong(1);
4956 /* Special case for empty strings */
4957 if (PyString_GET_SIZE(self
) == 0)
4958 return PyBool_FromLong(0);
4960 e
= p
+ PyUnicode_GET_SIZE(self
);
4961 for (; p
< e
; p
++) {
4962 if (!Py_UNICODE_ISALPHA(*p
))
4963 return PyBool_FromLong(0);
4965 return PyBool_FromLong(1);
4968 PyDoc_STRVAR(isalnum__doc__
,
4969 "S.isalnum() -> bool\n\
4971 Return True if all characters in S are alphanumeric\n\
4972 and there is at least one character in S, False otherwise.");
4975 unicode_isalnum(PyUnicodeObject
*self
)
4977 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4978 register const Py_UNICODE
*e
;
4980 /* Shortcut for single character strings */
4981 if (PyUnicode_GET_SIZE(self
) == 1 &&
4982 Py_UNICODE_ISALNUM(*p
))
4983 return PyBool_FromLong(1);
4985 /* Special case for empty strings */
4986 if (PyString_GET_SIZE(self
) == 0)
4987 return PyBool_FromLong(0);
4989 e
= p
+ PyUnicode_GET_SIZE(self
);
4990 for (; p
< e
; p
++) {
4991 if (!Py_UNICODE_ISALNUM(*p
))
4992 return PyBool_FromLong(0);
4994 return PyBool_FromLong(1);
4997 PyDoc_STRVAR(isdecimal__doc__
,
4998 "S.isdecimal() -> bool\n\
5000 Return True if there are only decimal characters in S,\n\
5004 unicode_isdecimal(PyUnicodeObject
*self
)
5006 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5007 register const Py_UNICODE
*e
;
5009 /* Shortcut for single character strings */
5010 if (PyUnicode_GET_SIZE(self
) == 1 &&
5011 Py_UNICODE_ISDECIMAL(*p
))
5012 return PyBool_FromLong(1);
5014 /* Special case for empty strings */
5015 if (PyString_GET_SIZE(self
) == 0)
5016 return PyBool_FromLong(0);
5018 e
= p
+ PyUnicode_GET_SIZE(self
);
5019 for (; p
< e
; p
++) {
5020 if (!Py_UNICODE_ISDECIMAL(*p
))
5021 return PyBool_FromLong(0);
5023 return PyBool_FromLong(1);
5026 PyDoc_STRVAR(isdigit__doc__
,
5027 "S.isdigit() -> bool\n\
5029 Return True if there are only digit characters in S,\n\
5033 unicode_isdigit(PyUnicodeObject
*self
)
5035 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5036 register const Py_UNICODE
*e
;
5038 /* Shortcut for single character strings */
5039 if (PyUnicode_GET_SIZE(self
) == 1 &&
5040 Py_UNICODE_ISDIGIT(*p
))
5041 return PyBool_FromLong(1);
5043 /* Special case for empty strings */
5044 if (PyString_GET_SIZE(self
) == 0)
5045 return PyBool_FromLong(0);
5047 e
= p
+ PyUnicode_GET_SIZE(self
);
5048 for (; p
< e
; p
++) {
5049 if (!Py_UNICODE_ISDIGIT(*p
))
5050 return PyBool_FromLong(0);
5052 return PyBool_FromLong(1);
5055 PyDoc_STRVAR(isnumeric__doc__
,
5056 "S.isnumeric() -> bool\n\
5058 Return True if there are only numeric characters in S,\n\
5062 unicode_isnumeric(PyUnicodeObject
*self
)
5064 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5065 register const Py_UNICODE
*e
;
5067 /* Shortcut for single character strings */
5068 if (PyUnicode_GET_SIZE(self
) == 1 &&
5069 Py_UNICODE_ISNUMERIC(*p
))
5070 return PyBool_FromLong(1);
5072 /* Special case for empty strings */
5073 if (PyString_GET_SIZE(self
) == 0)
5074 return PyBool_FromLong(0);
5076 e
= p
+ PyUnicode_GET_SIZE(self
);
5077 for (; p
< e
; p
++) {
5078 if (!Py_UNICODE_ISNUMERIC(*p
))
5079 return PyBool_FromLong(0);
5081 return PyBool_FromLong(1);
5084 PyDoc_STRVAR(join__doc__
,
5085 "S.join(sequence) -> unicode\n\
5087 Return a string which is the concatenation of the strings in the\n\
5088 sequence. The separator between elements is S.");
/* Method wrapper for S.join(): forwards the separator (self) and the
   sequence argument (data) unchanged to PyUnicode_Join() and returns
   whatever that call returns. */
5091 unicode_join(PyObject
*self
, PyObject
*data
)
5093 return PyUnicode_Join(self
, data
);
/* Length slot function: reports the object's stored length field.
   Installed as both sq_length and mp_length in the type's method tables. */
5097 unicode_length(PyUnicodeObject
*self
)
5099 return self
->length
;
5102 PyDoc_STRVAR(ljust__doc__
,
5103 "S.ljust(width) -> unicode\n\
5105 Return S left justified in a Unicode string of length width. Padding is\n\
5106 done using spaces.");
/* Implements S.ljust(width).
   args is parsed as a single int ("i:ljust") into width. */
5109 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
5112 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
/* Already wide enough: an exact unicode instance is handed back as-is.
   A subclass instance falls through to pad() instead. */
5115 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
5117 return (PyObject
*) self
;
/* No left padding; all of the missing width goes on the right, as spaces. */
5120 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
5123 PyDoc_STRVAR(lower__doc__
,
5124 "S.lower() -> unicode\n\
5126 Return a copy of the string S converted to lowercase.");
/* Implements S.lower() by handing self and the fixlower callback to the
   shared fixup() helper. */
5129 unicode_lower(PyUnicodeObject
*self
)
5131 return fixup(self
, fixlower
);
/* Strip-type selector used by the strip helpers below.
   NOTE(review): LEFTSTRIP and BOTHSTRIP are referenced by those helpers
   but their definitions are not visible in this extract. */
5135 #define RIGHTSTRIP 1
5138 /* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip type; do_argstrip()
   indexes this array with its striptype argument. */
5139 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
/* Skips the three-character "|O:" prefix of the format string, leaving
   the bare method name for use in error messages. */
5141 #define STRIPNAME(i) (stripformat[i]+3)
/* Py_UNICODE counterpart of memchr(): iterates over the first n elements
   of s looking for c.  Callers in this file use the result only as a
   truth value.
   NOTE(review): the loop body and return statements are not visible in
   this extract; presumably it returns a pointer to the first match, or
   NULL when there is none. Confirm. */
5143 static const Py_UNICODE
*
5144 unicode_memchr(const Py_UNICODE
*s
, Py_UNICODE c
, size_t n
)
5147 for (i
= 0; i
< n
; ++i
)
5153 /* externally visible for str.strip(unicode) */
/* Strips characters contained in sepobj from one or both ends of self.
   striptype selects the ends: the leading scan runs unless it is
   RIGHTSTRIP, the trailing scan runs unless it is LEFTSTRIP.
   Returns the substring s[i:j]; when nothing was stripped and self is an
   exact unicode instance, self itself is returned.
   NOTE(review): sepobj is accessed with the PyUnicode_* macros without a
   visible type check, so callers presumably must pass a unicode object. */
5155 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
5157 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
5158 int len
= PyUnicode_GET_SIZE(self
);
5159 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
5160 int seplen
= PyUnicode_GET_SIZE(sepobj
);
/* Leading scan: continue while s[i] is one of the separator characters. */
5164 if (striptype
!= RIGHTSTRIP
) {
5165 while (i
< len
&& unicode_memchr(sep
, s
[i
], seplen
)) {
/* Trailing scan: move j backwards while s[j] is a separator character,
   never crossing i. */
5171 if (striptype
!= LEFTSTRIP
) {
5174 } while (j
>= i
&& unicode_memchr(sep
, s
[j
], seplen
));
/* Nothing removed from either end. */
5178 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
5180 return (PyObject
*)self
;
5183 return PyUnicode_FromUnicode(s
+i
, j
-i
);
/* Whitespace variant of the strip operation: removes characters for which
   Py_UNICODE_ISSPACE() is true from the end(s) selected by striptype.
   The leading scan runs unless striptype is RIGHTSTRIP, the trailing scan
   unless it is LEFTSTRIP.  Returns s[i:j], or self itself when nothing
   was stripped and self is an exact unicode instance. */
5188 do_strip(PyUnicodeObject
*self
, int striptype
)
5190 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
5191 int len
= PyUnicode_GET_SIZE(self
), i
, j
;
/* Leading scan. */
5194 if (striptype
!= RIGHTSTRIP
) {
5195 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
/* Trailing scan; j never crosses i. */
5201 if (striptype
!= LEFTSTRIP
) {
5204 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
/* Nothing removed from either end. */
5208 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
5210 return (PyObject
*)self
;
5213 return PyUnicode_FromUnicode(s
+i
, j
-i
);
/* Argument-handling front end shared by strip(), lstrip() and rstrip().
   Parses one optional object from args, using the format string selected
   by striptype, and dispatches on its type:
     - omitted or None: whitespace strip via do_strip();
     - unicode: _PyUnicode_XStrip() with that object;
     - str: converted with PyUnicode_FromObject() first, then
       _PyUnicode_XStrip();
     - anything else: TypeError naming the method via STRIPNAME(). */
5218 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
5220 PyObject
*sep
= NULL
;
5222 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
5225 if (sep
!= NULL
&& sep
!= Py_None
) {
5226 if (PyUnicode_Check(sep
))
5227 return _PyUnicode_XStrip(self
, striptype
, sep
);
5228 else if (PyString_Check(sep
)) {
/* NOTE(review): sep is overwritten with the converted object here; the
   failure check and the release of that object are not visible in this
   extract. Confirm. */
5230 sep
= PyUnicode_FromObject(sep
);
5233 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
5238 PyErr_Format(PyExc_TypeError
,
5239 "%s arg must be None, unicode or str",
5240 STRIPNAME(striptype
));
5245 return do_strip(self
, striptype
);
5249 PyDoc_STRVAR(strip__doc__
,
5250 "S.strip([chars]) -> unicode\n\
5252 Return a copy of the string S with leading and trailing\n\
5253 whitespace removed.\n\
5254 If chars is given and not None, remove characters in chars instead.\n\
5255 If chars is a str, it will be converted to unicode before stripping");
/* Implements S.strip([chars]).  An empty argument tuple takes the direct
   whitespace path through do_strip(); otherwise do_argstrip() parses the
   argument.  Both are called with BOTHSTRIP. */
5258 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
5260 if (PyTuple_GET_SIZE(args
) == 0)
5261 return do_strip(self
, BOTHSTRIP
); /* Common case */
5263 return do_argstrip(self
, BOTHSTRIP
, args
);
5267 PyDoc_STRVAR(lstrip__doc__
,
5268 "S.lstrip([chars]) -> unicode\n\
5270 Return a copy of the string S with leading whitespace removed.\n\
5271 If chars is given and not None, remove characters in chars instead.\n\
5272 If chars is a str, it will be converted to unicode before stripping");
/* Implements S.lstrip([chars]).  An empty argument tuple takes the direct
   whitespace path through do_strip(); otherwise do_argstrip() parses the
   argument.  Both are called with LEFTSTRIP. */
5275 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
5277 if (PyTuple_GET_SIZE(args
) == 0)
5278 return do_strip(self
, LEFTSTRIP
); /* Common case */
5280 return do_argstrip(self
, LEFTSTRIP
, args
);
5284 PyDoc_STRVAR(rstrip__doc__
,
5285 "S.rstrip([chars]) -> unicode\n\
5287 Return a copy of the string S with trailing whitespace removed.\n\
5288 If chars is given and not None, remove characters in chars instead.\n\
5289 If chars is a str, it will be converted to unicode before stripping");
/* Implements S.rstrip([chars]).  An empty argument tuple takes the direct
   whitespace path through do_strip(); otherwise do_argstrip() parses the
   argument.  Both are called with RIGHTSTRIP. */
5292 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
5294 if (PyTuple_GET_SIZE(args
) == 0)
5295 return do_strip(self
, RIGHTSTRIP
); /* Common case */
5297 return do_argstrip(self
, RIGHTSTRIP
, args
);
/* Sequence repeat slot (sq_repeat): builds str repeated len times.
   A repeat count of 1 on an exact unicode instance returns str itself.
   Otherwise the character count (len * str->length) and the byte count
   ((nchars + 1) * sizeof(Py_UNICODE)) are each checked by dividing the
   product back; a mismatch raises OverflowError "repeated string is too
   long".  The result is allocated with _PyUnicode_New(nchars) and filled
   using Py_UNICODE_COPY.
   NOTE(review): the handling of non-positive len and the copy loop around
   Py_UNICODE_COPY are not visible in this extract. */
5302 unicode_repeat(PyUnicodeObject
*str
, int len
)
5312 if (len
== 1 && PyUnicode_CheckExact(str
)) {
5313 /* no repeat, return original string */
5315 return (PyObject
*) str
;
5318 /* ensure # of chars needed doesn't overflow int and # of bytes
5319 * needed doesn't overflow size_t
5321 nchars
= len
* str
->length
;
5322 if (len
&& nchars
/ len
!= str
->length
) {
5323 PyErr_SetString(PyExc_OverflowError
,
5324 "repeated string is too long");
5327 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
5328 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
5329 PyErr_SetString(PyExc_OverflowError
,
5330 "repeated string is too long");
5333 u
= _PyUnicode_New(nchars
);
5340 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
5344 return (PyObject
*) u
;
/* Public C API entry point for replace.  Each of the three object
   arguments (obj, subobj, replobj) is coerced with PyUnicode_FromObject()
   and the results are passed, cast to PyUnicodeObject pointers, to the
   internal replace() helper.
   NOTE(review): the rest of the parameter list, the error checks after
   each conversion and the release of the temporaries are not visible in
   this extract. */
5347 PyObject
*PyUnicode_Replace(PyObject
*obj
,
5357 self
= PyUnicode_FromObject(obj
);
5360 str1
= PyUnicode_FromObject(subobj
);
5365 str2
= PyUnicode_FromObject(replobj
);
5371 result
= replace((PyUnicodeObject
*)self
,
5372 (PyUnicodeObject
*)str1
,
5373 (PyUnicodeObject
*)str2
,
5381 PyDoc_STRVAR(replace__doc__
,
5382 "S.replace (old, new[, maxsplit]) -> unicode\n\
5384 Return a copy of S with all occurrences of substring\n\
5385 old replaced by new. If the optional argument maxsplit is\n\
5386 given, only the first maxsplit occurrences are replaced.");
5389 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
5391 PyUnicodeObject
*str1
;
5392 PyUnicodeObject
*str2
;
5396 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
5398 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
5401 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
5407 result
= replace(self
, str1
, str2
, maxcount
);
/* repr() implementation: passes the object's character buffer and size
   to unicodeescape_string().
   NOTE(review): the remaining argument(s) of that call are not visible
   in this extract. */
5415 PyObject
*unicode_repr(PyObject
*unicode
)
5417 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
5418 PyUnicode_GET_SIZE(unicode
),
5422 PyDoc_STRVAR(rfind__doc__
,
5423 "S.rfind(sub [,start [,end]]) -> int\n\
5425 Return the highest index in S where substring sub is found,\n\
5426 such that sub is contained within s[start,end]. Optional\n\
5427 arguments start and end are interpreted as in slice notation.\n\
5429 Return -1 on failure.");
/* Implements S.rfind(sub[, start[, end]]).
   args is parsed as one object plus two optional slice indices, each
   converted by _PyEval_SliceIndex. */
5432 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
5434 PyUnicodeObject
*substring
;
5439 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
5440 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
/* Coerce the argument to unicode; the converted object is released with
   Py_DECREF once the search is done. */
5442 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5443 (PyObject
*)substring
);
5444 if (substring
== NULL
)
/* The final argument -1 is presumably findstring()'s search direction
   (backwards); confirm against findstring(). */
5447 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
5449 Py_DECREF(substring
);
5453 PyDoc_STRVAR(rindex__doc__
,
5454 "S.rindex(sub [,start [,end]]) -> int\n\
5456 Like S.rfind() but raise ValueError when the substring is not found.");
/* Implements S.rindex(sub[, start[, end]]).
   Same argument handling and findstring() call as unicode_rfind(), but
   the raw index is kept so a failed search can be reported as ValueError
   "substring not found" instead of being returned. */
5459 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
5462 PyUnicodeObject
*substring
;
5466 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
5467 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
/* Coerce the argument to unicode; released below after the search. */
5469 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5470 (PyObject
*)substring
);
5471 if (substring
== NULL
)
5474 result
= findstring(self
, substring
, start
, end
, -1);
5476 Py_DECREF(substring
);
/* NOTE(review): the condition guarding this error is not visible in this
   extract; presumably it tests for a negative result. */
5478 PyErr_SetString(PyExc_ValueError
, "substring not found");
5481 return PyInt_FromLong(result
);
5484 PyDoc_STRVAR(rjust__doc__
,
5485 "S.rjust(width) -> unicode\n\
5487 Return S right justified in a Unicode string of length width. Padding is\n\
5488 done using spaces.");
/* Implements S.rjust(width).
   args is parsed as a single int ("i:rjust") into width. */
5491 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
5494 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
/* Already wide enough: an exact unicode instance is handed back as-is.
   A subclass instance falls through to pad() instead. */
5497 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
5499 return (PyObject
*) self
;
/* All of the missing width goes on the left, as spaces; no right padding. */
5502 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
/* Sequence slice slot (sq_slice): returns self[start:end].
   A slice covering the whole string of an exact unicode instance returns
   self itself; otherwise a new object is built from self->str + start.
   NOTE(review): most of the clamping statements and the length argument
   of the final call are not visible in this extract. */
5506 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
5508 /* standard clamping */
5513 if (end
> self
->length
)
5515 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
5516 /* full slice, return original string */
5518 return (PyObject
*) self
;
5523 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
/* Public C API entry point for split.  Coerces s, and sep where that
   conversion applies, to unicode with PyUnicode_FromObject(), then calls
   the internal split() helper with maxsplit.
   NOTE(review): the rest of the parameter list, the NULL-separator
   handling, the error checks and the release of the converted objects
   are not visible in this extract. */
5527 PyObject
*PyUnicode_Split(PyObject
*s
,
5533 s
= PyUnicode_FromObject(s
);
5537 sep
= PyUnicode_FromObject(sep
);
5544 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
5551 PyDoc_STRVAR(split__doc__
,
5552 "S.split([sep [,maxsplit]]) -> list of strings\n\
5554 Return a list of the words in S, using sep as the\n\
5555 delimiter string. If maxsplit is given, at most maxsplit\n\
5556 splits are done. If sep is not specified, any whitespace string\n\
/* Implements S.split([sep[, maxsplit]]).
   args is parsed as an optional object and an optional int ("|Oi:split").
   Dispatch on the separator:
     - None (also the default): split(self, NULL, maxcount);
     - unicode: passed straight to split();
     - anything else: routed through PyUnicode_Split(), which performs
       the conversion to unicode. */
5560 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
5562 PyObject
*substring
= Py_None
;
5565 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
5568 if (substring
== Py_None
)
5569 return split(self
, NULL
, maxcount
);
5570 else if (PyUnicode_Check(substring
))
5571 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
5573 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
5576 PyDoc_STRVAR(splitlines__doc__
,
5577 "S.splitlines([keepends]]) -> list of strings\n\
5579 Return a list of the lines in S, breaking at line boundaries.\n\
5580 Line breaks are not included in the resulting list unless keepends\n\
5581 is given and true.");
5584 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
5588 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
5591 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
/* str() implementation: encodes self with PyUnicode_AsEncodedString(),
   passing NULL for both the encoding and the error-handling arguments. */
5595 PyObject
*unicode_str(PyUnicodeObject
*self
)
5597 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
5600 PyDoc_STRVAR(swapcase__doc__
,
5601 "S.swapcase() -> unicode\n\
5603 Return a copy of S with uppercase characters converted to lowercase\n\
/* Implements S.swapcase() by handing self and the fixswapcase callback to
   the shared fixup() helper. */
5607 unicode_swapcase(PyUnicodeObject
*self
)
5609 return fixup(self
, fixswapcase
);
5612 PyDoc_STRVAR(translate__doc__
,
5613 "S.translate(table) -> unicode\n\
5615 Return a copy of the string S, where all characters have been mapped\n\
5616 through the given translation table, which must be a mapping of\n\
5617 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5618 Unmapped characters are left untouched. Characters mapped to None\n\
/* Implements S.translate(table) by delegating to
   PyUnicode_TranslateCharmap(), starting from self->str.
   NOTE(review): the remaining arguments of that call are not visible in
   this extract. */
5622 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
5624 return PyUnicode_TranslateCharmap(self
->str
,
5630 PyDoc_STRVAR(upper__doc__
,
5631 "S.upper() -> unicode\n\
5633 Return a copy of S converted to uppercase.");
/* Implements S.upper() by handing self and the fixupper callback to the
   shared fixup() helper. */
5636 unicode_upper(PyUnicodeObject
*self
)
5638 return fixup(self
, fixupper
);
5641 PyDoc_STRVAR(zfill__doc__
,
5642 "S.zfill(width) -> unicode\n\
5644 Pad a numeric string x with zeros on the left, to fill a field\n\
5645 of the specified width. The string x is never truncated.");
/* Implements S.zfill(width).
   args is parsed as a single int ("i:zfill") into width.
   NOTE(review): the docstring above calls the string "x" while the
   signature line calls it "S"; the two presumably name the same object. */
5648 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
5654 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
/* Already wide enough: an exact unicode instance is returned as-is;
   otherwise a new object is built from self's buffer and size. */
5657 if (self
->length
>= width
) {
5658 if (PyUnicode_CheckExact(self
)) {
5660 return (PyObject
*) self
;
5663 return PyUnicode_FromUnicode(
5664 PyUnicode_AS_UNICODE(self
),
5665 PyUnicode_GET_SIZE(self
)
/* Number of '0' characters to insert on the left. */
5669 fill
= width
- self
->length
;
5671 u
= pad(self
, fill
, 0, '0');
/* After padding, the original first character sits at index fill. */
5676 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
5677 /* move sign to beginning of string */
5678 u
->str
[0] = u
->str
[fill
];
5682 return (PyObject
*) u
;
/* Debugging helper: returns the current value of unicode_freelist_size
   as a Python int.  self is not used by the visible code. */
5687 unicode_freelistsize(PyUnicodeObject
*self
)
5689 return PyInt_FromLong(unicode_freelist_size
);
5693 PyDoc_STRVAR(startswith__doc__
,
5694 "S.startswith(prefix[, start[, end]]) -> bool\n\
5696 Return True if S starts with the specified prefix, False otherwise.\n\
5697 With optional start, test S beginning at that position.\n\
5698 With optional end, stop comparing S at that position.");
/* Implements S.startswith(prefix[, start[, end]]).
   args is parsed as one object plus two optional slice indices, each
   converted by _PyEval_SliceIndex. */
5701 unicode_startswith(PyUnicodeObject
*self
,
5704 PyUnicodeObject
*substring
;
5709 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
5710 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
/* Coerce the argument to unicode; released below after the comparison. */
5712 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5713 (PyObject
*)substring
);
5714 if (substring
== NULL
)
/* tailmatch() is called with -1 here and with +1 in unicode_endswith();
   presumably the sign selects matching at the start versus the end. */
5717 result
= PyBool_FromLong(tailmatch(self
, substring
, start
, end
, -1));
5719 Py_DECREF(substring
);
5724 PyDoc_STRVAR(endswith__doc__
,
5725 "S.endswith(suffix[, start[, end]]) -> bool\n\
5727 Return True if S ends with the specified suffix, False otherwise.\n\
5728 With optional start, test S beginning at that position.\n\
5729 With optional end, stop comparing S at that position.");
/* Implements S.endswith(suffix[, start[, end]]).
   args is parsed as one object plus two optional slice indices, each
   converted by _PyEval_SliceIndex. */
5732 unicode_endswith(PyUnicodeObject
*self
,
5735 PyUnicodeObject
*substring
;
5740 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
5741 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
/* Coerce the argument to unicode; released below after the comparison. */
5743 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
5744 (PyObject
*)substring
);
5745 if (substring
== NULL
)
/* tailmatch() is called with +1 here and with -1 in unicode_startswith();
   presumably the sign selects matching at the end versus the start. */
5748 result
= PyBool_FromLong(tailmatch(self
, substring
, start
, end
, +1));
5750 Py_DECREF(substring
);
/* Implements __getnewargs__: builds a one-element tuple ("(u#)") holding
   a unicode object created from v's buffer and length. */
5757 unicode_getnewargs(PyUnicodeObject
*v
)
5759 return Py_BuildValue("(u#)", v
->str
, v
->length
);
/* Method table for the unicode type.  Each entry gives the Python-level
   method name, the C implementation (cast to PyCFunction), the calling
   convention flag (METH_VARARGS, METH_O or METH_NOARGS) and, for most
   entries, the docstring object.
   NOTE(review): the terminating sentinel entry and any preprocessor
   guards around the capwords/freelistsize entries are not visible in
   this extract. */
5763 static PyMethodDef unicode_methods
[] = {
5765 /* Order is according to common usage: often used methods should
5766 appear first, since lookup is done sequentially. */
5768 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
5769 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
5770 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
5771 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
5772 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
5773 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
5774 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
5775 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
5776 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
5777 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
5778 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
5779 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
5780 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
5781 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
5782 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5783 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
5784 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
5785 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
5786 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
5787 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
5788 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
5789 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
5790 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
5791 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
5792 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
5793 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
5794 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
5795 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
5796 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
5797 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
5798 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
5799 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
5800 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
5801 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
5802 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
5803 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
5805 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
5809 /* This one is just used for debugging the implementation. */
5810 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
5813 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
/* nb_remainder slot: implements the % operator for unicode.
   When the left operand v is not a unicode object the slot declines by
   returning Py_NotImplemented (after taking a reference to it);
   otherwise formatting is delegated to PyUnicode_Format(v, w). */
5818 unicode_mod(PyObject
*v
, PyObject
*w
)
5820 if (!PyUnicode_Check(v
)) {
5821 Py_INCREF(Py_NotImplemented
);
5822 return Py_NotImplemented
;
5824 return PyUnicode_Format(v
, w
);
/* Number protocol table for unicode.  The only visible entry installs
   unicode_mod as nb_remainder; the other slots are not visible in this
   extract. */
5827 static PyNumberMethods unicode_as_number
= {
5832 unicode_mod
, /*nb_remainder*/
/* Sequence protocol table for unicode: length, concatenation, repetition,
   item access, slicing and containment.  The two assignment slots are 0,
   so item and slice assignment are not provided. */
5835 static PySequenceMethods unicode_as_sequence
= {
5836 (inquiry
) unicode_length
, /* sq_length */
5837 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
5838 (intargfunc
) unicode_repeat
, /* sq_repeat */
5839 (intargfunc
) unicode_getitem
, /* sq_item */
5840 (intintargfunc
) unicode_slice
, /* sq_slice */
5841 0, /* sq_ass_item */
5842 0, /* sq_ass_slice */
5843 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
5847 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
5849 if (PyInt_Check(item
)) {
5850 long i
= PyInt_AS_LONG(item
);
5852 i
+= PyString_GET_SIZE(self
);
5853 return unicode_getitem(self
, i
);
5854 } else if (PyLong_Check(item
)) {
5855 long i
= PyLong_AsLong(item
);
5856 if (i
== -1 && PyErr_Occurred())
5859 i
+= PyString_GET_SIZE(self
);
5860 return unicode_getitem(self
, i
);
5861 } else if (PySlice_Check(item
)) {
5862 int start
, stop
, step
, slicelength
, cur
, i
;
5863 Py_UNICODE
* source_buf
;
5864 Py_UNICODE
* result_buf
;
5867 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyString_GET_SIZE(self
),
5868 &start
, &stop
, &step
, &slicelength
) < 0) {
5872 if (slicelength
<= 0) {
5873 return PyUnicode_FromUnicode(NULL
, 0);
5875 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
5876 result_buf
= PyMem_MALLOC(slicelength
*sizeof(Py_UNICODE
));
5878 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
5879 result_buf
[i
] = source_buf
[cur
];
5882 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
5883 PyMem_FREE(result_buf
);
5887 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
/* Mapping protocol table for unicode: length and subscript lookup are
   provided; the assignment slot is 0, so subscript assignment is not. */
5892 static PyMappingMethods unicode_as_mapping
= {
5893 (inquiry
)unicode_length
, /* mp_length */
5894 (binaryfunc
)unicode_subscript
, /* mp_subscript */
5895 (objobjargproc
)0, /* mp_ass_subscript */
/* Read-buffer accessor: stores the address of self's internal character
   array in *ptr and returns its size in bytes (PyUnicode_GET_DATA_SIZE).
   A SystemError "accessing non-existent unicode segment" is raised on the
   failure path.
   NOTE(review): the rest of the parameter list and the condition guarding
   the error are not visible in this extract. */
5899 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
5904 PyErr_SetString(PyExc_SystemError
,
5905 "accessing non-existent unicode segment");
5908 *ptr
= (void *) self
->str
;
5909 return PyUnicode_GET_DATA_SIZE(self
);
/* Write-buffer accessor: the visible code only raises TypeError "cannot
   use unicode as modifiable buffer"; no writable buffer is exposed.
   NOTE(review): the return statement is not visible in this extract. */
5913 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
5916 PyErr_SetString(PyExc_TypeError
,
5917 "cannot use unicode as modifiable buffer");
/* Segment-count accessor: stores the total data size in bytes
   (PyUnicode_GET_DATA_SIZE) in *lenp.
   NOTE(review): the guard around that store and the returned segment
   count are not visible in this extract. */
5922 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
5926 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
/* Character-buffer accessor: obtains the default-encoded string object
   for self via _PyUnicode_AsDefaultEncodedString(), stores the address of
   its bytes in *ptr and returns its length.
   A SystemError "accessing non-existent unicode segment" is raised on the
   failure path.
   NOTE(review): the condition guarding the error and the NULL check on
   the encoded object are not visible in this extract. */
5931 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
5938 PyErr_SetString(PyExc_SystemError
,
5939 "accessing non-existent unicode segment");
5942 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
5945 *ptr
= (void *) PyString_AS_STRING(str
);
5946 return PyString_GET_SIZE(str
);
5949 /* Helpers for PyUnicode_Format() */
/* Fetches the next format argument.
   args     - the argument object; indexed with PyTuple_GetItem() on the
              visible path
   arglen   - number of available arguments
   p_argidx - in/out cursor holding the index of the next argument
   When the cursor is past the last argument, TypeError "not enough
   arguments for format string" is raised.
   NOTE(review): the cursor update and any non-tuple path are not visible
   in this extract. */
5952 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
5954 int argidx
= *p_argidx
;
5955 if (argidx
< arglen
) {
5960 return PyTuple_GetItem(args
, argidx
);
5962 PyErr_SetString(PyExc_TypeError
,
5963 "not enough arguments for format string");
/* Bit flags collected while PyUnicode_Format() parses a conversion
   specifier.  Each one corresponds to a flag character in the format:
   '-' F_LJUST, '+' F_SIGN, ' ' F_BLANK, '#' F_ALT, '0' F_ZERO. */
5967 #define F_LJUST (1<<0)
5968 #define F_SIGN (1<<1)
5969 #define F_BLANK (1<<2)
5970 #define F_ALT (1<<3)
5971 #define F_ZERO (1<<4)
/* sprintf() variant that writes into a Py_UNICODE buffer.
   The varargs are first formatted with vsprintf() into the same memory,
   viewed as a char array; the resulting chars are then widened in place
   to Py_UNICODE.  The widening loop runs from the last character down to
   index 0, so chars that have not been read yet are not overwritten by
   the wider elements.
   NOTE(review): no buffer size is passed in, so callers presumably
   guarantee that buffer is large enough; the return statement is not
   visible in this extract. */
5974 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
5980 va_start(va
, format
);
5982 /* First, format the string as char array, then expand to Py_UNICODE
5984 charbuffer
= (char *)buffer
;
5985 len
= vsprintf(charbuffer
, format
, va
);
5986 for (i
= len
- 1; i
>= 0; i
--)
5987 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
5993 /* XXX To save some code duplication, formatfloat/long/int could have been
5994 shared with stringobject.c, converting from 8-bit to Unicode after the
5995 formatting is done. */
/* Formats a float argument into the Py_UNICODE buffer buf.
   The value is obtained with PyFloat_AsDouble().  A printf-style format
   string is assembled in fmt from the F_ALT flag, the precision and the
   type character, and the value is written with usprintf().
   Before formatting, the buffer length is checked against a worst-case
   size (10 + prec for 'g', 53 + prec for 'f'); if the buffer is not
   strictly larger, OverflowError "formatted float is too long (precision
   too large?)" is raised.
   NOTE(review): the rest of the parameter list, the statement controlled
   by the fabs() test (which fires for 'f' when |x| >= 1e50), and the
   error returns are not visible in this extract. */
5998 formatfloat(Py_UNICODE
*buf
,
6005 /* fmt = '%#.' + `prec` + `type`
6006 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6010 x
= PyFloat_AsDouble(v
);
6011 if (x
== -1.0 && PyErr_Occurred())
6015 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
6017 /* Worst case length calc to ensure no buffer overrun:
6021 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6022 for any double rep.)
6023 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6026 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6027 len = 1 + 50 + 1 + prec = 52 + prec
6029 If prec=0 the effective precision is 1 (the leading digit is
6030 always given), therefore increase the length by one.
6033 if ((type
== 'g' && buflen
<= (size_t)10 + (size_t)prec
) ||
6034 (type
== 'f' && buflen
<= (size_t)53 + (size_t)prec
)) {
6035 PyErr_SetString(PyExc_OverflowError
,
6036 "formatted float is too long (precision too large?)");
6039 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
6040 (flags
&F_ALT
) ? "#" : "",
6042 return usprintf(buf
, fmt
, x
);
/* Formats a Python long for the % operator.
   The 8-bit formatting is done by _PyString_FormatLong(), which hands
   back a temporary string object together with a buffer pointer and a
   length.  A new unicode object of that length is then allocated and
   each byte of the buffer is copied into it as one Py_UNICODE, followed
   by a terminating 0.
   NOTE(review): the NULL checks and the release of the temporary string
   are not visible in this extract. */
6046 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
6050 PyObject
*str
; /* temporary string object. */
6051 PyUnicodeObject
*result
;
6053 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
6056 result
= _PyUnicode_New(len
);
6057 for (i
= 0; i
< len
; i
++)
6058 result
->str
[i
] = buf
[i
];
6059 result
->str
[len
] = 0;
6061 return (PyObject
*)result
;
/* Formats a plain int argument into the Py_UNICODE buffer buf.
   The value is obtained with PyInt_AsLong().  A negative value combined
   with a conversion other than 'd' or 'i' triggers a FutureWarning.  The
   buffer length must exceed both 13 and 2 + prec, otherwise OverflowError
   "formatted integer is too long (precision too large?)" is raised.
   The printf-style format is assembled in fmt: for %#x and %#X the
   "0x"/"0X" prefix is written by this code rather than left to the C
   library (see the comment in the body); otherwise F_ALT maps to "#".
   The value is then written with usprintf().
   NOTE(review): the rest of the parameter list and the error returns are
   not visible in this extract. */
6065 formatint(Py_UNICODE
*buf
,
6072 /* fmt = '%#.' + `prec` + 'l' + `type`
6073 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6077 char fmt
[64]; /* plenty big enough! */
6080 x
= PyInt_AsLong(v
);
6081 if (x
== -1 && PyErr_Occurred())
6083 if (x
< 0 && type
!= 'd' && type
!= 'i') {
6084 if (PyErr_Warn(PyExc_FutureWarning
,
6085 "%u/%o/%x/%X of negative int will return "
6086 "a signed string in Python 2.4 and up") < 0)
6092 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
6093 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6095 if (buflen
<= 13 || buflen
<= (size_t)2 + (size_t)prec
) {
6096 PyErr_SetString(PyExc_OverflowError
,
6097 "formatted integer is too long (precision too large?)");
6101 if ((flags
& F_ALT
) &&
6102 (type
== 'x' || type
== 'X')) {
6103 /* When converting under %#x or %#X, there are a number
6104 * of issues that cause pain:
6105 * - when 0 is being converted, the C standard leaves off
6106 * the '0x' or '0X', which is inconsistent with other
6107 * %#x/%#X conversions and inconsistent with Python's
6109 * - there are platforms that violate the standard and
6110 * convert 0 with the '0x' or '0X'
6111 * (Metrowerks, Compaq Tru64)
6112 * - there are platforms that give '0x' when converting
6113 * under %#X, but convert 0 in accordance with the
6114 * standard (OS/2 EMX)
6116 * We can achieve the desired consistency by inserting our
6117 * own '0x' or '0X' prefix, and substituting %x/%X in place
6120 * Note that this is the same approach as used in
6121 * formatint() in stringobject.c
6123 PyOS_snprintf(fmt
, sizeof(fmt
), "0%c%%.%dl%c",
6127 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%dl%c",
6128 (flags
&F_ALT
) ? "#" : "",
6131 return usprintf(buf
, fmt
, x
);
/* Formats the argument of a %c conversion into buf[0].
   Accepted inputs on the visible paths:
     - a unicode object of length 1: its single character is copied;
     - a str object of length 1: its single byte is widened to Py_UNICODE;
     - otherwise the value is converted with PyInt_AsLong() and must lie
       in 0..0x10ffff on wide builds (Py_UNICODE_WIDE) or 0..0xffff on
       narrow builds, else OverflowError is raised.
   TypeError "%c requires int or char" is raised on the failure path.
   NOTE(review): the rest of the parameter list, the branches taken when
   the length checks fail and the return statements are not visible in
   this extract. */
6135 formatchar(Py_UNICODE
*buf
,
6139 /* presume that the buffer is at least 2 characters long */
6140 if (PyUnicode_Check(v
)) {
6141 if (PyUnicode_GET_SIZE(v
) != 1)
6143 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
6146 else if (PyString_Check(v
)) {
6147 if (PyString_GET_SIZE(v
) != 1)
6149 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
6153 /* Integer input truncated to a character */
6155 x
= PyInt_AsLong(v
);
6156 if (x
== -1 && PyErr_Occurred())
6158 #ifdef Py_UNICODE_WIDE
6159 if (x
< 0 || x
> 0x10ffff) {
6160 PyErr_SetString(PyExc_OverflowError
,
6161 "%c arg not in range(0x110000) "
6162 "(wide Python build)");
6166 if (x
< 0 || x
> 0xffff) {
6167 PyErr_SetString(PyExc_OverflowError
,
6168 "%c arg not in range(0x10000) "
6169 "(narrow Python build)");
6173 buf
[0] = (Py_UNICODE
) x
;
6179 PyErr_SetString(PyExc_TypeError
,
6180 "%c requires int or char");
/* FORMATBUFLEN is the element count of the formatbuf scratch array that
   PyUnicode_Format() declares for the float, int and char formatters. */
6184 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6186 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6187 chars are formatted. XXX This is a magic number. Each formatting
6188 routine does bounds checking to ensure no overflow, but a better
6189 solution may be to malloc a buffer of appropriate size for each
6190 format. For now, the current solution is sufficient.
6192 #define FORMATBUFLEN (size_t)120
6194 PyObject
*PyUnicode_Format(PyObject
*format
,
6197 Py_UNICODE
*fmt
, *res
;
6198 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
6200 PyUnicodeObject
*result
= NULL
;
6201 PyObject
*dict
= NULL
;
6204 if (format
== NULL
|| args
== NULL
) {
6205 PyErr_BadInternalCall();
6208 uformat
= PyUnicode_FromObject(format
);
6209 if (uformat
== NULL
)
6211 fmt
= PyUnicode_AS_UNICODE(uformat
);
6212 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
6214 reslen
= rescnt
= fmtcnt
+ 100;
6215 result
= _PyUnicode_New(reslen
);
6218 res
= PyUnicode_AS_UNICODE(result
);
6220 if (PyTuple_Check(args
)) {
6221 arglen
= PyTuple_Size(args
);
6228 if (args
->ob_type
->tp_as_mapping
&& !PyTuple_Check(args
) &&
6229 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
6232 while (--fmtcnt
>= 0) {
6235 rescnt
= fmtcnt
+ 100;
6237 if (_PyUnicode_Resize(&result
, reslen
) < 0)
6239 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
6245 /* Got a format specifier */
6249 Py_UNICODE c
= '\0';
6252 PyObject
*temp
= NULL
;
6256 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
6260 Py_UNICODE
*keystart
;
6266 PyErr_SetString(PyExc_TypeError
,
6267 "format requires a mapping");
6273 /* Skip over balanced parentheses */
6274 while (pcount
> 0 && --fmtcnt
>= 0) {
6277 else if (*fmt
== '(')
6281 keylen
= fmt
- keystart
- 1;
6282 if (fmtcnt
< 0 || pcount
> 0) {
6283 PyErr_SetString(PyExc_ValueError
,
6284 "incomplete format key");
6288 /* keys are converted to strings using UTF-8 and
6289 then looked up since Python uses strings to hold
6290 variables names etc. in its namespaces and we
6291 wouldn't want to break common idioms. */
6292 key
= PyUnicode_EncodeUTF8(keystart
,
6296 key
= PyUnicode_FromUnicode(keystart
, keylen
);
6304 args
= PyObject_GetItem(dict
, key
);
6313 while (--fmtcnt
>= 0) {
6314 switch (c
= *fmt
++) {
6315 case '-': flags
|= F_LJUST
; continue;
6316 case '+': flags
|= F_SIGN
; continue;
6317 case ' ': flags
|= F_BLANK
; continue;
6318 case '#': flags
|= F_ALT
; continue;
6319 case '0': flags
|= F_ZERO
; continue;
6324 v
= getnextarg(args
, arglen
, &argidx
);
6327 if (!PyInt_Check(v
)) {
6328 PyErr_SetString(PyExc_TypeError
,
6332 width
= PyInt_AsLong(v
);
6340 else if (c
>= '0' && c
<= '9') {
6342 while (--fmtcnt
>= 0) {
6344 if (c
< '0' || c
> '9')
6346 if ((width
*10) / 10 != width
) {
6347 PyErr_SetString(PyExc_ValueError
,
6351 width
= width
*10 + (c
- '0');
6359 v
= getnextarg(args
, arglen
, &argidx
);
6362 if (!PyInt_Check(v
)) {
6363 PyErr_SetString(PyExc_TypeError
,
6367 prec
= PyInt_AsLong(v
);
6373 else if (c
>= '0' && c
<= '9') {
6375 while (--fmtcnt
>= 0) {
6376 c
= Py_CHARMASK(*fmt
++);
6377 if (c
< '0' || c
> '9')
6379 if ((prec
*10) / 10 != prec
) {
6380 PyErr_SetString(PyExc_ValueError
,
6384 prec
= prec
*10 + (c
- '0');
6389 if (c
== 'h' || c
== 'l' || c
== 'L') {
6395 PyErr_SetString(PyExc_ValueError
,
6396 "incomplete format");
6400 v
= getnextarg(args
, arglen
, &argidx
);
6410 /* presume that buffer length is at least 1 */
6417 if (PyUnicode_Check(v
) && c
== 's') {
6424 temp
= PyObject_Str(v
);
6426 temp
= PyObject_Repr(v
);
6429 if (!PyString_Check(temp
)) {
6430 /* XXX Note: this should never happen, since
6431 PyObject_Repr() and PyObject_Str() assure
6434 PyErr_SetString(PyExc_TypeError
,
6435 "%s argument has non-string str()");
6438 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
6439 PyString_GET_SIZE(temp
),
6447 pbuf
= PyUnicode_AS_UNICODE(temp
);
6448 len
= PyUnicode_GET_SIZE(temp
);
6449 if (prec
>= 0 && len
> prec
)
6461 if (PyLong_Check(v
)) {
6462 temp
= formatlong(v
, flags
, prec
, c
);
6465 pbuf
= PyUnicode_AS_UNICODE(temp
);
6466 len
= PyUnicode_GET_SIZE(temp
);
6467 /* unbounded ints can always produce
6468 a sign character! */
6473 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
6477 /* only d conversion is signed */
6490 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
6501 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
6507 PyErr_Format(PyExc_ValueError
,
6508 "unsupported format character '%c' (0x%x) "
6510 (31<=c
&& c
<=126) ? (char)c
: '?',
6512 (int)(fmt
-1 - PyUnicode_AS_UNICODE(uformat
)));
6516 if (*pbuf
== '-' || *pbuf
== '+') {
6520 else if (flags
& F_SIGN
)
6522 else if (flags
& F_BLANK
)
6529 if (rescnt
- (sign
!= 0) < width
) {
6531 rescnt
= width
+ fmtcnt
+ 100;
6535 return PyErr_NoMemory();
6537 if (_PyUnicode_Resize(&result
, reslen
) < 0)
6539 res
= PyUnicode_AS_UNICODE(result
)
6549 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
6550 assert(pbuf
[0] == '0');
6551 assert(pbuf
[1] == c
);
6562 if (width
> len
&& !(flags
& F_LJUST
)) {
6566 } while (--width
> len
);
6571 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
6572 assert(pbuf
[0] == '0');
6573 assert(pbuf
[1] == c
);
6578 Py_UNICODE_COPY(res
, pbuf
, len
);
6581 while (--width
>= len
) {
6585 if (dict
&& (argidx
< arglen
) && c
!= '%') {
6586 PyErr_SetString(PyExc_TypeError
,
6587 "not all arguments converted during string formatting");
6593 if (argidx
< arglen
&& !dict
) {
6594 PyErr_SetString(PyExc_TypeError
,
6595 "not all arguments converted during string formatting");
6603 if (_PyUnicode_Resize(&result
, reslen
- rescnt
))
6605 return (PyObject
*)result
;
/* Buffer-interface slot table for the unicode type.  PyUnicode_Type
   points at this table through its tp_as_buffer slot.  The entries
   are, in order, the read-buffer, write-buffer, segment-count and
   char-buffer handlers, each cast to the function-pointer type of
   the slot it fills. */
6616 static PyBufferProcs unicode_as_buffer
= {
6617 (getreadbufferproc
) unicode_buffer_getreadbuf
,
6618 (getwritebufferproc
) unicode_buffer_getwritebuf
,
6619 (getsegcountproc
) unicode_buffer_getsegcount
,
6620 (getcharbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new() calls this function for any
   requested type other than PyUnicode_Type, and the definition only
   appears after unicode_new(). */
6624 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
/* Constructor for unicode objects; installed in the tp_new slot of
   PyUnicode_Type.

   type  -- the type being instantiated.  Anything other than
            PyUnicode_Type itself is handed to unicode_subtype_new().
   args, kwds -- parsed with the format "|Oss:unicode" against the
            keyword names "string", "encoding" and "errors"; all three
            are optional.

   Returns the result of whichever constructor call below is reached.

   NOTE(review): the declaration of x and the statements guarding the
   _PyUnicode_New(0) return are not visible in this listing.
   Presumably a failed parse returns NULL and the empty object is
   returned when no "string" argument was supplied -- confirm against
   the full source. */
6627 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
6630 static char *kwlist
[] = {"string", "encoding", "errors", 0};
6631 char *encoding
= NULL
;
6632 char *errors
= NULL
;
/* Subtypes take a separate path that copies a plain unicode object
   into a freshly allocated instance of the subtype. */
6634 if (type
!= &PyUnicode_Type
)
6635 return unicode_subtype_new(type
, args
, kwds
);
6636 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
6637 kwlist
, &x
, &encoding
, &errors
))
6640 return (PyObject
*)_PyUnicode_New(0);
/* With neither encoding nor errors given, x is converted with
   PyObject_Unicode(); otherwise it is decoded with the supplied
   encoding/errors via PyUnicode_FromEncodedObject(). */
6641 if (encoding
== NULL
&& errors
== NULL
)
6642 return PyObject_Unicode(x
);
6644 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
/* Construct an instance of a subtype of unicode.

   A plain unicode object is first built by calling unicode_new() with
   PyUnicode_Type and the caller's args/kwds.  An instance of `type` is
   then allocated through type->tp_alloc, given its own character
   buffer, and filled from the temporary object's buffer and hash.

   Returns the new instance cast to PyObject *, or the result of
   PyErr_NoMemory() when the character buffer cannot be allocated.

   NOTE(review): the NULL checks after unicode_new()/tp_alloc and the
   release of tmp (and of pnew on the failure path) are not visible in
   this listing -- confirm that tmp is released on every exit path. */
6648 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
6650 PyUnicodeObject
*tmp
, *pnew
;
6653 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
6654 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
6657 assert(PyUnicode_Check(tmp
));
/* n is assigned inside the argument list: it receives tmp->length and
   that same value is passed to tp_alloc as the item count. */
6658 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
/* The buffer holds n+1 elements and n+1 elements are copied below;
   presumably the extra element is the terminating null -- confirm. */
6661 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
6662 if (pnew
->str
== NULL
) {
6663 _Py_ForgetReference((PyObject
*)pnew
);
6665 return PyErr_NoMemory();
6667 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
/* Carry over the hash value stored on the temporary object. */
6669 pnew
->hash
= tmp
->hash
;
6671 return (PyObject
*)pnew
;
/* Docstring for the unicode type; PyUnicode_Type installs it in its
   tp_doc slot. */
6674 PyDoc_STRVAR(unicode_doc
,
6675 "unicode(string [, encoding[, errors]]) -> object\n\
6677 Create a new Unicode object from the given encoded string.\n\
6678 encoding defaults to the current default string encoding.\n\
6679 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for "unicode".

   The initializer is positional: each entry fills the PyTypeObject
   slot named in the trailing comment, so entries must stay in slot
   order.  The type derives from PyBaseString_Type (tp_base), is
   created through unicode_new (tp_new) and released with PyObject_Del
   (tp_free).  Py_TPFLAGS_BASETYPE is set; unicode_new() routes
   subtype construction to unicode_subtype_new(). */
6681 PyTypeObject PyUnicode_Type
= {
6682 PyObject_HEAD_INIT(&PyType_Type
)
6684 "unicode", /* tp_name */
6685 sizeof(PyUnicodeObject
), /* tp_basicsize */
6686 0, /* tp_itemsize */
6688 (destructor
)unicode_dealloc
, /* tp_dealloc */
6692 (cmpfunc
) unicode_compare
, /* tp_compare */
6693 (reprfunc
) unicode_repr
, /* tp_repr */
6694 &unicode_as_number
, /* tp_as_number */
6695 &unicode_as_sequence
, /* tp_as_sequence */
6696 &unicode_as_mapping
, /* tp_as_mapping */
6697 (hashfunc
) unicode_hash
, /* tp_hash */
6699 (reprfunc
) unicode_str
, /* tp_str */
6700 PyObject_GenericGetAttr
, /* tp_getattro */
6701 0, /* tp_setattro */
6702 &unicode_as_buffer
, /* tp_as_buffer */
6703 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
6704 Py_TPFLAGS_BASETYPE
, /* tp_flags */
6705 unicode_doc
, /* tp_doc */
6706 0, /* tp_traverse */
6708 0, /* tp_richcompare */
6709 0, /* tp_weaklistoffset */
6711 0, /* tp_iternext */
6712 unicode_methods
, /* tp_methods */
6715 &PyBaseString_Type
, /* tp_base */
6717 0, /* tp_descr_get */
6718 0, /* tp_descr_set */
6719 0, /* tp_dictoffset */
6722 unicode_new
, /* tp_new */
6723 PyObject_Del
, /* tp_free */
6726 /* Initialize the Unicode implementation.

   Resets the free list and its size counter, creates the shared
   empty object, sets the default encoding to "ascii", clears all 256
   entries of unicode_latin1 and readies PyUnicode_Type.  Takes no
   arguments and returns nothing; a PyType_Ready() failure is reported
   through Py_FatalError(). */
6728 void _PyUnicode_Init(void)
6732 /* Init the implementation */
6733 unicode_freelist
= NULL
;
6734 unicode_freelist_size
= 0;
/* NOTE(review): the result of _PyUnicode_New(0) is stored without a
   NULL check before the next statement runs. */
6735 unicode_empty
= _PyUnicode_New(0);
6736 strcpy(unicode_default_encoding
, "ascii");
6737 for (i
= 0; i
< 256; i
++)
6738 unicode_latin1
[i
] = NULL
;
6739 if (PyType_Ready(&PyUnicode_Type
) < 0)
6740 Py_FatalError("Can't initialize 'unicode'");
6743 /* Finalize the Unicode implementation */
6746 _PyUnicode_Fini(void)
6751 Py_XDECREF(unicode_empty
);
6752 unicode_empty
= NULL
;
6754 for (i
= 0; i
< 256; i
++) {
6755 if (unicode_latin1
[i
]) {
6756 Py_DECREF(unicode_latin1
[i
]);
6757 unicode_latin1
[i
] = NULL
;
6761 for (u
= unicode_freelist
; u
!= NULL
;) {
6762 PyUnicodeObject
*v
= u
;
6763 u
= *(PyUnicodeObject
**)u
;
6766 Py_XDECREF(v
->defenc
);
6769 unicode_freelist
= NULL
;
6770 unicode_freelist_size
= 0;