3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
95 static PyUnicodeObject
*unicode_latin1
[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding
[100];
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 unicode
->str
[0] < 256 &&
136 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
137 PyErr_SetString(PyExc_SystemError
,
138 "can't resize shared unicode objects");
142 /* We allocate one more Py_UNICODE element to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr
= unicode
->str
;
145 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
147 unicode
->str
= oldstr
;
151 unicode
->str
[length
] = 0;
152 unicode
->length
= length
;
155 /* Reset the object caches */
156 if (unicode
->defenc
) {
157 Py_DECREF(unicode
->defenc
);
158 unicode
->defenc
= NULL
;
165 /* We allocate one more Py_UNICODE element to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
174 PyUnicodeObject
*_PyUnicode_New(int length
)
176 register PyUnicodeObject
*unicode
;
178 /* Optimization for empty strings */
179 if (length
== 0 && unicode_empty
!= NULL
) {
180 Py_INCREF(unicode_empty
);
181 return unicode_empty
;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist
) {
186 unicode
= unicode_freelist
;
187 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
188 unicode_freelist_size
--;
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode
->length
< length
) &&
193 unicode_resize(unicode
, length
)) {
194 PyMem_DEL(unicode
->str
);
199 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
201 PyObject_INIT(unicode
, &PyUnicode_Type
);
204 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
207 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
214 unicode
->str
[length
] = 0;
215 unicode
->length
= length
;
217 unicode
->defenc
= NULL
;
221 _Py_ForgetReference((PyObject
*)unicode
);
222 PyObject_DEL(unicode
);
227 void _PyUnicode_Free(register PyUnicodeObject
*unicode
)
229 if (unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
230 /* Keep-Alive optimization */
231 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
232 PyMem_DEL(unicode
->str
);
236 if (unicode
->defenc
) {
237 Py_DECREF(unicode
->defenc
);
238 unicode
->defenc
= NULL
;
240 /* Add to free list */
241 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
242 unicode_freelist
= unicode
;
243 unicode_freelist_size
++;
246 PyMem_DEL(unicode
->str
);
247 Py_XDECREF(unicode
->defenc
);
248 PyObject_DEL(unicode
);
252 int PyUnicode_Resize(PyObject
**unicode
,
255 register PyUnicodeObject
*v
;
257 /* Argument checks */
258 if (unicode
== NULL
) {
259 PyErr_BadInternalCall();
262 v
= (PyUnicodeObject
*)*unicode
;
263 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
264 PyErr_BadInternalCall();
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v
->length
!= length
&&
272 (v
== unicode_empty
|| v
->length
== 1)) {
273 PyUnicodeObject
*w
= _PyUnicode_New(length
);
276 Py_UNICODE_COPY(w
->str
, v
->str
,
277 length
< v
->length
? length
: v
->length
);
278 *unicode
= (PyObject
*)w
;
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v
, length
);
287 /* Internal API for use in unicodeobject.c only: forwards to PyUnicode_Resize() with unicodevar cast to PyObject ** */
288 #define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
294 PyUnicodeObject
*unicode
;
296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
300 /* Optimization for empty strings */
301 if (size
== 0 && unicode_empty
!= NULL
) {
302 Py_INCREF(unicode_empty
);
303 return (PyObject
*)unicode_empty
;
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size
== 1 && *u
< 256) {
309 unicode
= unicode_latin1
[*u
];
311 unicode
= _PyUnicode_New(1);
314 unicode
->str
[0] = *u
;
315 unicode_latin1
[*u
] = unicode
;
318 return (PyObject
*)unicode
;
322 unicode
= _PyUnicode_New(size
);
326 /* Copy the Unicode data into the new object */
328 Py_UNICODE_COPY(unicode
->str
, u
, size
);
330 return (PyObject
*)unicode
;
335 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
338 PyUnicodeObject
*unicode
;
341 PyErr_BadInternalCall();
345 unicode
= _PyUnicode_New(size
);
349 /* Copy the wchar_t data into the new object */
350 #ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
354 register Py_UNICODE
*u
;
356 u
= PyUnicode_AS_UNICODE(unicode
);
357 for (i
= size
; i
>= 0; i
--)
362 return (PyObject
*)unicode
;
365 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
369 if (unicode
== NULL
) {
370 PyErr_BadInternalCall();
373 if (size
> PyUnicode_GET_SIZE(unicode
))
374 size
= PyUnicode_GET_SIZE(unicode
);
375 #ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
379 register Py_UNICODE
*u
;
381 u
= PyUnicode_AS_UNICODE(unicode
);
382 for (i
= size
; i
>= 0; i
--)
392 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
394 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
397 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
398 const char *encoding
,
401 const char *s
= NULL
;
408 PyErr_BadInternalCall();
413 for (reclevel
= 0; reclevel
< 2; reclevel
++) {
415 if (PyUnicode_Check(obj
)) {
417 PyErr_SetString(PyExc_TypeError
,
418 "decoding Unicode is not supported");
421 if (PyUnicode_CheckExact(obj
)) {
426 /* For a subclass of unicode, return a true unicode object
427 with the same string value. */
428 v
= PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
429 PyUnicode_GET_SIZE(obj
));
433 else if (PyString_Check(obj
)) {
434 s
= PyString_AS_STRING(obj
);
435 len
= PyString_GET_SIZE(obj
);
441 /* Try char buffer interface */
442 if (PyObject_AsCharBuffer(obj
, &s
, &len
))
447 /* Mimic the behaviour of str(object) if everything else
448 fails (see PyObject_Str()); this also covers instances
449 which implement __str__. */
450 if (obj
->ob_type
->tp_str
== NULL
)
451 w
= PyObject_Repr(obj
);
453 w
= (*obj
->ob_type
->tp_str
)(obj
);
465 PyErr_Format(PyExc_TypeError
,
466 "coercing to Unicode: __str__ recursion limit exceeded "
467 "(last type: %.80s)",
468 obj
->ob_type
->tp_name
);
472 /* Convert to Unicode */
474 Py_INCREF(unicode_empty
);
475 v
= (PyObject
*)unicode_empty
;
478 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
493 PyObject
*PyUnicode_Decode(const char *s
,
495 const char *encoding
,
498 PyObject
*buffer
= NULL
, *unicode
;
500 if (encoding
== NULL
)
501 encoding
= PyUnicode_GetDefaultEncoding();
503 /* Shortcuts for common default encodings */
504 if (strcmp(encoding
, "utf-8") == 0)
505 return PyUnicode_DecodeUTF8(s
, size
, errors
);
506 else if (strcmp(encoding
, "latin-1") == 0)
507 return PyUnicode_DecodeLatin1(s
, size
, errors
);
508 else if (strcmp(encoding
, "ascii") == 0)
509 return PyUnicode_DecodeASCII(s
, size
, errors
);
511 /* Decode via the codec registry */
512 buffer
= PyBuffer_FromMemory((void *)s
, size
);
515 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
518 if (!PyUnicode_Check(unicode
)) {
519 PyErr_Format(PyExc_TypeError
,
520 "decoder did not return an unicode object (type=%.400s)",
521 unicode
->ob_type
->tp_name
);
533 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
535 const char *encoding
,
538 PyObject
*v
, *unicode
;
540 unicode
= PyUnicode_FromUnicode(s
, size
);
543 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
548 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
549 const char *encoding
,
554 if (!PyUnicode_Check(unicode
)) {
559 if (encoding
== NULL
)
560 encoding
= PyUnicode_GetDefaultEncoding();
562 /* Shortcuts for common default encodings */
563 if (errors
== NULL
) {
564 if (strcmp(encoding
, "utf-8") == 0)
565 return PyUnicode_AsUTF8String(unicode
);
566 else if (strcmp(encoding
, "latin-1") == 0)
567 return PyUnicode_AsLatin1String(unicode
);
568 else if (strcmp(encoding
, "ascii") == 0)
569 return PyUnicode_AsASCIIString(unicode
);
572 /* Encode via the codec registry */
573 v
= PyCodec_Encode(unicode
, encoding
, errors
);
576 /* XXX Should we really enforce this ? */
577 if (!PyString_Check(v
)) {
578 PyErr_Format(PyExc_TypeError
,
579 "encoder did not return a string object (type=%.400s)",
580 v
->ob_type
->tp_name
);
590 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
593 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
597 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
598 if (v
&& errors
== NULL
)
599 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
603 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
605 if (!PyUnicode_Check(unicode
)) {
609 return PyUnicode_AS_UNICODE(unicode
);
615 int PyUnicode_GetSize(PyObject
*unicode
)
617 if (!PyUnicode_Check(unicode
)) {
621 return PyUnicode_GET_SIZE(unicode
);
627 const char *PyUnicode_GetDefaultEncoding(void)
629 return unicode_default_encoding
;
632 int PyUnicode_SetDefaultEncoding(const char *encoding
)
636 /* Make sure the encoding is valid. As side effect, this also
637 loads the encoding into the codec registry cache. */
638 v
= _PyCodec_Lookup(encoding
);
642 strncpy(unicode_default_encoding
,
644 sizeof(unicode_default_encoding
));
651 /* --- UTF-7 Codec -------------------------------------------------------- */
653 /* see RFC2152 for details */
656 char utf7_special
[128] = {
657 /* indicate whether a UTF-7 character is special i.e. cannot be directly
661 2 - whitespace (optional)
662 3 - RFC2152 Set O (optional) */
663 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
664 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
665 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
666 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
667 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
668 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
669 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
670 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
/* SPECIAL(c, encodeO, encodeWS): non-zero when character c may not be
   written directly in UTF-7 output.

   c         character code; values above 127 are always special, lower
             values are classified through the utf7_special table
             (c is used as a table index, so it must not be negative)
   encodeO   if true, table class 3 (RFC2152 Set O) also counts as special
   encodeWS  if true, table class 2 (whitespace) also counts as special

   The flag arguments are parenthesized so that compound expressions
   such as "a || b" are evaluated as a unit before being combined with
   the table test.  c is evaluated more than once; do not pass
   expressions with side effects. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
/* Base64 helpers for the UTF-7 codec (see RFC2152).

   B64(n)      base64 alphabet character for the low 6 bits of n
   B64CHAR(c)  true if c is a character of the base64 alphabet
   UB64(c)     6-bit value of base64 character c; the result is only
               meaningful when B64CHAR(c) is true

   B64CHAR compares against explicit ranges instead of calling
   isalnum(): isalnum() depends on the current locale and is undefined
   for arguments that are neither EOF nor representable as unsigned
   char, which Py_UNICODE values need not be.  The range tests and the
   UB64 offsets assume an ASCII-compatible execution character set.
   B64CHAR and UB64 evaluate their argument more than once. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
#define UB64(c) \
    ((c) == '+' ? 62 : \
     (c) == '/' ? 63 : \
     (c) >= 'a' ? (c) - 'a' + 26 : \
     (c) >= 'A' ? (c) - 'A' : \
                  (c) - '0' + 52)
684 #define ENCODE(out, ch, bits) \
685 while (bits >= 6) { \
686 *out++ = B64(ch >> (bits-6)); \
690 #define DECODE(out, ch, bits, surrogate) \
691 while (bits >= 16) { \
692 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
695 /* We have already generated an error for the high surrogate
696 so let's not bother seeing if the low surrogate is correct or not */\
698 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
699 /* This is a surrogate pair. Unfortunately we can't represent \
700 it in a 16-bit character */ \
702 errmsg = "code pairs are not supported"; \
710 int utf7_decoding_error(Py_UNICODE
**dest
,
714 if ((errors
== NULL
) ||
715 (strcmp(errors
,"strict") == 0)) {
716 PyErr_Format(PyExc_UnicodeError
,
717 "UTF-7 decoding error: %.400s",
721 else if (strcmp(errors
,"ignore") == 0) {
724 else if (strcmp(errors
,"replace") == 0) {
726 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
732 PyErr_Format(PyExc_ValueError
,
733 "UTF-7 decoding error; unknown error handling code: %.400s",
739 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
744 PyUnicodeObject
*unicode
;
746 const char *errmsg
= "";
748 unsigned int bitsleft
= 0;
749 unsigned long charsleft
= 0;
752 unicode
= _PyUnicode_New(size
);
756 return (PyObject
*)unicode
;
765 if ((ch
== '-') || !B64CHAR(ch
)) {
769 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
771 /* The shift sequence has a partial character in it. If
772 bitsleft < 6 then we could just classify it as padding
773 but that is not the case here */
775 errmsg
= "partial character in shift sequence";
778 /* According to RFC2152 the remaining bits should be zero. We
779 choose to signal an error/insert a replacement character
780 here to indicate the potential of a misencoded character. */
782 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
783 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
784 errmsg
= "non-zero padding bits in shift sequence";
789 if ((s
< e
) && (*(s
) == '-')) {
793 } else if (SPECIAL(ch
,0,0)) {
794 errmsg
= "unexpected special character";
800 charsleft
= (charsleft
<< 6) | UB64(ch
);
803 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
806 else if ( ch
== '+' ) {
808 if (s
< e
&& *s
== '-') {
817 else if (SPECIAL(ch
,0,0)) {
818 errmsg
= "unexpected special character";
828 if (utf7_decoding_error(&p
, errors
, errmsg
))
833 if (utf7_decoding_error(&p
, errors
, "unterminated shift sequence"))
837 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
840 return (PyObject
*)unicode
;
848 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
851 int encodeWhiteSpace
,
855 /* It might be possible to tighten this worst case */
856 unsigned int cbAllocated
= 5 * size
;
859 unsigned int bitsleft
= 0;
860 unsigned long charsleft
= 0;
865 return PyString_FromStringAndSize(NULL
, 0);
867 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
871 start
= out
= PyString_AS_STRING(v
);
872 for (;i
< size
; ++i
) {
873 Py_UNICODE ch
= s
[i
];
879 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
883 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
884 inShift
= bitsleft
> 0;
889 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
890 *out
++ = B64(charsleft
<< (6-bitsleft
));
893 /* Characters not in the BASE64 set implicitly unshift the sequence
894 so no '-' is required, except if the character is itself a '-' */
895 if (B64CHAR(ch
) || ch
== '-') {
902 charsleft
= (charsleft
<< 16) | ch
;
903 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
905 /* If the next character is special then we don't need to terminate
906 the shift sequence. If the next character is not a BASE64 character
907 or '-' then the shift sequence will be terminated implicitly and we
908 don't have to insert a '-'. */
912 Py_UNICODE ch2
= s
[i
+1];
914 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
916 } else if (B64CHAR(ch2
) || ch2
== '-') {
933 *out
++= B64(charsleft
<< (6-bitsleft
) );
937 if (_PyString_Resize(&v
, out
- start
)) {
951 /* --- UTF-8 Codec -------------------------------------------------------- */
954 char utf8_code_length
[256] = {
955 /* Map UTF-8 encoded prefix byte to sequence length. zero means
956 illegal prefix. see RFC 2279 for details */
957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
958 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
959 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
960 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
961 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
962 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
963 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
964 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
965 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
966 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
967 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
968 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
969 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
970 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
971 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
972 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
976 int utf8_decoding_error(const char **source
,
981 if ((errors
== NULL
) ||
982 (strcmp(errors
,"strict") == 0)) {
983 PyErr_Format(PyExc_UnicodeError
,
984 "UTF-8 decoding error: %.400s",
988 else if (strcmp(errors
,"ignore") == 0) {
992 else if (strcmp(errors
,"replace") == 0) {
994 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
999 PyErr_Format(PyExc_ValueError
,
1000 "UTF-8 decoding error; unknown error handling code: %.400s",
1006 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1012 PyUnicodeObject
*unicode
;
1014 const char *errmsg
= "";
1016 /* Note: size will always be longer than the resulting Unicode
1018 unicode
= _PyUnicode_New(size
);
1022 return (PyObject
*)unicode
;
1024 /* Unpack UTF-8 encoded data */
1029 Py_UCS4 ch
= (unsigned char)*s
;
1032 *p
++ = (Py_UNICODE
)ch
;
1037 n
= utf8_code_length
[ch
];
1040 errmsg
= "unexpected end of data";
1047 errmsg
= "unexpected code byte";
1051 errmsg
= "internal error";
1055 if ((s
[1] & 0xc0) != 0x80) {
1056 errmsg
= "invalid data";
1059 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1061 errmsg
= "illegal encoding";
1065 *p
++ = (Py_UNICODE
)ch
;
1069 if ((s
[1] & 0xc0) != 0x80 ||
1070 (s
[2] & 0xc0) != 0x80) {
1071 errmsg
= "invalid data";
1074 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1075 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
1076 errmsg
= "illegal encoding";
1080 *p
++ = (Py_UNICODE
)ch
;
1084 if ((s
[1] & 0xc0) != 0x80 ||
1085 (s
[2] & 0xc0) != 0x80 ||
1086 (s
[3] & 0xc0) != 0x80) {
1087 errmsg
= "invalid data";
1090 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1091 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
1093 if ((ch
< 0x10000) /* minimum value allowed for 4
1095 || (ch
> 0x10ffff)) /* maximum value allowed for
1098 errmsg
= "illegal encoding";
1101 #ifdef Py_UNICODE_WIDE
1102 *p
++ = (Py_UNICODE
)ch
;
1104 /* compute and append the two surrogates: */
1106 /* translate from 10000..10FFFF to 0..FFFF */
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1112 /* low surrogate = bottom 10 bits added to DC00 */
1113 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1118 /* Other sizes are only needed for UCS-4 */
1119 errmsg
= "unsupported Unicode code range";
1126 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
1131 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1134 return (PyObject
*)unicode
;
1141 /* Not used anymore, now that the encoder supports UTF-16
1145 int utf8_encoding_error(const Py_UNICODE
**source
,
1148 const char *details
)
1150 if ((errors
== NULL
) ||
1151 (strcmp(errors
,"strict") == 0)) {
1152 PyErr_Format(PyExc_UnicodeError
,
1153 "UTF-8 encoding error: %.400s",
1157 else if (strcmp(errors
,"ignore") == 0) {
1160 else if (strcmp(errors
,"replace") == 0) {
1166 PyErr_Format(PyExc_ValueError
,
1167 "UTF-8 encoding error; "
1168 "unknown error handling code: %.400s",
1175 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1183 unsigned int cbAllocated
= 3 * size
;
1184 unsigned int cbWritten
= 0;
1187 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1193 p
= q
= PyString_AS_STRING(v
);
1195 Py_UCS4 ch
= s
[i
++];
1200 else if (ch
< 0x0800) {
1201 *p
++ = 0xc0 | (ch
>> 6);
1202 *p
++ = 0x80 | (ch
& 0x3f);
1205 else if (ch
< 0x10000) {
1206 /* Check for high surrogate */
1207 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1210 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1212 if (cbWritten
>= (cbAllocated
- 4)) {
1213 /* Provide enough room for some more
1215 cbAllocated
+= 4*10;
1216 if (_PyString_Resize(&v
, cbAllocated
))
1220 /* combine the two values */
1221 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
1223 *p
++ = (char)((ch
>> 18) | 0xf0);
1224 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1231 *p
++ = (char)(0xe0 | (ch
>> 12));
1234 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1235 *p
++ = (char)(0x80 | (ch
& 0x3f));
1237 *p
++ = 0xf0 | (ch
>>18);
1238 *p
++ = 0x80 | ((ch
>>12) & 0x3f);
1239 *p
++ = 0x80 | ((ch
>>6) & 0x3f);
1240 *p
++ = 0x80 | (ch
& 0x3f);
1245 if (_PyString_Resize(&v
, p
- q
))
1254 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1256 if (!PyUnicode_Check(unicode
)) {
1257 PyErr_BadArgument();
1260 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1261 PyUnicode_GET_SIZE(unicode
),
1265 /* --- UTF-16 Codec ------------------------------------------------------- */
1268 int utf16_decoding_error(Py_UNICODE
**dest
,
1270 const char *details
)
1272 if ((errors
== NULL
) ||
1273 (strcmp(errors
,"strict") == 0)) {
1274 PyErr_Format(PyExc_UnicodeError
,
1275 "UTF-16 decoding error: %.400s",
1279 else if (strcmp(errors
,"ignore") == 0) {
1282 else if (strcmp(errors
,"replace") == 0) {
1284 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1290 PyErr_Format(PyExc_ValueError
,
1291 "UTF-16 decoding error; "
1292 "unknown error handling code: %.400s",
1299 PyUnicode_DecodeUTF16(const char *s
,
1304 PyUnicodeObject
*unicode
;
1306 const unsigned char *q
, *e
;
1307 int bo
= 0; /* assume native ordering by default */
1308 const char *errmsg
= "";
1309 /* Offsets from q for retrieving byte pairs in the right order. */
1310 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1311 int ihi
= 1, ilo
= 0;
1313 int ihi
= 0, ilo
= 1;
1316 /* size should be an even number */
1318 if (utf16_decoding_error(NULL
, errors
, "truncated data"))
1320 --size
; /* else ignore the oddball byte */
1323 /* Note: size will always be longer than the resulting Unicode
1325 unicode
= _PyUnicode_New(size
);
1329 return (PyObject
*)unicode
;
1331 /* Unpack UTF-16 encoded data */
1333 q
= (unsigned char *)s
;
1339 /* Check for BOM marks (U+FEFF) in the input and adjust current
1340 byte order setting accordingly. In native mode, the leading BOM
1341 mark is skipped, in all other modes, it is copied to the output
1342 stream as-is (giving a ZWNBSP character). */
1344 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1345 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1346 if (bom
== 0xFEFF) {
1350 else if (bom
== 0xFFFE) {
1355 if (bom
== 0xFEFF) {
1359 else if (bom
== 0xFFFE) {
1378 Py_UNICODE ch
= (q
[ihi
] << 8) | q
[ilo
];
1381 if (ch
< 0xD800 || ch
> 0xDFFF) {
1386 /* UTF-16 code pair: */
1388 errmsg
= "unexpected end of data";
1391 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1392 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1394 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1395 #ifndef Py_UNICODE_WIDE
1399 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1404 errmsg
= "illegal UTF-16 surrogate";
1409 errmsg
= "illegal encoding";
1410 /* Fall through to report the error */
1413 if (utf16_decoding_error(&p
, errors
, errmsg
))
1421 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1424 return (PyObject
*)unicode
;
1432 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1440 /* Offsets from p for storing byte pairs in the right order. */
1441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi
= 1, ilo
= 0;
1444 int ihi
= 0, ilo
= 1;
1447 #define STORECHAR(CH) \
1449 p[ihi] = ((CH) >> 8) & 0xff; \
1450 p[ilo] = (CH) & 0xff; \
1454 for (i
= pairs
= 0; i
< size
; i
++)
1455 if (s
[i
] >= 0x10000)
1457 v
= PyString_FromStringAndSize(NULL
,
1458 2 * (size
+ pairs
+ (byteorder
== 0)));
1462 p
= (unsigned char *)PyString_AS_STRING(v
);
1468 if (byteorder
== -1) {
1473 else if (byteorder
== 1) {
1479 while (size
-- > 0) {
1480 Py_UNICODE ch
= *s
++;
1482 if (ch
>= 0x10000) {
1483 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1484 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1494 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1496 if (!PyUnicode_Check(unicode
)) {
1497 PyErr_BadArgument();
1500 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1501 PyUnicode_GET_SIZE(unicode
),
1506 /* --- Unicode Escape Codec ----------------------------------------------- */
1509 int unicodeescape_decoding_error(const char **source
,
1512 const char *details
)
1514 if ((errors
== NULL
) ||
1515 (strcmp(errors
,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError
,
1517 "Unicode-Escape decoding error: %.400s",
1521 else if (strcmp(errors
,"ignore") == 0) {
1524 else if (strcmp(errors
,"replace") == 0) {
1525 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1529 PyErr_Format(PyExc_ValueError
,
1530 "Unicode-Escape decoding error; "
1531 "unknown error handling code: %.400s",
1537 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1539 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1544 Py_UNICODE
*p
, *buf
;
1547 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1549 /* Escaped strings are never shorter than the resulting
1550 Unicode string, so we start with size here and then reduce the
1551 length after conversion to the true value. */
1552 v
= _PyUnicode_New(size
);
1556 return (PyObject
*)v
;
1558 p
= buf
= PyUnicode_AS_UNICODE(v
);
1566 /* Non-escape characters are interpreted as Unicode ordinals */
1568 *p
++ = (unsigned char) *s
++;
1578 case '\\': *p
++ = '\\'; break;
1579 case '\'': *p
++ = '\''; break;
1580 case '\"': *p
++ = '\"'; break;
1581 case 'b': *p
++ = '\b'; break;
1582 case 'f': *p
++ = '\014'; break; /* FF */
1583 case 't': *p
++ = '\t'; break;
1584 case 'n': *p
++ = '\n'; break;
1585 case 'r': *p
++ = '\r'; break;
1586 case 'v': *p
++ = '\013'; break; /* VT */
1587 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1589 /* \OOO (octal) escapes */
1590 case '0': case '1': case '2': case '3':
1591 case '4': case '5': case '6': case '7':
1593 if ('0' <= *s
&& *s
<= '7') {
1594 x
= (x
<<3) + *s
++ - '0';
1595 if ('0' <= *s
&& *s
<= '7')
1596 x
= (x
<<3) + *s
++ - '0';
1605 message
= "truncated \\xXX escape";
1611 message
= "truncated \\uXXXX escape";
1617 message
= "truncated \\UXXXXXXXX escape";
1620 for (i
= 0; i
< digits
; i
++) {
1621 c
= (unsigned char) s
[i
];
1623 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1629 chr
= (chr
<<4) & ~0xF;
1630 if (c
>= '0' && c
<= '9')
1632 else if (c
>= 'a' && c
<= 'f')
1633 chr
+= 10 + c
- 'a';
1635 chr
+= 10 + c
- 'A';
1639 /* when we get here, chr is a 32-bit unicode character */
1641 /* UCS-2 character */
1642 *p
++ = (Py_UNICODE
) chr
;
1643 else if (chr
<= 0x10ffff) {
1644 /* UCS-4 character. Either store directly, or as
1646 #ifdef Py_UNICODE_WIDE
1650 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1651 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1654 if (unicodeescape_decoding_error(
1656 "illegal Unicode character")
1659 *p
++ = x
; /* store replacement character */
1665 message
= "malformed \\N character escape";
1666 if (ucnhash_CAPI
== NULL
) {
1667 /* load the unicode data module */
1669 m
= PyImport_ImportModule("unicodedata");
1672 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1676 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1678 if (ucnhash_CAPI
== NULL
)
1682 const char *start
= s
+1;
1683 /* look for the closing brace */
1684 while (*s
!= '}' && s
< end
)
1686 if (s
> start
&& s
< end
&& *s
== '}') {
1687 /* found a name. look it up in the unicode database */
1688 message
= "unknown Unicode character name";
1690 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1694 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1701 *p
++ = (unsigned char)s
[-1];
1705 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1707 return (PyObject
*)v
;
1712 "\\N escapes not supported (can't load unicodedata module)"
1721 /* Return a Unicode-Escape string version of the Unicode object.
1723 If quotes is true, the string is enclosed in u"" or u'' quotes as
1728 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1733 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1740 static const char *hexdigit
= "0123456789abcdef";
1742 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1746 p
= PyString_AS_STRING(repr
);
1750 *p
++ = (findchar(s
, size
, '\'') &&
1751 !findchar(s
, size
, '"')) ? '"' : '\'';
1753 while (size
-- > 0) {
1754 Py_UNICODE ch
= *s
++;
1758 (ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1] || ch
== '\\')) {
1764 #ifdef Py_UNICODE_WIDE
1765 /* Map 21-bit characters to '\U00xxxxxx' */
1766 else if (ch
>= 0x10000) {
1767 int offset
= p
- PyString_AS_STRING(repr
);
1769 /* Resize the string if necessary */
1770 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
1771 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
1773 p
= PyString_AS_STRING(repr
) + offset
;
1778 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
1779 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
1780 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
1781 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
1782 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
1783 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
1784 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
1785 *p
++ = hexdigit
[ch
& 0x0000000F];
1789 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1790 else if (ch
>= 0xD800 && ch
< 0xDC00) {
1796 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
1797 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
1800 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
1801 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
1802 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
1803 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
1804 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
1805 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
1806 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
1807 *p
++ = hexdigit
[ucs
& 0x0000000F];
1810 /* Fall through: isolated surrogates are copied as-is */
1815 /* Map 16-bit characters to '\uxxxx' */
1819 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
1820 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
1821 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1822 *p
++ = hexdigit
[ch
& 0x000F];
1825 /* Map special whitespace to '\t', '\n', '\r' */
1826 else if (ch
== '\t') {
1830 else if (ch
== '\n') {
1834 else if (ch
== '\r') {
1839 /* Map non-printable US ASCII to '\xhh' */
1840 else if (ch
< ' ' || ch
>= 128) {
1843 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1844 *p
++ = hexdigit
[ch
& 0x000F];
1847 /* Copy everything else as-is */
1852 *p
++ = PyString_AS_STRING(repr
)[1];
1855 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
1865 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1868 return unicodeescape_string(s
, size
, 0);
1871 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1873 if (!PyUnicode_Check(unicode
)) {
1874 PyErr_BadArgument();
1877 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1878 PyUnicode_GET_SIZE(unicode
));
1881 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1883 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1888 Py_UNICODE
*p
, *buf
;
1892 /* Escaped strings are never shorter than the resulting
1893 Unicode string, so we start with size here and then reduce the
1894 length after conversion to the true value. */
1895 v
= _PyUnicode_New(size
);
1899 return (PyObject
*)v
;
1900 p
= buf
= PyUnicode_AS_UNICODE(v
);
1907 /* Non-escape characters are interpreted as Unicode ordinals */
1909 *p
++ = (unsigned char)*s
++;
1913 /* \u-escapes are only interpreted iff the number of leading
1914 backslashes is odd */
1919 *p
++ = (unsigned char)*s
++;
1921 if (((s
- bs
) & 1) == 0 ||
1929 /* \uXXXX with 4 hex digits */
1930 for (x
= 0, i
= 0; i
< 4; i
++) {
1931 c
= (unsigned char)s
[i
];
1933 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1934 "truncated \\uXXXX"))
1940 if (c
>= '0' && c
<= '9')
1942 else if (c
>= 'a' && c
<= 'f')
1950 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1952 return (PyObject
*)v
;
1959 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1966 static const char *hexdigit
= "0123456789abcdef";
1968 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1974 p
= q
= PyString_AS_STRING(repr
);
1975 while (size
-- > 0) {
1976 Py_UNICODE ch
= *s
++;
1977 /* Map 16-bit characters to '\uxxxx' */
1981 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1982 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1983 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1984 *p
++ = hexdigit
[ch
& 15];
1986 /* Copy everything else as-is */
1991 if (_PyString_Resize(&repr
, p
- q
))
2001 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2003 if (!PyUnicode_Check(unicode
)) {
2004 PyErr_BadArgument();
2007 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2008 PyUnicode_GET_SIZE(unicode
));
2011 /* --- Latin-1 Codec ------------------------------------------------------ */
2013 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2020 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2021 if (size
== 1 && *(unsigned char*)s
< 256) {
2022 Py_UNICODE r
= *(unsigned char*)s
;
2023 return PyUnicode_FromUnicode(&r
, 1);
2026 v
= _PyUnicode_New(size
);
2030 return (PyObject
*)v
;
2031 p
= PyUnicode_AS_UNICODE(v
);
2033 *p
++ = (unsigned char)*s
++;
2034 return (PyObject
*)v
;
2042 int latin1_encoding_error(const Py_UNICODE
**source
,
2045 const char *details
)
2047 if ((errors
== NULL
) ||
2048 (strcmp(errors
,"strict") == 0)) {
2049 PyErr_Format(PyExc_UnicodeError
,
2050 "Latin-1 encoding error: %.400s",
2054 else if (strcmp(errors
,"ignore") == 0) {
2057 else if (strcmp(errors
,"replace") == 0) {
2063 PyErr_Format(PyExc_ValueError
,
2064 "Latin-1 encoding error; "
2065 "unknown error handling code: %.400s",
2071 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2078 repr
= PyString_FromStringAndSize(NULL
, size
);
2084 s
= PyString_AS_STRING(repr
);
2086 while (size
-- > 0) {
2087 Py_UNICODE ch
= *p
++;
2089 if (latin1_encoding_error(&p
, &s
, errors
,
2090 "ordinal not in range(256)"))
2096 /* Resize if error handling skipped some characters */
2097 if (s
- start
< PyString_GET_SIZE(repr
))
2098 if (_PyString_Resize(&repr
, s
- start
))
2107 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2109 if (!PyUnicode_Check(unicode
)) {
2110 PyErr_BadArgument();
2113 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2114 PyUnicode_GET_SIZE(unicode
),
2118 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2121 int ascii_decoding_error(const char **source
,
2124 const char *details
)
2126 if ((errors
== NULL
) ||
2127 (strcmp(errors
,"strict") == 0)) {
2128 PyErr_Format(PyExc_UnicodeError
,
2129 "ASCII decoding error: %.400s",
2133 else if (strcmp(errors
,"ignore") == 0) {
2136 else if (strcmp(errors
,"replace") == 0) {
2137 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2142 PyErr_Format(PyExc_ValueError
,
2143 "ASCII decoding error; "
2144 "unknown error handling code: %.400s",
2150 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2157 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2158 if (size
== 1 && *(unsigned char*)s
< 128) {
2159 Py_UNICODE r
= *(unsigned char*)s
;
2160 return PyUnicode_FromUnicode(&r
, 1);
2163 v
= _PyUnicode_New(size
);
2167 return (PyObject
*)v
;
2168 p
= PyUnicode_AS_UNICODE(v
);
2169 while (size
-- > 0) {
2170 register unsigned char c
;
2172 c
= (unsigned char)*s
++;
2175 else if (ascii_decoding_error(&s
, &p
, errors
,
2176 "ordinal not in range(128)"))
2179 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2180 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2182 return (PyObject
*)v
;
2190 int ascii_encoding_error(const Py_UNICODE
**source
,
2193 const char *details
)
2195 if ((errors
== NULL
) ||
2196 (strcmp(errors
,"strict") == 0)) {
2197 PyErr_Format(PyExc_UnicodeError
,
2198 "ASCII encoding error: %.400s",
2202 else if (strcmp(errors
,"ignore") == 0) {
2205 else if (strcmp(errors
,"replace") == 0) {
2211 PyErr_Format(PyExc_ValueError
,
2212 "ASCII encoding error; "
2213 "unknown error handling code: %.400s",
2219 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2226 repr
= PyString_FromStringAndSize(NULL
, size
);
2232 s
= PyString_AS_STRING(repr
);
2234 while (size
-- > 0) {
2235 Py_UNICODE ch
= *p
++;
2237 if (ascii_encoding_error(&p
, &s
, errors
,
2238 "ordinal not in range(128)"))
2244 /* Resize if error handling skipped some characters */
2245 if (s
- start
< PyString_GET_SIZE(repr
))
2246 if (_PyString_Resize(&repr
, s
- start
))
2255 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2257 if (!PyUnicode_Check(unicode
)) {
2258 PyErr_BadArgument();
2261 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2262 PyUnicode_GET_SIZE(unicode
),
2266 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2268 /* --- MBCS codecs for Windows -------------------------------------------- */
2270 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2277 /* First get the size of the result */
2278 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2279 if (size
> 0 && usize
==0)
2280 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2282 v
= _PyUnicode_New(usize
);
2286 return (PyObject
*)v
;
2287 p
= PyUnicode_AS_UNICODE(v
);
2288 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2290 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2293 return (PyObject
*)v
;
2296 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
2304 /* If there are no characters, bail now! */
2306 return PyString_FromString("");
2308 /* First get the size of the result */
2309 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2311 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2313 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2319 /* Do the conversion */
2320 s
= PyString_AS_STRING(repr
);
2321 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2323 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2328 #endif /* MS_WIN32 */
2330 /* --- Character Mapping Codec -------------------------------------------- */
2333 int charmap_decoding_error(const char **source
,
2336 const char *details
)
2338 if ((errors
== NULL
) ||
2339 (strcmp(errors
,"strict") == 0)) {
2340 PyErr_Format(PyExc_UnicodeError
,
2341 "charmap decoding error: %.400s",
2345 else if (strcmp(errors
,"ignore") == 0) {
2348 else if (strcmp(errors
,"replace") == 0) {
2349 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2354 PyErr_Format(PyExc_ValueError
,
2355 "charmap decoding error; "
2356 "unknown error handling code: %.400s",
2362 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2371 /* Default to Latin-1 */
2372 if (mapping
== NULL
)
2373 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2375 v
= _PyUnicode_New(size
);
2379 return (PyObject
*)v
;
2380 p
= PyUnicode_AS_UNICODE(v
);
2381 while (size
-- > 0) {
2382 unsigned char ch
= *s
++;
2385 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2386 w
= PyInt_FromLong((long)ch
);
2389 x
= PyObject_GetItem(mapping
, w
);
2392 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2393 /* No mapping found means: mapping is undefined. */
2402 if (PyInt_Check(x
)) {
2403 long value
= PyInt_AS_LONG(x
);
2404 if (value
< 0 || value
> 65535) {
2405 PyErr_SetString(PyExc_TypeError
,
2406 "character mapping must be in range(65536)");
2410 *p
++ = (Py_UNICODE
)value
;
2412 else if (x
== Py_None
) {
2413 /* undefined mapping */
2414 if (charmap_decoding_error(&s
, &p
, errors
,
2415 "character maps to <undefined>")) {
2420 else if (PyUnicode_Check(x
)) {
2421 int targetsize
= PyUnicode_GET_SIZE(x
);
2423 if (targetsize
== 1)
2425 *p
++ = *PyUnicode_AS_UNICODE(x
);
2427 else if (targetsize
> 1) {
2429 if (targetsize
> extrachars
) {
2431 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2432 int needed
= (targetsize
- extrachars
) + \
2434 extrachars
+= needed
;
2435 if (_PyUnicode_Resize(&v
,
2436 PyUnicode_GET_SIZE(v
) + needed
)) {
2440 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2443 PyUnicode_AS_UNICODE(x
),
2446 extrachars
-= targetsize
;
2448 /* 1-0 mapping: skip the character */
2451 /* wrong return value */
2452 PyErr_SetString(PyExc_TypeError
,
2453 "character mapping must return integer, None or unicode");
2459 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2460 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2462 return (PyObject
*)v
;
2470 int charmap_encoding_error(const Py_UNICODE
**source
,
2473 const char *details
)
2475 if ((errors
== NULL
) ||
2476 (strcmp(errors
,"strict") == 0)) {
2477 PyErr_Format(PyExc_UnicodeError
,
2478 "charmap encoding error: %.400s",
2482 else if (strcmp(errors
,"ignore") == 0) {
2485 else if (strcmp(errors
,"replace") == 0) {
2491 PyErr_Format(PyExc_ValueError
,
2492 "charmap encoding error; "
2493 "unknown error handling code: %.400s",
2499 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2508 /* Default to Latin-1 */
2509 if (mapping
== NULL
)
2510 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2512 v
= PyString_FromStringAndSize(NULL
, size
);
2517 s
= PyString_AS_STRING(v
);
2518 while (size
-- > 0) {
2519 Py_UNICODE ch
= *p
++;
2522 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2523 w
= PyInt_FromLong((long)ch
);
2526 x
= PyObject_GetItem(mapping
, w
);
2529 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2530 /* No mapping found means: mapping is undefined. */
2539 if (PyInt_Check(x
)) {
2540 long value
= PyInt_AS_LONG(x
);
2541 if (value
< 0 || value
> 255) {
2542 PyErr_SetString(PyExc_TypeError
,
2543 "character mapping must be in range(256)");
2549 else if (x
== Py_None
) {
2550 /* undefined mapping */
2551 if (charmap_encoding_error(&p
, &s
, errors
,
2552 "character maps to <undefined>")) {
2557 else if (PyString_Check(x
)) {
2558 int targetsize
= PyString_GET_SIZE(x
);
2560 if (targetsize
== 1)
2562 *s
++ = *PyString_AS_STRING(x
);
2564 else if (targetsize
> 1) {
2566 if (targetsize
> extrachars
) {
2568 int oldpos
= (int)(s
- PyString_AS_STRING(v
));
2569 int needed
= (targetsize
- extrachars
) + \
2571 extrachars
+= needed
;
2572 if (_PyString_Resize(&v
, PyString_GET_SIZE(v
) + needed
)) {
2576 s
= PyString_AS_STRING(v
) + oldpos
;
2578 memcpy(s
, PyString_AS_STRING(x
), targetsize
);
2580 extrachars
-= targetsize
;
2582 /* 1-0 mapping: skip the character */
2585 /* wrong return value */
2586 PyErr_SetString(PyExc_TypeError
,
2587 "character mapping must return integer, None or unicode");
2593 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2594 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2603 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2606 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2607 PyErr_BadArgument();
2610 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2611 PyUnicode_GET_SIZE(unicode
),
2617 int translate_error(const Py_UNICODE
**source
,
2620 const char *details
)
2622 if ((errors
== NULL
) ||
2623 (strcmp(errors
,"strict") == 0)) {
2624 PyErr_Format(PyExc_UnicodeError
,
2625 "translate error: %.400s",
2629 else if (strcmp(errors
,"ignore") == 0) {
2632 else if (strcmp(errors
,"replace") == 0) {
2638 PyErr_Format(PyExc_ValueError
,
2640 "unknown error handling code: %.400s",
2646 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2654 if (mapping
== NULL
) {
2655 PyErr_BadArgument();
2659 /* Output will never be longer than input */
2660 v
= _PyUnicode_New(size
);
2665 p
= PyUnicode_AS_UNICODE(v
);
2666 while (size
-- > 0) {
2667 Py_UNICODE ch
= *s
++;
2671 w
= PyInt_FromLong(ch
);
2674 x
= PyObject_GetItem(mapping
, w
);
2677 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2678 /* No mapping found: default to 1-1 mapping */
2688 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2689 else if (x
== Py_None
) {
2690 /* undefined mapping */
2691 if (translate_error(&s
, &p
, errors
,
2692 "character maps to <undefined>")) {
2697 else if (PyUnicode_Check(x
)) {
2698 if (PyUnicode_GET_SIZE(x
) != 1) {
2700 PyErr_SetString(PyExc_NotImplementedError
,
2701 "1-n mappings are currently not implemented");
2705 *p
++ = *PyUnicode_AS_UNICODE(x
);
2708 /* wrong return value */
2709 PyErr_SetString(PyExc_TypeError
,
2710 "translate mapping must return integer, None or unicode");
2716 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2717 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2721 return (PyObject
*)v
;
2728 PyObject
*PyUnicode_Translate(PyObject
*str
,
2734 str
= PyUnicode_FromObject(str
);
2737 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2738 PyUnicode_GET_SIZE(str
),
2749 /* --- Decimal Encoder ---------------------------------------------------- */
2751 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2756 Py_UNICODE
*p
, *end
;
2758 if (output
== NULL
) {
2759 PyErr_BadArgument();
2766 register Py_UNICODE ch
= *p
++;
2769 if (Py_UNICODE_ISSPACE(ch
)) {
2773 decimal
= Py_UNICODE_TODECIMAL(ch
);
2775 *output
++ = '0' + decimal
;
2778 if (0 < ch
&& ch
< 256) {
2779 *output
++ = (char)ch
;
2782 /* All other characters are considered invalid */
2783 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2784 PyErr_SetString(PyExc_ValueError
,
2785 "invalid decimal Unicode string");
2788 else if (strcmp(errors
, "ignore") == 0)
2790 else if (strcmp(errors
, "replace") == 0) {
2795 /* 0-terminate the output string */
2803 /* --- Helpers ------------------------------------------------------------ */
2806 int count(PyUnicodeObject
*self
,
2809 PyUnicodeObject
*substring
)
2814 start
+= self
->length
;
2817 if (end
> self
->length
)
2820 end
+= self
->length
;
2824 if (substring
->length
== 0)
2825 return (end
- start
+ 1);
2827 end
-= substring
->length
;
2829 while (start
<= end
)
2830 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2832 start
+= substring
->length
;
2839 int PyUnicode_Count(PyObject
*str
,
2846 str
= PyUnicode_FromObject(str
);
2849 substr
= PyUnicode_FromObject(substr
);
2850 if (substr
== NULL
) {
2855 result
= count((PyUnicodeObject
*)str
,
2857 (PyUnicodeObject
*)substr
);
2865 int findstring(PyUnicodeObject
*self
,
2866 PyUnicodeObject
*substring
,
2872 start
+= self
->length
;
2876 if (substring
->length
== 0)
2879 if (end
> self
->length
)
2882 end
+= self
->length
;
2886 end
-= substring
->length
;
2888 if (direction
< 0) {
2889 for (; end
>= start
; end
--)
2890 if (Py_UNICODE_MATCH(self
, end
, substring
))
2893 for (; start
<= end
; start
++)
2894 if (Py_UNICODE_MATCH(self
, start
, substring
))
2901 int PyUnicode_Find(PyObject
*str
,
2909 str
= PyUnicode_FromObject(str
);
2912 substr
= PyUnicode_FromObject(substr
);
2913 if (substr
== NULL
) {
2918 result
= findstring((PyUnicodeObject
*)str
,
2919 (PyUnicodeObject
*)substr
,
2920 start
, end
, direction
);
2927 int tailmatch(PyUnicodeObject
*self
,
2928 PyUnicodeObject
*substring
,
2934 start
+= self
->length
;
2938 if (substring
->length
== 0)
2941 if (end
> self
->length
)
2944 end
+= self
->length
;
2948 end
-= substring
->length
;
2952 if (direction
> 0) {
2953 if (Py_UNICODE_MATCH(self
, end
, substring
))
2956 if (Py_UNICODE_MATCH(self
, start
, substring
))
2963 int PyUnicode_Tailmatch(PyObject
*str
,
2971 str
= PyUnicode_FromObject(str
);
2974 substr
= PyUnicode_FromObject(substr
);
2975 if (substr
== NULL
) {
2980 result
= tailmatch((PyUnicodeObject
*)str
,
2981 (PyUnicodeObject
*)substr
,
2982 start
, end
, direction
);
2989 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2993 /* like wcschr, but doesn't stop at NULL characters */
2995 while (size
-- > 0) {
3004 /* Apply fixfct filter to the Unicode object self and return a
3005 reference to the modified object */
3008 PyObject
*fixup(PyUnicodeObject
*self
,
3009 int (*fixfct
)(PyUnicodeObject
*s
))
3014 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
3018 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
3020 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
3021 /* fixfct should return TRUE if it modified the buffer. If
3022 FALSE, return a reference to the original buffer instead
3023 (to save space, not time) */
3026 return (PyObject
*) self
;
3028 return (PyObject
*) u
;
3032 int fixupper(PyUnicodeObject
*self
)
3034 int len
= self
->length
;
3035 Py_UNICODE
*s
= self
->str
;
3039 register Py_UNICODE ch
;
3041 ch
= Py_UNICODE_TOUPPER(*s
);
3053 int fixlower(PyUnicodeObject
*self
)
3055 int len
= self
->length
;
3056 Py_UNICODE
*s
= self
->str
;
3060 register Py_UNICODE ch
;
3062 ch
= Py_UNICODE_TOLOWER(*s
);
3074 int fixswapcase(PyUnicodeObject
*self
)
3076 int len
= self
->length
;
3077 Py_UNICODE
*s
= self
->str
;
3081 if (Py_UNICODE_ISUPPER(*s
)) {
3082 *s
= Py_UNICODE_TOLOWER(*s
);
3084 } else if (Py_UNICODE_ISLOWER(*s
)) {
3085 *s
= Py_UNICODE_TOUPPER(*s
);
3095 int fixcapitalize(PyUnicodeObject
*self
)
3097 int len
= self
->length
;
3098 Py_UNICODE
*s
= self
->str
;
3103 if (Py_UNICODE_ISLOWER(*s
)) {
3104 *s
= Py_UNICODE_TOUPPER(*s
);
3109 if (Py_UNICODE_ISUPPER(*s
)) {
3110 *s
= Py_UNICODE_TOLOWER(*s
);
3119 int fixtitle(PyUnicodeObject
*self
)
3121 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3122 register Py_UNICODE
*e
;
3123 int previous_is_cased
;
3125 /* Shortcut for single character strings */
3126 if (PyUnicode_GET_SIZE(self
) == 1) {
3127 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
3136 e
= p
+ PyUnicode_GET_SIZE(self
);
3137 previous_is_cased
= 0;
3138 for (; p
< e
; p
++) {
3139 register const Py_UNICODE ch
= *p
;
3141 if (previous_is_cased
)
3142 *p
= Py_UNICODE_TOLOWER(ch
);
3144 *p
= Py_UNICODE_TOTITLE(ch
);
3146 if (Py_UNICODE_ISLOWER(ch
) ||
3147 Py_UNICODE_ISUPPER(ch
) ||
3148 Py_UNICODE_ISTITLE(ch
))
3149 previous_is_cased
= 1;
3151 previous_is_cased
= 0;
3156 PyObject
*PyUnicode_Join(PyObject
*separator
,
3161 PyUnicodeObject
*res
= NULL
;
3168 it
= PyObject_GetIter(seq
);
3172 if (separator
== NULL
) {
3173 Py_UNICODE blank
= ' ';
3178 separator
= PyUnicode_FromObject(separator
);
3179 if (separator
== NULL
)
3181 sep
= PyUnicode_AS_UNICODE(separator
);
3182 seplen
= PyUnicode_GET_SIZE(separator
);
3185 res
= _PyUnicode_New(sz
);
3188 p
= PyUnicode_AS_UNICODE(res
);
3191 for (i
= 0; ; ++i
) {
3193 PyObject
*item
= PyIter_Next(it
);
3195 if (PyErr_Occurred())
3199 if (!PyUnicode_Check(item
)) {
3201 if (!PyString_Check(item
)) {
3202 PyErr_Format(PyExc_TypeError
,
3203 "sequence item %i: expected string or Unicode,"
3205 i
, item
->ob_type
->tp_name
);
3209 v
= PyUnicode_FromObject(item
);
3215 itemlen
= PyUnicode_GET_SIZE(item
);
3216 while (reslen
+ itemlen
+ seplen
>= sz
) {
3217 if (_PyUnicode_Resize(&res
, sz
*2)) {
3222 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
3225 Py_UNICODE_COPY(p
, sep
, seplen
);
3229 Py_UNICODE_COPY(p
, PyUnicode_AS_UNICODE(item
), itemlen
);
3234 if (_PyUnicode_Resize(&res
, reslen
))
3237 Py_XDECREF(separator
);
3239 return (PyObject
*)res
;
3242 Py_XDECREF(separator
);
3249 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
3261 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
3266 u
= _PyUnicode_New(left
+ self
->length
+ right
);
3269 Py_UNICODE_FILL(u
->str
, fill
, left
);
3270 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
3272 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
3278 #define SPLIT_APPEND(data, left, right) \
3279 str = PyUnicode_FromUnicode(data + left, right - left); \
3282 if (PyList_Append(list, str)) { \
3290 PyObject
*split_whitespace(PyUnicodeObject
*self
,
3296 int len
= self
->length
;
3299 for (i
= j
= 0; i
< len
; ) {
3301 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
3304 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
3307 if (maxcount
-- <= 0)
3309 SPLIT_APPEND(self
->str
, j
, i
);
3310 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
3316 SPLIT_APPEND(self
->str
, j
, len
);
3325 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
3335 string
= PyUnicode_FromObject(string
);
3338 data
= PyUnicode_AS_UNICODE(string
);
3339 len
= PyUnicode_GET_SIZE(string
);
3341 list
= PyList_New(0);
3345 for (i
= j
= 0; i
< len
; ) {
3348 /* Find a line and append it */
3349 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
3352 /* Skip the line break reading CRLF as one line break */
3355 if (data
[i
] == '\r' && i
+ 1 < len
&&
3363 SPLIT_APPEND(data
, j
, eol
);
3367 SPLIT_APPEND(data
, j
, len
);
3380 PyObject
*split_char(PyUnicodeObject
*self
,
3387 int len
= self
->length
;
3390 for (i
= j
= 0; i
< len
; ) {
3391 if (self
->str
[i
] == ch
) {
3392 if (maxcount
-- <= 0)
3394 SPLIT_APPEND(self
->str
, j
, i
);
3400 SPLIT_APPEND(self
->str
, j
, len
);
3410 PyObject
*split_substring(PyUnicodeObject
*self
,
3412 PyUnicodeObject
*substring
,
3417 int len
= self
->length
;
3418 int sublen
= substring
->length
;
3421 for (i
= j
= 0; i
<= len
- sublen
; ) {
3422 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
3423 if (maxcount
-- <= 0)
3425 SPLIT_APPEND(self
->str
, j
, i
);
3431 SPLIT_APPEND(self
->str
, j
, len
);
3443 PyObject
*split(PyUnicodeObject
*self
,
3444 PyUnicodeObject
*substring
,
3452 list
= PyList_New(0);
3456 if (substring
== NULL
)
3457 return split_whitespace(self
,list
,maxcount
);
3459 else if (substring
->length
== 1)
3460 return split_char(self
,list
,substring
->str
[0],maxcount
);
3462 else if (substring
->length
== 0) {
3464 PyErr_SetString(PyExc_ValueError
, "empty separator");
3468 return split_substring(self
,list
,substring
,maxcount
);
3472 PyObject
*strip(PyUnicodeObject
*self
,
3476 Py_UNICODE
*p
= self
->str
;
3478 int end
= self
->length
;
3481 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
3485 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
3488 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
3489 /* couldn't strip anything off, return original string */
3491 return (PyObject
*) self
;
3494 return (PyObject
*) PyUnicode_FromUnicode(
3501 PyObject
*replace(PyUnicodeObject
*self
,
3502 PyUnicodeObject
*str1
,
3503 PyUnicodeObject
*str2
,
3511 if (str1
->length
== 1 && str2
->length
== 1) {
3514 /* replace characters */
3515 if (!findchar(self
->str
, self
->length
, str1
->str
[0]) &&
3516 PyUnicode_CheckExact(self
)) {
3517 /* nothing to replace, return original string */
3521 Py_UNICODE u1
= str1
->str
[0];
3522 Py_UNICODE u2
= str2
->str
[0];
3524 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3529 Py_UNICODE_COPY(u
->str
, self
->str
,
3531 for (i
= 0; i
< u
->length
; i
++)
3532 if (u
->str
[i
] == u1
) {
3544 /* replace strings */
3545 n
= count(self
, 0, self
->length
, str1
);
3548 if (n
== 0 && PyUnicode_CheckExact(self
)) {
3549 /* nothing to replace, return original string */
3554 self
->length
+ n
* (str2
->length
- str1
->length
));
3558 while (i
<= self
->length
- str1
->length
)
3559 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3560 /* replace string segment */
3561 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3565 /* copy remaining part */
3566 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3570 *p
++ = self
->str
[i
++];
3575 return (PyObject
*) u
;
3578 /* --- Unicode Object Methods --------------------------------------------- */
3580 static char title__doc__
[] =
3581 "S.title() -> unicode\n\
3583 Return a titlecased version of S, i.e. words start with title case\n\
3584 characters, all remaining cased characters have lower case.";
3587 unicode_title(PyUnicodeObject
*self
)
3589 return fixup(self
, fixtitle
);
3592 static char capitalize__doc__
[] =
3593 "S.capitalize() -> unicode\n\
3595 Return a capitalized version of S, i.e. make the first character\n\
3599 unicode_capitalize(PyUnicodeObject
*self
)
3601 return fixup(self
, fixcapitalize
);
3605 static char capwords__doc__
[] =
3606 "S.capwords() -> unicode\n\
3608 Apply .capitalize() to all words in S and return the result with\n\
3609 normalized whitespace (all whitespace strings are replaced by ' ').";
3612 unicode_capwords(PyUnicodeObject
*self
)
3618 /* Split into words */
3619 list
= split(self
, NULL
, -1);
3623 /* Capitalize each word */
3624 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3625 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3629 Py_DECREF(PyList_GET_ITEM(list
, i
));
3630 PyList_SET_ITEM(list
, i
, item
);
3633 /* Join the words to form a new string */
3634 item
= PyUnicode_Join(NULL
, list
);
3638 return (PyObject
*)item
;
3642 static char center__doc__
[] =
3643 "S.center(width) -> unicode\n\
3645 Return S centered in a Unicode string of length width. Padding is done\n\
3649 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3654 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3657 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
3659 return (PyObject
*) self
;
3662 marg
= width
- self
->length
;
3663 left
= marg
/ 2 + (marg
& width
& 1);
3665 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3670 /* This code should go into some future Unicode collation support
3671 module. The basic comparison should compare ordinals on a naive
3672 basis (this is what Java does and thus JPython too). */
3674 /* speedy UTF-16 code point order comparison */
3676 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3678 static short utf16Fixup
[32] =
3680 0, 0, 0, 0, 0, 0, 0, 0,
3681 0, 0, 0, 0, 0, 0, 0, 0,
3682 0, 0, 0, 0, 0, 0, 0, 0,
3683 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3687 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3691 Py_UNICODE
*s1
= str1
->str
;
3692 Py_UNICODE
*s2
= str2
->str
;
3694 len1
= str1
->length
;
3695 len2
= str2
->length
;
3697 while (len1
> 0 && len2
> 0) {
3703 if (c1
> (1<<11) * 26)
3704 c1
+= utf16Fixup
[c1
>>11];
3705 if (c2
> (1<<11) * 26)
3706 c2
+= utf16Fixup
[c2
>>11];
3707 /* now c1 and c2 are in UTF-32-compatible order */
3710 return (c1
< c2
) ? -1 : 1;
3715 return (len1
< len2
) ? -1 : (len1
!= len2
);
3721 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3723 register int len1
, len2
;
3725 Py_UNICODE
*s1
= str1
->str
;
3726 Py_UNICODE
*s2
= str2
->str
;
3728 len1
= str1
->length
;
3729 len2
= str2
->length
;
3731 while (len1
> 0 && len2
> 0) {
3738 return (c1
< c2
) ? -1 : 1;
3743 return (len1
< len2
) ? -1 : (len1
!= len2
);
3748 int PyUnicode_Compare(PyObject
*left
,
3751 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3754 /* Coerce the two arguments */
3755 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3758 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3762 /* Shortcut for empty or interned objects */
3769 result
= unicode_compare(u
, v
);
3781 int PyUnicode_Contains(PyObject
*container
,
3784 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3786 register const Py_UNICODE
*p
, *e
;
3787 register Py_UNICODE ch
;
3789 /* Coerce the two arguments */
3790 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3792 PyErr_SetString(PyExc_TypeError
,
3793 "'in <string>' requires character as left operand");
3796 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3803 if (PyUnicode_GET_SIZE(v
) != 1) {
3804 PyErr_SetString(PyExc_TypeError
,
3805 "'in <string>' requires character as left operand");
3808 ch
= *PyUnicode_AS_UNICODE(v
);
3809 p
= PyUnicode_AS_UNICODE(u
);
3810 e
= p
+ PyUnicode_GET_SIZE(u
);
3829 /* Concat to string or Unicode object giving a new Unicode object. */
3831 PyObject
*PyUnicode_Concat(PyObject
*left
,
3834 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3836 /* Coerce the two arguments */
3837 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3840 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3845 if (v
== unicode_empty
) {
3847 return (PyObject
*)u
;
3849 if (u
== unicode_empty
) {
3851 return (PyObject
*)v
;
3854 /* Concat the two Unicode strings */
3855 w
= _PyUnicode_New(u
->length
+ v
->length
);
3858 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3859 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3863 return (PyObject
*)w
;
3871 static char count__doc__
[] =
3872 "S.count(sub[, start[, end]]) -> int\n\
3874 Return the number of occurrences of substring sub in Unicode string\n\
3875 S[start:end]. Optional arguments start and end are\n\
3876 interpreted as in slice notation.";
3879 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3881 PyUnicodeObject
*substring
;
3886 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3887 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3890 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3891 (PyObject
*)substring
);
3892 if (substring
== NULL
)
3896 start
+= self
->length
;
3899 if (end
> self
->length
)
3902 end
+= self
->length
;
3906 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3908 Py_DECREF(substring
);
3912 static char encode__doc__
[] =
3913 "S.encode([encoding[,errors]]) -> string\n\
3915 Return an encoded string version of S. Default encoding is the current\n\
3916 default string encoding. errors may be given to set a different error\n\
3917 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3918 a ValueError. Other possible values are 'ignore' and 'replace'.";
3921 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3923 char *encoding
= NULL
;
3924 char *errors
= NULL
;
3925 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3927 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3930 static char expandtabs__doc__
[] =
3931 "S.expandtabs([tabsize]) -> unicode\n\
3933 Return a copy of S where all tab characters are expanded using spaces.\n\
3934 If tabsize is not given, a tab size of 8 characters is assumed.";
3937 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3946 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3949 /* First pass: determine size of output string */
3951 e
= self
->str
+ self
->length
;
3952 for (p
= self
->str
; p
< e
; p
++)
3955 j
+= tabsize
- (j
% tabsize
);
3959 if (*p
== '\n' || *p
== '\r') {
3965 /* Second pass: create output string and fill it */
3966 u
= _PyUnicode_New(i
+ j
);
3973 for (p
= self
->str
; p
< e
; p
++)
3976 i
= tabsize
- (j
% tabsize
);
3985 if (*p
== '\n' || *p
== '\r')
3989 return (PyObject
*) u
;
3992 static char find__doc__
[] =
3993 "S.find(sub [,start [,end]]) -> int\n\
3995 Return the lowest index in S where substring sub is found,\n\
3996 such that sub is contained within s[start,end]. Optional\n\
3997 arguments start and end are interpreted as in slice notation.\n\
3999 Return -1 on failure.";
4002 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
4004 PyUnicodeObject
*substring
;
4009 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
4010 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4012 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4013 (PyObject
*)substring
);
4014 if (substring
== NULL
)
4017 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
4019 Py_DECREF(substring
);
4024 unicode_getitem(PyUnicodeObject
*self
, int index
)
4026 if (index
< 0 || index
>= self
->length
) {
4027 PyErr_SetString(PyExc_IndexError
, "string index out of range");
4031 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
4035 unicode_hash(PyUnicodeObject
*self
)
4037 /* Since Unicode objects compare equal to their ASCII string
4038 counterparts, they should use the individual character values
4039 as basis for their hash value. This is needed to assure that
4040 strings and Unicode objects behave in the same way as
4044 register Py_UNICODE
*p
;
4047 if (self
->hash
!= -1)
4049 len
= PyUnicode_GET_SIZE(self
);
4050 p
= PyUnicode_AS_UNICODE(self
);
4053 x
= (1000003*x
) ^ *p
++;
4054 x
^= PyUnicode_GET_SIZE(self
);
4061 static char index__doc__
[] =
4062 "S.index(sub [,start [,end]]) -> int\n\
4064 Like S.find() but raise ValueError when the substring is not found.";
4067 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
4070 PyUnicodeObject
*substring
;
4074 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
4075 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4078 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4079 (PyObject
*)substring
);
4080 if (substring
== NULL
)
4083 result
= findstring(self
, substring
, start
, end
, 1);
4085 Py_DECREF(substring
);
4087 PyErr_SetString(PyExc_ValueError
, "substring not found");
4090 return PyInt_FromLong(result
);
4093 static char islower__doc__
[] =
4094 "S.islower() -> int\n\
4096 Return 1 if all cased characters in S are lowercase and there is\n\
4097 at least one cased character in S, 0 otherwise.";
4100 unicode_islower(PyUnicodeObject
*self
)
4102 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4103 register const Py_UNICODE
*e
;
4106 /* Shortcut for single character strings */
4107 if (PyUnicode_GET_SIZE(self
) == 1)
4108 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
4110 /* Special case for empty strings */
4111 if (PyString_GET_SIZE(self
) == 0)
4112 return PyInt_FromLong(0);
4114 e
= p
+ PyUnicode_GET_SIZE(self
);
4116 for (; p
< e
; p
++) {
4117 register const Py_UNICODE ch
= *p
;
4119 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
4120 return PyInt_FromLong(0);
4121 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
4124 return PyInt_FromLong(cased
);
4127 static char isupper__doc__
[] =
4128 "S.isupper() -> int\n\
4130 Return 1 if all cased characters in S are uppercase and there is\n\
4131 at least one cased character in S, 0 otherwise.";
4134 unicode_isupper(PyUnicodeObject
*self
)
4136 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4137 register const Py_UNICODE
*e
;
4140 /* Shortcut for single character strings */
4141 if (PyUnicode_GET_SIZE(self
) == 1)
4142 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
4144 /* Special case for empty strings */
4145 if (PyString_GET_SIZE(self
) == 0)
4146 return PyInt_FromLong(0);
4148 e
= p
+ PyUnicode_GET_SIZE(self
);
4150 for (; p
< e
; p
++) {
4151 register const Py_UNICODE ch
= *p
;
4153 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
4154 return PyInt_FromLong(0);
4155 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
4158 return PyInt_FromLong(cased
);
4161 static char istitle__doc__
[] =
4162 "S.istitle() -> int\n\
4164 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4165 may only follow uncased characters and lowercase characters only cased\n\
4166 ones. Return 0 otherwise.";
4169 unicode_istitle(PyUnicodeObject
*self
)
4171 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4172 register const Py_UNICODE
*e
;
4173 int cased
, previous_is_cased
;
4175 /* Shortcut for single character strings */
4176 if (PyUnicode_GET_SIZE(self
) == 1)
4177 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
4178 (Py_UNICODE_ISUPPER(*p
) != 0));
4180 /* Special case for empty strings */
4181 if (PyString_GET_SIZE(self
) == 0)
4182 return PyInt_FromLong(0);
4184 e
= p
+ PyUnicode_GET_SIZE(self
);
4186 previous_is_cased
= 0;
4187 for (; p
< e
; p
++) {
4188 register const Py_UNICODE ch
= *p
;
4190 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
4191 if (previous_is_cased
)
4192 return PyInt_FromLong(0);
4193 previous_is_cased
= 1;
4196 else if (Py_UNICODE_ISLOWER(ch
)) {
4197 if (!previous_is_cased
)
4198 return PyInt_FromLong(0);
4199 previous_is_cased
= 1;
4203 previous_is_cased
= 0;
4205 return PyInt_FromLong(cased
);
4208 static char isspace__doc__
[] =
4209 "S.isspace() -> int\n\
4211 Return 1 if there are only whitespace characters in S,\n\
4215 unicode_isspace(PyUnicodeObject
*self
)
4217 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4218 register const Py_UNICODE
*e
;
4220 /* Shortcut for single character strings */
4221 if (PyUnicode_GET_SIZE(self
) == 1 &&
4222 Py_UNICODE_ISSPACE(*p
))
4223 return PyInt_FromLong(1);
4225 /* Special case for empty strings */
4226 if (PyString_GET_SIZE(self
) == 0)
4227 return PyInt_FromLong(0);
4229 e
= p
+ PyUnicode_GET_SIZE(self
);
4230 for (; p
< e
; p
++) {
4231 if (!Py_UNICODE_ISSPACE(*p
))
4232 return PyInt_FromLong(0);
4234 return PyInt_FromLong(1);
4237 static char isalpha__doc__
[] =
4238 "S.isalpha() -> int\n\
4240 Return 1 if all characters in S are alphabetic\n\
4241 and there is at least one character in S, 0 otherwise.";
4244 unicode_isalpha(PyUnicodeObject
*self
)
4246 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4247 register const Py_UNICODE
*e
;
4249 /* Shortcut for single character strings */
4250 if (PyUnicode_GET_SIZE(self
) == 1 &&
4251 Py_UNICODE_ISALPHA(*p
))
4252 return PyInt_FromLong(1);
4254 /* Special case for empty strings */
4255 if (PyString_GET_SIZE(self
) == 0)
4256 return PyInt_FromLong(0);
4258 e
= p
+ PyUnicode_GET_SIZE(self
);
4259 for (; p
< e
; p
++) {
4260 if (!Py_UNICODE_ISALPHA(*p
))
4261 return PyInt_FromLong(0);
4263 return PyInt_FromLong(1);
4266 static char isalnum__doc__
[] =
4267 "S.isalnum() -> int\n\
4269 Return 1 if all characters in S are alphanumeric\n\
4270 and there is at least one character in S, 0 otherwise.";
4273 unicode_isalnum(PyUnicodeObject
*self
)
4275 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4276 register const Py_UNICODE
*e
;
4278 /* Shortcut for single character strings */
4279 if (PyUnicode_GET_SIZE(self
) == 1 &&
4280 Py_UNICODE_ISALNUM(*p
))
4281 return PyInt_FromLong(1);
4283 /* Special case for empty strings */
4284 if (PyString_GET_SIZE(self
) == 0)
4285 return PyInt_FromLong(0);
4287 e
= p
+ PyUnicode_GET_SIZE(self
);
4288 for (; p
< e
; p
++) {
4289 if (!Py_UNICODE_ISALNUM(*p
))
4290 return PyInt_FromLong(0);
4292 return PyInt_FromLong(1);
4295 static char isdecimal__doc__
[] =
4296 "S.isdecimal() -> int\n\
4298 Return 1 if there are only decimal characters in S,\n\
4302 unicode_isdecimal(PyUnicodeObject
*self
)
4304 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4305 register const Py_UNICODE
*e
;
4307 /* Shortcut for single character strings */
4308 if (PyUnicode_GET_SIZE(self
) == 1 &&
4309 Py_UNICODE_ISDECIMAL(*p
))
4310 return PyInt_FromLong(1);
4312 /* Special case for empty strings */
4313 if (PyString_GET_SIZE(self
) == 0)
4314 return PyInt_FromLong(0);
4316 e
= p
+ PyUnicode_GET_SIZE(self
);
4317 for (; p
< e
; p
++) {
4318 if (!Py_UNICODE_ISDECIMAL(*p
))
4319 return PyInt_FromLong(0);
4321 return PyInt_FromLong(1);
4324 static char isdigit__doc__
[] =
4325 "S.isdigit() -> int\n\
4327 Return 1 if there are only digit characters in S,\n\
4331 unicode_isdigit(PyUnicodeObject
*self
)
4333 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4334 register const Py_UNICODE
*e
;
4336 /* Shortcut for single character strings */
4337 if (PyUnicode_GET_SIZE(self
) == 1 &&
4338 Py_UNICODE_ISDIGIT(*p
))
4339 return PyInt_FromLong(1);
4341 /* Special case for empty strings */
4342 if (PyString_GET_SIZE(self
) == 0)
4343 return PyInt_FromLong(0);
4345 e
= p
+ PyUnicode_GET_SIZE(self
);
4346 for (; p
< e
; p
++) {
4347 if (!Py_UNICODE_ISDIGIT(*p
))
4348 return PyInt_FromLong(0);
4350 return PyInt_FromLong(1);
4353 static char isnumeric__doc__
[] =
4354 "S.isnumeric() -> int\n\
4356 Return 1 if there are only numeric characters in S,\n\
4360 unicode_isnumeric(PyUnicodeObject
*self
)
4362 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4363 register const Py_UNICODE
*e
;
4365 /* Shortcut for single character strings */
4366 if (PyUnicode_GET_SIZE(self
) == 1 &&
4367 Py_UNICODE_ISNUMERIC(*p
))
4368 return PyInt_FromLong(1);
4370 /* Special case for empty strings */
4371 if (PyString_GET_SIZE(self
) == 0)
4372 return PyInt_FromLong(0);
4374 e
= p
+ PyUnicode_GET_SIZE(self
);
4375 for (; p
< e
; p
++) {
4376 if (!Py_UNICODE_ISNUMERIC(*p
))
4377 return PyInt_FromLong(0);
4379 return PyInt_FromLong(1);
4382 static char join__doc__
[] =
4383 "S.join(sequence) -> unicode\n\
4385 Return a string which is the concatenation of the strings in the\n\
4386 sequence. The separator between elements is S.";
4389 unicode_join(PyObject
*self
, PyObject
*data
)
4391 return PyUnicode_Join(self
, data
);
4395 unicode_length(PyUnicodeObject
*self
)
4397 return self
->length
;
4400 static char ljust__doc__
[] =
4401 "S.ljust(width) -> unicode\n\
4403 Return S left justified in a Unicode string of length width. Padding is\n\
4404 done using spaces.";
4407 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
4410 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
4413 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4415 return (PyObject
*) self
;
4418 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
4421 static char lower__doc__
[] =
4422 "S.lower() -> unicode\n\
4424 Return a copy of the string S converted to lowercase.";
4427 unicode_lower(PyUnicodeObject
*self
)
4429 return fixup(self
, fixlower
);
4432 static char lstrip__doc__
[] =
4433 "S.lstrip() -> unicode\n\
4435 Return a copy of the string S with leading whitespace removed.";
4438 unicode_lstrip(PyUnicodeObject
*self
)
4440 return strip(self
, 1, 0);
4444 unicode_repeat(PyUnicodeObject
*str
, int len
)
4454 if (len
== 1 && PyUnicode_CheckExact(str
)) {
4455 /* no repeat, return original string */
4457 return (PyObject
*) str
;
4460 /* ensure # of chars needed doesn't overflow int and # of bytes
4461 * needed doesn't overflow size_t
4463 nchars
= len
* str
->length
;
4464 if (len
&& nchars
/ len
!= str
->length
) {
4465 PyErr_SetString(PyExc_OverflowError
,
4466 "repeated string is too long");
4469 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
4470 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
4471 PyErr_SetString(PyExc_OverflowError
,
4472 "repeated string is too long");
4475 u
= _PyUnicode_New(nchars
);
4482 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4486 return (PyObject
*) u
;
4489 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4499 self
= PyUnicode_FromObject(obj
);
4502 str1
= PyUnicode_FromObject(subobj
);
4507 str2
= PyUnicode_FromObject(replobj
);
4513 result
= replace((PyUnicodeObject
*)self
,
4514 (PyUnicodeObject
*)str1
,
4515 (PyUnicodeObject
*)str2
,
4523 static char replace__doc__
[] =
4524 "S.replace (old, new[, maxsplit]) -> unicode\n\
4526 Return a copy of S with all occurrences of substring\n\
4527 old replaced by new. If the optional argument maxsplit is\n\
4528 given, only the first maxsplit occurrences are replaced.";
4531 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4533 PyUnicodeObject
*str1
;
4534 PyUnicodeObject
*str2
;
4538 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4540 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4543 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4547 result
= replace(self
, str1
, str2
, maxcount
);
4555 PyObject
*unicode_repr(PyObject
*unicode
)
4557 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4558 PyUnicode_GET_SIZE(unicode
),
4562 static char rfind__doc__
[] =
4563 "S.rfind(sub [,start [,end]]) -> int\n\
4565 Return the highest index in S where substring sub is found,\n\
4566 such that sub is contained within s[start,end]. Optional\n\
4567 arguments start and end are interpreted as in slice notation.\n\
4569 Return -1 on failure.";
4572 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4574 PyUnicodeObject
*substring
;
4579 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4580 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4582 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4583 (PyObject
*)substring
);
4584 if (substring
== NULL
)
4587 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4589 Py_DECREF(substring
);
4593 static char rindex__doc__
[] =
4594 "S.rindex(sub [,start [,end]]) -> int\n\
4596 Like S.rfind() but raise ValueError when the substring is not found.";
4599 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4602 PyUnicodeObject
*substring
;
4606 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4607 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4609 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4610 (PyObject
*)substring
);
4611 if (substring
== NULL
)
4614 result
= findstring(self
, substring
, start
, end
, -1);
4616 Py_DECREF(substring
);
4618 PyErr_SetString(PyExc_ValueError
, "substring not found");
4621 return PyInt_FromLong(result
);
4624 static char rjust__doc__
[] =
4625 "S.rjust(width) -> unicode\n\
4627 Return S right justified in a Unicode string of length width. Padding is\n\
4628 done using spaces.";
4631 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4634 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4637 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4639 return (PyObject
*) self
;
4642 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4645 static char rstrip__doc__
[] =
4646 "S.rstrip() -> unicode\n\
4648 Return a copy of the string S with trailing whitespace removed.";
4651 unicode_rstrip(PyUnicodeObject
*self
)
4653 return strip(self
, 0, 1);
4657 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4659 /* standard clamping */
4664 if (end
> self
->length
)
4666 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
4667 /* full slice, return original string */
4669 return (PyObject
*) self
;
4674 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4678 PyObject
*PyUnicode_Split(PyObject
*s
,
4684 s
= PyUnicode_FromObject(s
);
4688 sep
= PyUnicode_FromObject(sep
);
4695 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4702 static char split__doc__
[] =
4703 "S.split([sep [,maxsplit]]) -> list of strings\n\
4705 Return a list of the words in S, using sep as the\n\
4706 delimiter string. If maxsplit is given, at most maxsplit\n\
4707 splits are done. If sep is not specified, any whitespace string\n\
4711 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4713 PyObject
*substring
= Py_None
;
4716 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4719 if (substring
== Py_None
)
4720 return split(self
, NULL
, maxcount
);
4721 else if (PyUnicode_Check(substring
))
4722 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4724 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
4727 static char splitlines__doc__
[] =
4728 "S.splitlines([keepends]]) -> list of strings\n\
4730 Return a list of the lines in S, breaking at line boundaries.\n\
4731 Line breaks are not included in the resulting list unless keepends\n\
4732 is given and true.";
4735 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4739 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4742 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4746 PyObject
*unicode_str(PyUnicodeObject
*self
)
4748 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4751 static char strip__doc__
[] =
4752 "S.strip() -> unicode\n\
4754 Return a copy of S with leading and trailing whitespace removed.";
4757 unicode_strip(PyUnicodeObject
*self
)
4759 return strip(self
, 1, 1);
4762 static char swapcase__doc__
[] =
4763 "S.swapcase() -> unicode\n\
4765 Return a copy of S with uppercase characters converted to lowercase\n\
4769 unicode_swapcase(PyUnicodeObject
*self
)
4771 return fixup(self
, fixswapcase
);
4774 static char translate__doc__
[] =
4775 "S.translate(table) -> unicode\n\
4777 Return a copy of the string S, where all characters have been mapped\n\
4778 through the given translation table, which must be a mapping of\n\
4779 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4780 are left untouched. Characters mapped to None are deleted.";
4783 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
4785 return PyUnicode_TranslateCharmap(self
->str
,
4791 static char upper__doc__
[] =
4792 "S.upper() -> unicode\n\
4794 Return a copy of S converted to uppercase.";
4797 unicode_upper(PyUnicodeObject
*self
)
4799 return fixup(self
, fixupper
);
4803 static char zfill__doc__
[] =
4804 "S.zfill(width) -> unicode\n\
4806 Pad a numeric string x with zeros on the left, to fill a field\n\
4807 of the specified width. The string x is never truncated.";
4810 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4816 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4819 if (self
->length
>= width
) {
4821 return (PyObject
*) self
;
4824 fill
= width
- self
->length
;
4826 u
= pad(self
, fill
, 0, '0');
4828 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4829 /* move sign to beginning of string */
4830 u
->str
[0] = u
->str
[fill
];
4834 return (PyObject
*) u
;
4840 unicode_freelistsize(PyUnicodeObject
*self
)
4842 return PyInt_FromLong(unicode_freelist_size
);
4846 static char startswith__doc__
[] =
4847 "S.startswith(prefix[, start[, end]]) -> int\n\
4849 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4850 optional start, test S beginning at that position. With optional end, stop\n\
4851 comparing S at that position.";
4854 unicode_startswith(PyUnicodeObject
*self
,
4857 PyUnicodeObject
*substring
;
4862 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4863 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4865 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4866 (PyObject
*)substring
);
4867 if (substring
== NULL
)
4870 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4872 Py_DECREF(substring
);
4877 static char endswith__doc__
[] =
4878 "S.endswith(suffix[, start[, end]]) -> int\n\
4880 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4881 optional start, test S beginning at that position. With optional end, stop\n\
4882 comparing S at that position.";
4885 unicode_endswith(PyUnicodeObject
*self
,
4888 PyUnicodeObject
*substring
;
4893 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4894 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4896 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4897 (PyObject
*)substring
);
4898 if (substring
== NULL
)
4901 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4903 Py_DECREF(substring
);
4908 static PyMethodDef unicode_methods
[] = {
4910 /* Order is according to common usage: often used methods should
4911 appear first, since lookup is done sequentially. */
4913 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
4914 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
4915 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
4916 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
4917 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
4918 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
4919 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
4920 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
4921 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
4922 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
4923 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
4924 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
4925 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
4926 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_NOARGS
, lstrip__doc__
},
4927 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4928 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
4929 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
4930 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
4931 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_NOARGS
, rstrip__doc__
},
4932 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
4933 {"strip", (PyCFunction
) unicode_strip
, METH_NOARGS
, strip__doc__
},
4934 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
4935 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
4936 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
4937 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
4938 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
4939 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
4940 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
4941 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
4942 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
4943 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
4944 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
4945 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
4946 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
4947 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
4949 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
4950 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
4954 /* This one is just used for debugging the implementation. */
4955 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
4961 static PySequenceMethods unicode_as_sequence
= {
4962 (inquiry
) unicode_length
, /* sq_length */
4963 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4964 (intargfunc
) unicode_repeat
, /* sq_repeat */
4965 (intargfunc
) unicode_getitem
, /* sq_item */
4966 (intintargfunc
) unicode_slice
, /* sq_slice */
4967 0, /* sq_ass_item */
4968 0, /* sq_ass_slice */
4969 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4973 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4978 PyErr_SetString(PyExc_SystemError
,
4979 "accessing non-existent unicode segment");
4982 *ptr
= (void *) self
->str
;
4983 return PyUnicode_GET_DATA_SIZE(self
);
4987 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4990 PyErr_SetString(PyExc_TypeError
,
4991 "cannot use unicode as modifyable buffer");
4996 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
5000 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
5005 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
5012 PyErr_SetString(PyExc_SystemError
,
5013 "accessing non-existent unicode segment");
5016 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
5019 *ptr
= (void *) PyString_AS_STRING(str
);
5020 return PyString_GET_SIZE(str
);
5023 /* Helpers for PyUnicode_Format() */
5026 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
5028 int argidx
= *p_argidx
;
5029 if (argidx
< arglen
) {
5034 return PyTuple_GetItem(args
, argidx
);
5036 PyErr_SetString(PyExc_TypeError
,
5037 "not enough arguments for format string");
5041 #define F_LJUST (1<<0)
5042 #define F_SIGN (1<<1)
5043 #define F_BLANK (1<<2)
5044 #define F_ALT (1<<3)
5045 #define F_ZERO (1<<4)
5048 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
5054 va_start(va
, format
);
5056 /* First, format the string as char array, then expand to Py_UNICODE
5058 charbuffer
= (char *)buffer
;
5059 len
= vsprintf(charbuffer
, format
, va
);
5060 for (i
= len
- 1; i
>= 0; i
--)
5061 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
5068 formatfloat(Py_UNICODE
*buf
,
5075 /* fmt = '%#.' + `prec` + `type`
5076 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5080 x
= PyFloat_AsDouble(v
);
5081 if (x
== -1.0 && PyErr_Occurred())
5085 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
5087 sprintf(fmt
, "%%%s.%d%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
5088 /* worst case length calc to ensure no buffer overrun:
5090 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5091 for any double rep.)
5092 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5093 If prec=0 the effective precision is 1 (the leading digit is
5094 always given), therefore increase by one to 10+prec. */
5095 if (buflen
<= (size_t)10 + (size_t)prec
) {
5096 PyErr_SetString(PyExc_OverflowError
,
5097 "formatted float is too long (precision too long?)");
5100 return usprintf(buf
, fmt
, x
);
5104 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
5108 PyObject
*str
; /* temporary string object. */
5109 PyUnicodeObject
*result
;
5111 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
5114 result
= _PyUnicode_New(len
);
5115 for (i
= 0; i
< len
; i
++)
5116 result
->str
[i
] = buf
[i
];
5117 result
->str
[len
] = 0;
5119 return (PyObject
*)result
;
5123 formatint(Py_UNICODE
*buf
,
5130 /* fmt = '%#.' + `prec` + 'l' + `type`
5131 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5133 char fmt
[64]; /* plenty big enough! */
5135 int use_native_c_format
= 1;
5137 x
= PyInt_AsLong(v
);
5138 if (x
== -1 && PyErr_Occurred())
5142 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5143 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5144 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
5145 PyErr_SetString(PyExc_OverflowError
,
5146 "formatted integer is too long (precision too long?)");
5149 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5150 * but we want it (for consistency with other %#x conversions, and
5151 * for consistency with Python's hex() function).
5152 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5153 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5154 * So add it only if the platform doesn't already.
5156 if (x
== 0 && (flags
& F_ALT
) && (type
== 'x' || type
== 'X')) {
5157 /* Only way to know what the platform does is to try it. */
5158 sprintf(fmt
, type
== 'x' ? "%#x" : "%#X", 0);
5159 if (fmt
[1] != (char)type
) {
5160 /* Supply our own leading 0x/0X -- needed under std C */
5161 use_native_c_format
= 0;
5162 sprintf(fmt
, "0%c%%#.%dl%c", type
, prec
, type
);
5165 if (use_native_c_format
)
5166 sprintf(fmt
, "%%%s.%dl%c", (flags
& F_ALT
) ? "#" : "", prec
, type
);
5167 return usprintf(buf
, fmt
, x
);
5171 formatchar(Py_UNICODE
*buf
,
5175 /* presume that the buffer is at least 2 characters long */
5176 if (PyUnicode_Check(v
)) {
5177 if (PyUnicode_GET_SIZE(v
) != 1)
5179 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
5182 else if (PyString_Check(v
)) {
5183 if (PyString_GET_SIZE(v
) != 1)
5185 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
5189 /* Integer input truncated to a character */
5191 x
= PyInt_AsLong(v
);
5192 if (x
== -1 && PyErr_Occurred())
5200 PyErr_SetString(PyExc_TypeError
,
5201 "%c requires int or char");
5205 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5207 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5208 chars are formatted. XXX This is a magic number. Each formatting
5209 routine does bounds checking to ensure no overflow, but a better
5210 solution may be to malloc a buffer of appropriate size for each
5211 format. For now, the current solution is sufficient.
5213 #define FORMATBUFLEN (size_t)120
5215 PyObject
*PyUnicode_Format(PyObject
*format
,
5218 Py_UNICODE
*fmt
, *res
;
5219 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
5221 PyUnicodeObject
*result
= NULL
;
5222 PyObject
*dict
= NULL
;
5225 if (format
== NULL
|| args
== NULL
) {
5226 PyErr_BadInternalCall();
5229 uformat
= PyUnicode_FromObject(format
);
5230 if (uformat
== NULL
)
5232 fmt
= PyUnicode_AS_UNICODE(uformat
);
5233 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
5235 reslen
= rescnt
= fmtcnt
+ 100;
5236 result
= _PyUnicode_New(reslen
);
5239 res
= PyUnicode_AS_UNICODE(result
);
5241 if (PyTuple_Check(args
)) {
5242 arglen
= PyTuple_Size(args
);
5249 if (args
->ob_type
->tp_as_mapping
)
5252 while (--fmtcnt
>= 0) {
5255 rescnt
= fmtcnt
+ 100;
5257 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5259 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
5265 /* Got a format specifier */
5269 Py_UNICODE c
= '\0';
5272 PyObject
*temp
= NULL
;
5276 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
5280 Py_UNICODE
*keystart
;
5286 PyErr_SetString(PyExc_TypeError
,
5287 "format requires a mapping");
5293 /* Skip over balanced parentheses */
5294 while (pcount
> 0 && --fmtcnt
>= 0) {
5297 else if (*fmt
== '(')
5301 keylen
= fmt
- keystart
- 1;
5302 if (fmtcnt
< 0 || pcount
> 0) {
5303 PyErr_SetString(PyExc_ValueError
,
5304 "incomplete format key");
5307 /* keys are converted to strings using UTF-8 and
5308 then looked up since Python uses strings to hold
5309 variables names etc. in its namespaces and we
5310 wouldn't want to break common idioms. */
5311 key
= PyUnicode_EncodeUTF8(keystart
,
5320 args
= PyObject_GetItem(dict
, key
);
5329 while (--fmtcnt
>= 0) {
5330 switch (c
= *fmt
++) {
5331 case '-': flags
|= F_LJUST
; continue;
5332 case '+': flags
|= F_SIGN
; continue;
5333 case ' ': flags
|= F_BLANK
; continue;
5334 case '#': flags
|= F_ALT
; continue;
5335 case '0': flags
|= F_ZERO
; continue;
5340 v
= getnextarg(args
, arglen
, &argidx
);
5343 if (!PyInt_Check(v
)) {
5344 PyErr_SetString(PyExc_TypeError
,
5348 width
= PyInt_AsLong(v
);
5356 else if (c
>= '0' && c
<= '9') {
5358 while (--fmtcnt
>= 0) {
5360 if (c
< '0' || c
> '9')
5362 if ((width
*10) / 10 != width
) {
5363 PyErr_SetString(PyExc_ValueError
,
5367 width
= width
*10 + (c
- '0');
5375 v
= getnextarg(args
, arglen
, &argidx
);
5378 if (!PyInt_Check(v
)) {
5379 PyErr_SetString(PyExc_TypeError
,
5383 prec
= PyInt_AsLong(v
);
5389 else if (c
>= '0' && c
<= '9') {
5391 while (--fmtcnt
>= 0) {
5392 c
= Py_CHARMASK(*fmt
++);
5393 if (c
< '0' || c
> '9')
5395 if ((prec
*10) / 10 != prec
) {
5396 PyErr_SetString(PyExc_ValueError
,
5400 prec
= prec
*10 + (c
- '0');
5405 if (c
== 'h' || c
== 'l' || c
== 'L') {
5411 PyErr_SetString(PyExc_ValueError
,
5412 "incomplete format");
5416 v
= getnextarg(args
, arglen
, &argidx
);
5426 /* presume that buffer length is at least 1 */
5433 if (PyUnicode_Check(v
) && c
== 's') {
5440 temp
= PyObject_Str(v
);
5442 temp
= PyObject_Repr(v
);
5445 if (!PyString_Check(temp
)) {
5446 /* XXX Note: this should never happen, since
5447 PyObject_Repr() and PyObject_Str() assure
5450 PyErr_SetString(PyExc_TypeError
,
5451 "%s argument has non-string str()");
5454 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
5455 PyString_GET_SIZE(temp
),
5463 pbuf
= PyUnicode_AS_UNICODE(temp
);
5464 len
= PyUnicode_GET_SIZE(temp
);
5465 if (prec
>= 0 && len
> prec
)
5477 if (PyLong_Check(v
)) {
5478 temp
= formatlong(v
, flags
, prec
, c
);
5481 pbuf
= PyUnicode_AS_UNICODE(temp
);
5482 len
= PyUnicode_GET_SIZE(temp
);
5483 /* unbounded ints can always produce
5484 a sign character! */
5489 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5493 /* only d conversion is signed */
5506 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5517 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5523 PyErr_Format(PyExc_ValueError
,
5524 "unsupported format character '%c' (0x%x) "
5526 (31<=c
&& c
<=126) ? c
: '?',
5527 c
, fmt
-1 - PyUnicode_AS_UNICODE(uformat
));
5531 if (*pbuf
== '-' || *pbuf
== '+') {
5535 else if (flags
& F_SIGN
)
5537 else if (flags
& F_BLANK
)
5544 if (rescnt
< width
+ (sign
!= 0)) {
5546 rescnt
= width
+ fmtcnt
+ 100;
5548 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5550 res
= PyUnicode_AS_UNICODE(result
)
5560 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5561 assert(pbuf
[0] == '0');
5562 assert(pbuf
[1] == c
);
5573 if (width
> len
&& !(flags
& F_LJUST
)) {
5577 } while (--width
> len
);
5582 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5583 assert(pbuf
[0] == '0');
5584 assert(pbuf
[1] == c
);
5589 Py_UNICODE_COPY(res
, pbuf
, len
);
5592 while (--width
>= len
) {
5596 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5597 PyErr_SetString(PyExc_TypeError
,
5598 "not all arguments converted");
5604 if (argidx
< arglen
&& !dict
) {
5605 PyErr_SetString(PyExc_TypeError
,
5606 "not all arguments converted");
5614 if (_PyUnicode_Resize(&result
, reslen
- rescnt
))
5616 return (PyObject
*)result
;
5627 static PyBufferProcs unicode_as_buffer
= {
5628 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5629 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5630 (getsegcountproc
) unicode_buffer_getsegcount
,
5631 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5634 staticforward PyObject
*
5635 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
5638 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5641 static char *kwlist
[] = {"string", "encoding", "errors", 0};
5642 char *encoding
= NULL
;
5643 char *errors
= NULL
;
5645 if (type
!= &PyUnicode_Type
)
5646 return unicode_subtype_new(type
, args
, kwds
);
5647 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
5648 kwlist
, &x
, &encoding
, &errors
))
5651 return (PyObject
*)_PyUnicode_New(0);
5652 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
5656 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5658 PyUnicodeObject
*tmp
, *pnew
;
5661 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
5662 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
5665 assert(PyUnicode_Check(tmp
));
5666 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
5669 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
5670 if (pnew
->str
== NULL
) {
5671 _Py_ForgetReference((PyObject
*)pnew
);
5675 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
5677 pnew
->hash
= tmp
->hash
;
5679 return (PyObject
*)pnew
;
5682 static char unicode_doc
[] =
5683 "unicode(string [, encoding[, errors]]) -> object\n\
5685 Create a new Unicode object from the given encoded string.\n\
5686 encoding defaults to the current default string encoding and \n\
5687 errors, defining the error handling, to 'strict'.";
5689 PyTypeObject PyUnicode_Type
= {
5690 PyObject_HEAD_INIT(&PyType_Type
)
5692 "unicode", /* tp_name */
5693 sizeof(PyUnicodeObject
), /* tp_size */
5694 0, /* tp_itemsize */
5696 (destructor
)_PyUnicode_Free
, /* tp_dealloc */
5700 (cmpfunc
) unicode_compare
, /* tp_compare */
5701 (reprfunc
) unicode_repr
, /* tp_repr */
5702 0, /* tp_as_number */
5703 &unicode_as_sequence
, /* tp_as_sequence */
5704 0, /* tp_as_mapping */
5705 (hashfunc
) unicode_hash
, /* tp_hash*/
5707 (reprfunc
) unicode_str
, /* tp_str */
5708 PyObject_GenericGetAttr
, /* tp_getattro */
5709 0, /* tp_setattro */
5710 &unicode_as_buffer
, /* tp_as_buffer */
5711 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_BASETYPE
, /* tp_flags */
5712 unicode_doc
, /* tp_doc */
5713 0, /* tp_traverse */
5715 0, /* tp_richcompare */
5716 0, /* tp_weaklistoffset */
5718 0, /* tp_iternext */
5719 unicode_methods
, /* tp_methods */
5724 0, /* tp_descr_get */
5725 0, /* tp_descr_set */
5726 0, /* tp_dictoffset */
5729 unicode_new
, /* tp_new */
5732 /* Initialize the Unicode implementation */
5734 void _PyUnicode_Init(void)
5738 /* Init the implementation */
5739 unicode_freelist
= NULL
;
5740 unicode_freelist_size
= 0;
5741 unicode_empty
= _PyUnicode_New(0);
5742 strcpy(unicode_default_encoding
, "ascii");
5743 for (i
= 0; i
< 256; i
++)
5744 unicode_latin1
[i
] = NULL
;
5747 /* Finalize the Unicode implementation */
5750 _PyUnicode_Fini(void)
5755 Py_XDECREF(unicode_empty
);
5756 unicode_empty
= NULL
;
5758 for (i
= 0; i
< 256; i
++) {
5759 if (unicode_latin1
[i
]) {
5760 Py_DECREF(unicode_latin1
[i
]);
5761 unicode_latin1
[i
] = NULL
;
5765 for (u
= unicode_freelist
; u
!= NULL
;) {
5766 PyUnicodeObject
*v
= u
;
5767 u
= *(PyUnicodeObject
**)u
;
5770 Py_XDECREF(v
->defenc
);
5773 unicode_freelist
= NULL
;
5774 unicode_freelist_size
= 0;