3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95 static PyUnicodeObject
*unicode_latin1
[256];
97 /* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
104 static char unicode_default_encoding
[100];
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 unicode
->str
[0] < 256 &&
136 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
137 PyErr_SetString(PyExc_SystemError
,
138 "can't resize shared unicode objects");
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr
= unicode
->str
;
145 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
147 unicode
->str
= oldstr
;
151 unicode
->str
[length
] = 0;
152 unicode
->length
= length
;
155 /* Reset the object caches */
156 if (unicode
->defenc
) {
157 Py_DECREF(unicode
->defenc
);
158 unicode
->defenc
= NULL
;
165 /* We allocate one extra Py_UNICODE element (not merely one byte) to
166 make sure the string is U+0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
174 PyUnicodeObject
*_PyUnicode_New(int length
)
176 register PyUnicodeObject
*unicode
;
178 /* Optimization for empty strings */
179 if (length
== 0 && unicode_empty
!= NULL
) {
180 Py_INCREF(unicode_empty
);
181 return unicode_empty
;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist
) {
186 unicode
= unicode_freelist
;
187 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
188 unicode_freelist_size
--;
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode
->length
< length
) &&
193 unicode_resize(unicode
, length
)) {
194 PyMem_DEL(unicode
->str
);
199 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
201 PyObject_INIT(unicode
, &PyUnicode_Type
);
204 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
207 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
214 unicode
->str
[length
] = 0;
215 unicode
->length
= length
;
217 unicode
->defenc
= NULL
;
221 _Py_ForgetReference((PyObject
*)unicode
);
222 PyObject_DEL(unicode
);
227 void unicode_dealloc(register PyUnicodeObject
*unicode
)
229 if (PyUnicode_CheckExact(unicode
) &&
230 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
231 /* Keep-Alive optimization */
232 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
233 PyMem_DEL(unicode
->str
);
237 if (unicode
->defenc
) {
238 Py_DECREF(unicode
->defenc
);
239 unicode
->defenc
= NULL
;
241 /* Add to free list */
242 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
243 unicode_freelist
= unicode
;
244 unicode_freelist_size
++;
247 PyMem_DEL(unicode
->str
);
248 Py_XDECREF(unicode
->defenc
);
249 unicode
->ob_type
->tp_free((PyObject
*)unicode
);
253 int PyUnicode_Resize(PyObject
**unicode
,
256 register PyUnicodeObject
*v
;
258 /* Argument checks */
259 if (unicode
== NULL
) {
260 PyErr_BadInternalCall();
263 v
= (PyUnicodeObject
*)*unicode
;
264 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
265 PyErr_BadInternalCall();
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v
->length
!= length
&&
273 (v
== unicode_empty
|| v
->length
== 1)) {
274 PyUnicodeObject
*w
= _PyUnicode_New(length
);
277 Py_UNICODE_COPY(w
->str
, v
->str
,
278 length
< v
->length
? length
: v
->length
);
279 *unicode
= (PyObject
*)w
;
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v
, length
);
/* Internal API for use in unicodeobject.c only !

   Convenience front end to PyUnicode_Resize(): the first argument is the
   address of a variable holding the object to resize; it is cast to
   PyObject ** before being handed on, so callers can pass the address of
   a PyUnicodeObject * variable directly. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
292 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
295 PyUnicodeObject
*unicode
;
297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
301 /* Optimization for empty strings */
302 if (size
== 0 && unicode_empty
!= NULL
) {
303 Py_INCREF(unicode_empty
);
304 return (PyObject
*)unicode_empty
;
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size
== 1 && *u
< 256) {
310 unicode
= unicode_latin1
[*u
];
312 unicode
= _PyUnicode_New(1);
315 unicode
->str
[0] = *u
;
316 unicode_latin1
[*u
] = unicode
;
319 return (PyObject
*)unicode
;
323 unicode
= _PyUnicode_New(size
);
327 /* Copy the Unicode data into the new object */
329 Py_UNICODE_COPY(unicode
->str
, u
, size
);
331 return (PyObject
*)unicode
;
336 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
339 PyUnicodeObject
*unicode
;
342 PyErr_BadInternalCall();
346 unicode
= _PyUnicode_New(size
);
350 /* Copy the wchar_t data into the new object */
351 #ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
355 register Py_UNICODE
*u
;
357 u
= PyUnicode_AS_UNICODE(unicode
);
358 for (i
= size
; i
>= 0; i
--)
363 return (PyObject
*)unicode
;
366 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
370 if (unicode
== NULL
) {
371 PyErr_BadInternalCall();
374 if (size
> PyUnicode_GET_SIZE(unicode
))
375 size
= PyUnicode_GET_SIZE(unicode
);
376 #ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
380 register Py_UNICODE
*u
;
382 u
= PyUnicode_AS_UNICODE(unicode
);
383 for (i
= size
; i
>= 0; i
--)
393 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj
)) {
401 if (PyUnicode_Check(obj
)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
405 PyUnicode_GET_SIZE(obj
));
407 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
410 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
411 const char *encoding
,
414 const char *s
= NULL
;
420 PyErr_BadInternalCall();
425 /* For b/w compatibility we also accept Unicode objects provided
426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
434 if (PyUnicode_Check(obj
)) {
436 PyErr_SetString(PyExc_TypeError
,
437 "decoding Unicode is not supported");
440 return PyObject_Unicode(obj
);
443 if (PyUnicode_Check(obj
)) {
444 PyErr_SetString(PyExc_TypeError
,
445 "decoding Unicode is not supported");
451 if (PyString_Check(obj
)) {
452 s
= PyString_AS_STRING(obj
);
453 len
= PyString_GET_SIZE(obj
);
455 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError
))
459 PyErr_Format(PyExc_TypeError
,
460 "coercing to Unicode: need string or buffer, "
462 obj
->ob_type
->tp_name
);
466 /* Convert to Unicode */
468 Py_INCREF(unicode_empty
);
469 v
= (PyObject
*)unicode_empty
;
472 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
486 PyObject
*PyUnicode_Decode(const char *s
,
488 const char *encoding
,
491 PyObject
*buffer
= NULL
, *unicode
;
493 if (encoding
== NULL
)
494 encoding
= PyUnicode_GetDefaultEncoding();
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding
, "utf-8") == 0)
498 return PyUnicode_DecodeUTF8(s
, size
, errors
);
499 else if (strcmp(encoding
, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s
, size
, errors
);
501 else if (strcmp(encoding
, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s
, size
, errors
);
504 /* Decode via the codec registry */
505 buffer
= PyBuffer_FromMemory((void *)s
, size
);
508 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
511 if (!PyUnicode_Check(unicode
)) {
512 PyErr_Format(PyExc_TypeError
,
513 "decoder did not return an unicode object (type=%.400s)",
514 unicode
->ob_type
->tp_name
);
526 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
528 const char *encoding
,
531 PyObject
*v
, *unicode
;
533 unicode
= PyUnicode_FromUnicode(s
, size
);
536 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
541 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
542 const char *encoding
,
547 if (!PyUnicode_Check(unicode
)) {
552 if (encoding
== NULL
)
553 encoding
= PyUnicode_GetDefaultEncoding();
555 /* Shortcuts for common default encodings */
556 if (errors
== NULL
) {
557 if (strcmp(encoding
, "utf-8") == 0)
558 return PyUnicode_AsUTF8String(unicode
);
559 else if (strcmp(encoding
, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode
);
561 else if (strcmp(encoding
, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode
);
565 /* Encode via the codec registry */
566 v
= PyCodec_Encode(unicode
, encoding
, errors
);
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v
)) {
571 PyErr_Format(PyExc_TypeError
,
572 "encoder did not return a string object (type=%.400s)",
573 v
->ob_type
->tp_name
);
583 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
586 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
590 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
591 if (v
&& errors
== NULL
)
592 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
596 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
598 if (!PyUnicode_Check(unicode
)) {
602 return PyUnicode_AS_UNICODE(unicode
);
608 int PyUnicode_GetSize(PyObject
*unicode
)
610 if (!PyUnicode_Check(unicode
)) {
614 return PyUnicode_GET_SIZE(unicode
);
620 const char *PyUnicode_GetDefaultEncoding(void)
622 return unicode_default_encoding
;
625 int PyUnicode_SetDefaultEncoding(const char *encoding
)
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v
= _PyCodec_Lookup(encoding
);
635 strncpy(unicode_default_encoding
,
637 sizeof(unicode_default_encoding
));
644 /* --- UTF-7 Codec -------------------------------------------------------- */
646 /* see RFC2152 for details */
/* Classification of the 7-bit range for the UTF-7 codec: indicates whether
   a character is special, i.e. cannot be directly encoded:

       0 - not special (never needs a shift sequence)
       1 - special (always needs a shift sequence)
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is indexed by the character value and only covers 0..127;
   SPECIAL() below guards the index. */
char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};

/* SPECIAL(c, encodeO, encodeWS) is non-zero when character c must go
   through a shift sequence:

     - c above 127 or of class 1: always;
     - c of class 2 (whitespace): only if encodeWS is true;
     - c of class 3 (Set O):      only if encodeO is true.

   The flag arguments are parenthesized in the expansion so that compound
   expressions (e.g. a conditional expression) keep their meaning next to
   the && operator.  c is evaluated more than once; pass an expression
   without side effects. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
/* Base64 helpers for the UTF-7 codec. */

/* B64(n): the base64 alphabet character for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true iff c is one of the 64 characters of the base64
   alphabet.

   The test is spelled out with ASCII ranges rather than isalnum():
   isalnum() depends on the current locale and is undefined for arguments
   that are neither EOF nor representable as unsigned char, and this macro
   is applied to Py_UNICODE values.  c is evaluated more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): inverse of B64(); maps a base64 alphabet character back to its
   six-bit value.  Only meaningful when B64CHAR(c) is true.  c is evaluated
   more than once. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
677 #define ENCODE(out, ch, bits) \
678 while (bits >= 6) { \
679 *out++ = B64(ch >> (bits-6)); \
683 #define DECODE(out, ch, bits, surrogate) \
684 while (bits >= 16) { \
685 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
688 /* We have already generated an error for the high surrogate
689 so let's not bother seeing if the low surrogate is correct or not */\
691 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
692 /* This is a surrogate pair. Unfortunately we can't represent \
693 it in a 16-bit character */ \
695 errmsg = "code pairs are not supported"; \
703 int utf7_decoding_error(Py_UNICODE
**dest
,
707 if ((errors
== NULL
) ||
708 (strcmp(errors
,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError
,
710 "UTF-7 decoding error: %.400s",
714 else if (strcmp(errors
,"ignore") == 0) {
717 else if (strcmp(errors
,"replace") == 0) {
719 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
725 PyErr_Format(PyExc_ValueError
,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
732 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
737 PyUnicodeObject
*unicode
;
739 const char *errmsg
= "";
741 unsigned int bitsleft
= 0;
742 unsigned long charsleft
= 0;
745 unicode
= _PyUnicode_New(size
);
749 return (PyObject
*)unicode
;
758 if ((ch
== '-') || !B64CHAR(ch
)) {
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
768 errmsg
= "partial character in shift sequence";
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
777 errmsg
= "non-zero padding bits in shift sequence";
782 if ((s
< e
) && (*(s
) == '-')) {
786 } else if (SPECIAL(ch
,0,0)) {
787 errmsg
= "unexpected special character";
793 charsleft
= (charsleft
<< 6) | UB64(ch
);
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
799 else if ( ch
== '+' ) {
801 if (s
< e
&& *s
== '-') {
810 else if (SPECIAL(ch
,0,0)) {
811 errmsg
= "unexpected special character";
821 if (utf7_decoding_error(&p
, errors
, errmsg
))
826 if (utf7_decoding_error(&p
, errors
, "unterminated shift sequence"))
830 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
833 return (PyObject
*)unicode
;
841 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
844 int encodeWhiteSpace
,
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated
= 5 * size
;
852 unsigned int bitsleft
= 0;
853 unsigned long charsleft
= 0;
858 return PyString_FromStringAndSize(NULL
, 0);
860 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
864 start
= out
= PyString_AS_STRING(v
);
865 for (;i
< size
; ++i
) {
866 Py_UNICODE ch
= s
[i
];
872 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
876 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
877 inShift
= bitsleft
> 0;
882 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
883 *out
++ = B64(charsleft
<< (6-bitsleft
));
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch
) || ch
== '-') {
895 charsleft
= (charsleft
<< 16) | ch
;
896 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
905 Py_UNICODE ch2
= s
[i
+1];
907 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
909 } else if (B64CHAR(ch2
) || ch2
== '-') {
926 *out
++= B64(charsleft
<< (6-bitsleft
) );
930 if (_PyString_Resize(&v
, out
- start
)) {
944 /* --- UTF-8 Codec -------------------------------------------------------- */
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 2279 for details.

   The table is indexed by the first byte of a sequence (0..255). */
char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six bytes;
       0xFE and 0xFF are not legal prefixes */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
969 int utf8_decoding_error(const char **source
,
974 if ((errors
== NULL
) ||
975 (strcmp(errors
,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError
,
977 "UTF-8 decoding error: %.400s",
981 else if (strcmp(errors
,"ignore") == 0) {
985 else if (strcmp(errors
,"replace") == 0) {
987 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
992 PyErr_Format(PyExc_ValueError
,
993 "UTF-8 decoding error; unknown error handling code: %.400s",
999 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1005 PyUnicodeObject
*unicode
;
1007 const char *errmsg
= "";
1009 /* Note: size will always be longer than the resulting Unicode
1011 unicode
= _PyUnicode_New(size
);
1015 return (PyObject
*)unicode
;
1017 /* Unpack UTF-8 encoded data */
1022 Py_UCS4 ch
= (unsigned char)*s
;
1025 *p
++ = (Py_UNICODE
)ch
;
1030 n
= utf8_code_length
[ch
];
1033 errmsg
= "unexpected end of data";
1040 errmsg
= "unexpected code byte";
1044 errmsg
= "internal error";
1048 if ((s
[1] & 0xc0) != 0x80) {
1049 errmsg
= "invalid data";
1052 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1054 errmsg
= "illegal encoding";
1058 *p
++ = (Py_UNICODE
)ch
;
1062 if ((s
[1] & 0xc0) != 0x80 ||
1063 (s
[2] & 0xc0) != 0x80) {
1064 errmsg
= "invalid data";
1067 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1068 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
1069 errmsg
= "illegal encoding";
1073 *p
++ = (Py_UNICODE
)ch
;
1077 if ((s
[1] & 0xc0) != 0x80 ||
1078 (s
[2] & 0xc0) != 0x80 ||
1079 (s
[3] & 0xc0) != 0x80) {
1080 errmsg
= "invalid data";
1083 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1084 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1085 /* validate and convert to UTF-16 */
1086 if ((ch
< 0x10000) /* minimum value allowed for 4
1088 || (ch
> 0x10ffff)) /* maximum value allowed for
1091 errmsg
= "illegal encoding";
1094 #ifdef Py_UNICODE_WIDE
1095 *p
++ = (Py_UNICODE
)ch
;
1097 /* compute and append the two surrogates: */
1099 /* translate from 10000..10FFFF to 0..FFFF */
1102 /* high surrogate = top 10 bits added to D800 */
1103 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1105 /* low surrogate = bottom 10 bits added to DC00 */
1106 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1111 /* Other sizes are only needed for UCS-4 */
1112 errmsg
= "unsupported Unicode code range";
1119 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
1124 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1127 return (PyObject
*)unicode
;
1134 /* Not used anymore, now that the encoder supports UTF-16
1138 int utf8_encoding_error(const Py_UNICODE
**source
,
1141 const char *details
)
1143 if ((errors
== NULL
) ||
1144 (strcmp(errors
,"strict") == 0)) {
1145 PyErr_Format(PyExc_UnicodeError
,
1146 "UTF-8 encoding error: %.400s",
1150 else if (strcmp(errors
,"ignore") == 0) {
1153 else if (strcmp(errors
,"replace") == 0) {
1159 PyErr_Format(PyExc_ValueError
,
1160 "UTF-8 encoding error; "
1161 "unknown error handling code: %.400s",
1168 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1176 unsigned int cbAllocated
= 3 * size
;
1177 unsigned int cbWritten
= 0;
1180 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1186 p
= q
= PyString_AS_STRING(v
);
1188 Py_UCS4 ch
= s
[i
++];
1193 else if (ch
< 0x0800) {
1194 *p
++ = 0xc0 | (ch
>> 6);
1195 *p
++ = 0x80 | (ch
& 0x3f);
1198 else if (ch
< 0x10000) {
1199 /* Check for high surrogate */
1200 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1203 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1205 if (cbWritten
>= (cbAllocated
- 4)) {
1206 /* Provide enough room for some more
1208 cbAllocated
+= 4*10;
1209 if (_PyString_Resize(&v
, cbAllocated
))
1213 /* combine the two values */
1214 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
1216 *p
++ = (char)((ch
>> 18) | 0xf0);
1217 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1224 *p
++ = (char)(0xe0 | (ch
>> 12));
1227 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1228 *p
++ = (char)(0x80 | (ch
& 0x3f));
1230 *p
++ = 0xf0 | (ch
>>18);
1231 *p
++ = 0x80 | ((ch
>>12) & 0x3f);
1232 *p
++ = 0x80 | ((ch
>>6) & 0x3f);
1233 *p
++ = 0x80 | (ch
& 0x3f);
1238 if (_PyString_Resize(&v
, p
- q
))
1247 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1249 if (!PyUnicode_Check(unicode
)) {
1250 PyErr_BadArgument();
1253 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1254 PyUnicode_GET_SIZE(unicode
),
1258 /* --- UTF-16 Codec ------------------------------------------------------- */
1261 int utf16_decoding_error(Py_UNICODE
**dest
,
1263 const char *details
)
1265 if ((errors
== NULL
) ||
1266 (strcmp(errors
,"strict") == 0)) {
1267 PyErr_Format(PyExc_UnicodeError
,
1268 "UTF-16 decoding error: %.400s",
1272 else if (strcmp(errors
,"ignore") == 0) {
1275 else if (strcmp(errors
,"replace") == 0) {
1277 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1283 PyErr_Format(PyExc_ValueError
,
1284 "UTF-16 decoding error; "
1285 "unknown error handling code: %.400s",
1292 PyUnicode_DecodeUTF16(const char *s
,
1297 PyUnicodeObject
*unicode
;
1299 const unsigned char *q
, *e
;
1300 int bo
= 0; /* assume native ordering by default */
1301 const char *errmsg
= "";
1302 /* Offsets from q for retrieving byte pairs in the right order. */
1303 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1304 int ihi
= 1, ilo
= 0;
1306 int ihi
= 0, ilo
= 1;
1309 /* size should be an even number */
1311 if (utf16_decoding_error(NULL
, errors
, "truncated data"))
1313 --size
; /* else ignore the oddball byte */
1316 /* Note: size will always be longer than the resulting Unicode
1318 unicode
= _PyUnicode_New(size
);
1322 return (PyObject
*)unicode
;
1324 /* Unpack UTF-16 encoded data */
1326 q
= (unsigned char *)s
;
1332 /* Check for BOM marks (U+FEFF) in the input and adjust current
1333 byte order setting accordingly. In native mode, the leading BOM
1334 mark is skipped, in all other modes, it is copied to the output
1335 stream as-is (giving a ZWNBSP character). */
1337 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1338 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1339 if (bom
== 0xFEFF) {
1343 else if (bom
== 0xFFFE) {
1348 if (bom
== 0xFEFF) {
1352 else if (bom
== 0xFFFE) {
1371 Py_UNICODE ch
= (q
[ihi
] << 8) | q
[ilo
];
1374 if (ch
< 0xD800 || ch
> 0xDFFF) {
1379 /* UTF-16 code pair: */
1381 errmsg
= "unexpected end of data";
1384 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1385 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1387 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1388 #ifndef Py_UNICODE_WIDE
1392 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1397 errmsg
= "illegal UTF-16 surrogate";
1402 errmsg
= "illegal encoding";
1403 /* Fall through to report the error */
1406 if (utf16_decoding_error(&p
, errors
, errmsg
))
1414 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1417 return (PyObject
*)unicode
;
1425 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1433 /* Offsets from p for storing byte pairs in the right order. */
1434 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1435 int ihi
= 1, ilo
= 0;
1437 int ihi
= 0, ilo
= 1;
1440 #define STORECHAR(CH) \
1442 p[ihi] = ((CH) >> 8) & 0xff; \
1443 p[ilo] = (CH) & 0xff; \
1447 for (i
= pairs
= 0; i
< size
; i
++)
1448 if (s
[i
] >= 0x10000)
1450 v
= PyString_FromStringAndSize(NULL
,
1451 2 * (size
+ pairs
+ (byteorder
== 0)));
1455 p
= (unsigned char *)PyString_AS_STRING(v
);
1461 if (byteorder
== -1) {
1466 else if (byteorder
== 1) {
1472 while (size
-- > 0) {
1473 Py_UNICODE ch
= *s
++;
1475 if (ch
>= 0x10000) {
1476 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1477 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1487 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1489 if (!PyUnicode_Check(unicode
)) {
1490 PyErr_BadArgument();
1493 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1494 PyUnicode_GET_SIZE(unicode
),
1499 /* --- Unicode Escape Codec ----------------------------------------------- */
1502 int unicodeescape_decoding_error(const char **source
,
1505 const char *details
)
1507 if ((errors
== NULL
) ||
1508 (strcmp(errors
,"strict") == 0)) {
1509 PyErr_Format(PyExc_UnicodeError
,
1510 "Unicode-Escape decoding error: %.400s",
1514 else if (strcmp(errors
,"ignore") == 0) {
1517 else if (strcmp(errors
,"replace") == 0) {
1518 *x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1522 PyErr_Format(PyExc_ValueError
,
1523 "Unicode-Escape decoding error; "
1524 "unknown error handling code: %.400s",
1530 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1532 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1537 Py_UNICODE
*p
, *buf
;
1540 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1542 /* Escaped strings will always be longer than the resulting
1543 Unicode string, so we start with size here and then reduce the
1544 length after conversion to the true value. */
1545 v
= _PyUnicode_New(size
);
1549 return (PyObject
*)v
;
1551 p
= buf
= PyUnicode_AS_UNICODE(v
);
1559 /* Non-escape characters are interpreted as Unicode ordinals */
1561 *p
++ = (unsigned char) *s
++;
1571 case '\\': *p
++ = '\\'; break;
1572 case '\'': *p
++ = '\''; break;
1573 case '\"': *p
++ = '\"'; break;
1574 case 'b': *p
++ = '\b'; break;
1575 case 'f': *p
++ = '\014'; break; /* FF */
1576 case 't': *p
++ = '\t'; break;
1577 case 'n': *p
++ = '\n'; break;
1578 case 'r': *p
++ = '\r'; break;
1579 case 'v': *p
++ = '\013'; break; /* VT */
1580 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1582 /* \OOO (octal) escapes */
1583 case '0': case '1': case '2': case '3':
1584 case '4': case '5': case '6': case '7':
1586 if ('0' <= *s
&& *s
<= '7') {
1587 x
= (x
<<3) + *s
++ - '0';
1588 if ('0' <= *s
&& *s
<= '7')
1589 x
= (x
<<3) + *s
++ - '0';
1598 message
= "truncated \\xXX escape";
1604 message
= "truncated \\uXXXX escape";
1610 message
= "truncated \\UXXXXXXXX escape";
1613 for (i
= 0; i
< digits
; i
++) {
1614 c
= (unsigned char) s
[i
];
1616 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1622 chr
= (chr
<<4) & ~0xF;
1623 if (c
>= '0' && c
<= '9')
1625 else if (c
>= 'a' && c
<= 'f')
1626 chr
+= 10 + c
- 'a';
1628 chr
+= 10 + c
- 'A';
1632 /* when we get here, chr is a 32-bit unicode character */
1634 /* UCS-2 character */
1635 *p
++ = (Py_UNICODE
) chr
;
1636 else if (chr
<= 0x10ffff) {
1637 /* UCS-4 character. Either store directly, or as
1639 #ifdef Py_UNICODE_WIDE
1643 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1644 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1647 if (unicodeescape_decoding_error(
1649 "illegal Unicode character")
1652 *p
++ = x
; /* store replacement character */
1658 message
= "malformed \\N character escape";
1659 if (ucnhash_CAPI
== NULL
) {
1660 /* load the unicode data module */
1662 m
= PyImport_ImportModule("unicodedata");
1665 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1669 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1671 if (ucnhash_CAPI
== NULL
)
1675 const char *start
= s
+1;
1676 /* look for the closing brace */
1677 while (*s
!= '}' && s
< end
)
1679 if (s
> start
&& s
< end
&& *s
== '}') {
1680 /* found a name. look it up in the unicode database */
1681 message
= "unknown Unicode character name";
1683 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1687 if (unicodeescape_decoding_error(&s
, &x
, errors
, message
))
1694 *p
++ = (unsigned char)s
[-1];
1698 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1700 return (PyObject
*)v
;
1705 "\\N escapes not supported (can't load unicodedata module)"
1714 /* Return a Unicode-Escape string version of the Unicode object.
1716 If quotes is true, the string is enclosed in u"" or u'' quotes as
1721 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1726 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1733 static const char *hexdigit
= "0123456789abcdef";
1735 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1739 p
= PyString_AS_STRING(repr
);
1743 *p
++ = (findchar(s
, size
, '\'') &&
1744 !findchar(s
, size
, '"')) ? '"' : '\'';
1746 while (size
-- > 0) {
1747 Py_UNICODE ch
= *s
++;
1751 (ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1] || ch
== '\\')) {
1757 #ifdef Py_UNICODE_WIDE
1758 /* Map 21-bit characters to '\U00xxxxxx' */
1759 else if (ch
>= 0x10000) {
1760 int offset
= p
- PyString_AS_STRING(repr
);
1762 /* Resize the string if necessary */
1763 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
1764 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
1766 p
= PyString_AS_STRING(repr
) + offset
;
1771 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
1772 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
1773 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
1774 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
1775 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
1776 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
1777 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
1778 *p
++ = hexdigit
[ch
& 0x0000000F];
1782 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1783 else if (ch
>= 0xD800 && ch
< 0xDC00) {
1789 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
1790 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
1793 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
1794 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
1795 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
1796 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
1797 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
1798 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
1799 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
1800 *p
++ = hexdigit
[ucs
& 0x0000000F];
1803 /* Fall through: isolated surrogates are copied as-is */
1808 /* Map 16-bit characters to '\uxxxx' */
1812 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
1813 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
1814 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1815 *p
++ = hexdigit
[ch
& 0x000F];
1818 /* Map special whitespace to '\t', '\n', '\r' */
1819 else if (ch
== '\t') {
1823 else if (ch
== '\n') {
1827 else if (ch
== '\r') {
1832 /* Map non-printable US ASCII to '\xhh' */
1833 else if (ch
< ' ' || ch
>= 0x7F) {
1836 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1837 *p
++ = hexdigit
[ch
& 0x000F];
1840 /* Copy everything else as-is */
1845 *p
++ = PyString_AS_STRING(repr
)[1];
1848 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
1858 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1861 return unicodeescape_string(s
, size
, 0);
1864 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1866 if (!PyUnicode_Check(unicode
)) {
1867 PyErr_BadArgument();
1870 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1871 PyUnicode_GET_SIZE(unicode
));
1874 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1876 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1881 Py_UNICODE
*p
, *buf
;
1885 /* An escaped string is never shorter than the resulting
1886 Unicode string, so we start with size here and then reduce the
1887 length after conversion to the true value. */
1888 v
= _PyUnicode_New(size
);
1892 return (PyObject
*)v
;
1893 p
= buf
= PyUnicode_AS_UNICODE(v
);
1900 /* Non-escape characters are interpreted as Unicode ordinals */
1902 *p
++ = (unsigned char)*s
++;
1906 /* \u-escapes are only interpreted iff the number of leading
1907 backslashes is odd */
1912 *p
++ = (unsigned char)*s
++;
1914 if (((s
- bs
) & 1) == 0 ||
1922 /* \uXXXX with 4 hex digits */
1923 for (x
= 0, i
= 0; i
< 4; i
++) {
1924 c
= (unsigned char)s
[i
];
1926 if (unicodeescape_decoding_error(&s
, &x
, errors
,
1927 "truncated \\uXXXX"))
1933 if (c
>= '0' && c
<= '9')
1935 else if (c
>= 'a' && c
<= 'f')
1943 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1945 return (PyObject
*)v
;
1952 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1959 static const char *hexdigit
= "0123456789abcdef";
1961 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1967 p
= q
= PyString_AS_STRING(repr
);
1968 while (size
-- > 0) {
1969 Py_UNICODE ch
= *s
++;
1970 /* Map 16-bit characters to '\uxxxx' */
1974 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1975 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1976 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1977 *p
++ = hexdigit
[ch
& 15];
1979 /* Copy everything else as-is */
1984 if (_PyString_Resize(&repr
, p
- q
))
1994 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
1996 if (!PyUnicode_Check(unicode
)) {
1997 PyErr_BadArgument();
2000 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2001 PyUnicode_GET_SIZE(unicode
));
2004 /* --- Latin-1 Codec ------------------------------------------------------ */
2006 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2013 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2014 if (size
== 1 && *(unsigned char*)s
< 256) {
2015 Py_UNICODE r
= *(unsigned char*)s
;
2016 return PyUnicode_FromUnicode(&r
, 1);
2019 v
= _PyUnicode_New(size
);
2023 return (PyObject
*)v
;
2024 p
= PyUnicode_AS_UNICODE(v
);
2026 *p
++ = (unsigned char)*s
++;
2027 return (PyObject
*)v
;
2035 int latin1_encoding_error(const Py_UNICODE
**source
,
2038 const char *details
)
2040 if ((errors
== NULL
) ||
2041 (strcmp(errors
,"strict") == 0)) {
2042 PyErr_Format(PyExc_UnicodeError
,
2043 "Latin-1 encoding error: %.400s",
2047 else if (strcmp(errors
,"ignore") == 0) {
2050 else if (strcmp(errors
,"replace") == 0) {
2056 PyErr_Format(PyExc_ValueError
,
2057 "Latin-1 encoding error; "
2058 "unknown error handling code: %.400s",
2064 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2071 repr
= PyString_FromStringAndSize(NULL
, size
);
2077 s
= PyString_AS_STRING(repr
);
2079 while (size
-- > 0) {
2080 Py_UNICODE ch
= *p
++;
2082 if (latin1_encoding_error(&p
, &s
, errors
,
2083 "ordinal not in range(256)"))
2089 /* Resize if error handling skipped some characters */
2090 if (s
- start
< PyString_GET_SIZE(repr
))
2091 if (_PyString_Resize(&repr
, s
- start
))
2100 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2102 if (!PyUnicode_Check(unicode
)) {
2103 PyErr_BadArgument();
2106 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2107 PyUnicode_GET_SIZE(unicode
),
2111 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2114 int ascii_decoding_error(const char **source
,
2117 const char *details
)
2119 if ((errors
== NULL
) ||
2120 (strcmp(errors
,"strict") == 0)) {
2121 PyErr_Format(PyExc_UnicodeError
,
2122 "ASCII decoding error: %.400s",
2126 else if (strcmp(errors
,"ignore") == 0) {
2129 else if (strcmp(errors
,"replace") == 0) {
2130 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2135 PyErr_Format(PyExc_ValueError
,
2136 "ASCII decoding error; "
2137 "unknown error handling code: %.400s",
2143 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2150 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2151 if (size
== 1 && *(unsigned char*)s
< 128) {
2152 Py_UNICODE r
= *(unsigned char*)s
;
2153 return PyUnicode_FromUnicode(&r
, 1);
2156 v
= _PyUnicode_New(size
);
2160 return (PyObject
*)v
;
2161 p
= PyUnicode_AS_UNICODE(v
);
2162 while (size
-- > 0) {
2163 register unsigned char c
;
2165 c
= (unsigned char)*s
++;
2168 else if (ascii_decoding_error(&s
, &p
, errors
,
2169 "ordinal not in range(128)"))
2172 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2173 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2175 return (PyObject
*)v
;
2183 int ascii_encoding_error(const Py_UNICODE
**source
,
2186 const char *details
)
2188 if ((errors
== NULL
) ||
2189 (strcmp(errors
,"strict") == 0)) {
2190 PyErr_Format(PyExc_UnicodeError
,
2191 "ASCII encoding error: %.400s",
2195 else if (strcmp(errors
,"ignore") == 0) {
2198 else if (strcmp(errors
,"replace") == 0) {
2204 PyErr_Format(PyExc_ValueError
,
2205 "ASCII encoding error; "
2206 "unknown error handling code: %.400s",
2212 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2219 repr
= PyString_FromStringAndSize(NULL
, size
);
2225 s
= PyString_AS_STRING(repr
);
2227 while (size
-- > 0) {
2228 Py_UNICODE ch
= *p
++;
2230 if (ascii_encoding_error(&p
, &s
, errors
,
2231 "ordinal not in range(128)"))
2237 /* Resize if error handling skipped some characters */
2238 if (s
- start
< PyString_GET_SIZE(repr
))
2239 if (_PyString_Resize(&repr
, s
- start
))
2248 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2250 if (!PyUnicode_Check(unicode
)) {
2251 PyErr_BadArgument();
2254 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2255 PyUnicode_GET_SIZE(unicode
),
2259 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2261 /* --- MBCS codecs for Windows -------------------------------------------- */
2263 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2270 /* First get the size of the result */
2271 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2272 if (size
> 0 && usize
==0)
2273 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2275 v
= _PyUnicode_New(usize
);
2279 return (PyObject
*)v
;
2280 p
= PyUnicode_AS_UNICODE(v
);
2281 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2283 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2286 return (PyObject
*)v
;
2289 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
2297 /* If there are no characters, bail now! */
2299 return PyString_FromString("");
2301 /* First get the size of the result */
2302 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2304 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2306 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2312 /* Do the conversion */
2313 s
= PyString_AS_STRING(repr
);
2314 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2316 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2321 #endif /* MS_WIN32 */
2323 /* --- Character Mapping Codec -------------------------------------------- */
2326 int charmap_decoding_error(const char **source
,
2329 const char *details
)
2331 if ((errors
== NULL
) ||
2332 (strcmp(errors
,"strict") == 0)) {
2333 PyErr_Format(PyExc_UnicodeError
,
2334 "charmap decoding error: %.400s",
2338 else if (strcmp(errors
,"ignore") == 0) {
2341 else if (strcmp(errors
,"replace") == 0) {
2342 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2347 PyErr_Format(PyExc_ValueError
,
2348 "charmap decoding error; "
2349 "unknown error handling code: %.400s",
2355 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2364 /* Default to Latin-1 */
2365 if (mapping
== NULL
)
2366 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2368 v
= _PyUnicode_New(size
);
2372 return (PyObject
*)v
;
2373 p
= PyUnicode_AS_UNICODE(v
);
2374 while (size
-- > 0) {
2375 unsigned char ch
= *s
++;
2378 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2379 w
= PyInt_FromLong((long)ch
);
2382 x
= PyObject_GetItem(mapping
, w
);
2385 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2386 /* No mapping found means: mapping is undefined. */
2395 if (PyInt_Check(x
)) {
2396 long value
= PyInt_AS_LONG(x
);
2397 if (value
< 0 || value
> 65535) {
2398 PyErr_SetString(PyExc_TypeError
,
2399 "character mapping must be in range(65536)");
2403 *p
++ = (Py_UNICODE
)value
;
2405 else if (x
== Py_None
) {
2406 /* undefined mapping */
2407 if (charmap_decoding_error(&s
, &p
, errors
,
2408 "character maps to <undefined>")) {
2413 else if (PyUnicode_Check(x
)) {
2414 int targetsize
= PyUnicode_GET_SIZE(x
);
2416 if (targetsize
== 1)
2418 *p
++ = *PyUnicode_AS_UNICODE(x
);
2420 else if (targetsize
> 1) {
2422 if (targetsize
> extrachars
) {
2424 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2425 int needed
= (targetsize
- extrachars
) + \
2427 extrachars
+= needed
;
2428 if (_PyUnicode_Resize(&v
,
2429 PyUnicode_GET_SIZE(v
) + needed
)) {
2433 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2436 PyUnicode_AS_UNICODE(x
),
2439 extrachars
-= targetsize
;
2441 /* 1-0 mapping: skip the character */
2444 /* wrong return value */
2445 PyErr_SetString(PyExc_TypeError
,
2446 "character mapping must return integer, None or unicode");
2452 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2453 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2455 return (PyObject
*)v
;
2463 int charmap_encoding_error(const Py_UNICODE
**source
,
2466 const char *details
)
2468 if ((errors
== NULL
) ||
2469 (strcmp(errors
,"strict") == 0)) {
2470 PyErr_Format(PyExc_UnicodeError
,
2471 "charmap encoding error: %.400s",
2475 else if (strcmp(errors
,"ignore") == 0) {
2478 else if (strcmp(errors
,"replace") == 0) {
2484 PyErr_Format(PyExc_ValueError
,
2485 "charmap encoding error; "
2486 "unknown error handling code: %.400s",
2492 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2501 /* Default to Latin-1 */
2502 if (mapping
== NULL
)
2503 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2505 v
= PyString_FromStringAndSize(NULL
, size
);
2510 s
= PyString_AS_STRING(v
);
2511 while (size
-- > 0) {
2512 Py_UNICODE ch
= *p
++;
2515 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2516 w
= PyInt_FromLong((long)ch
);
2519 x
= PyObject_GetItem(mapping
, w
);
2522 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2523 /* No mapping found means: mapping is undefined. */
2532 if (PyInt_Check(x
)) {
2533 long value
= PyInt_AS_LONG(x
);
2534 if (value
< 0 || value
> 255) {
2535 PyErr_SetString(PyExc_TypeError
,
2536 "character mapping must be in range(256)");
2542 else if (x
== Py_None
) {
2543 /* undefined mapping */
2544 if (charmap_encoding_error(&p
, &s
, errors
,
2545 "character maps to <undefined>")) {
2550 else if (PyString_Check(x
)) {
2551 int targetsize
= PyString_GET_SIZE(x
);
2553 if (targetsize
== 1)
2555 *s
++ = *PyString_AS_STRING(x
);
2557 else if (targetsize
> 1) {
2559 if (targetsize
> extrachars
) {
2561 int oldpos
= (int)(s
- PyString_AS_STRING(v
));
2562 int needed
= (targetsize
- extrachars
) + \
2564 extrachars
+= needed
;
2565 if (_PyString_Resize(&v
, PyString_GET_SIZE(v
) + needed
)) {
2569 s
= PyString_AS_STRING(v
) + oldpos
;
2571 memcpy(s
, PyString_AS_STRING(x
), targetsize
);
2573 extrachars
-= targetsize
;
2575 /* 1-0 mapping: skip the character */
2578 /* wrong return value */
2579 PyErr_SetString(PyExc_TypeError
,
2580 "character mapping must return integer, None or unicode");
2586 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2587 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2596 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2599 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2600 PyErr_BadArgument();
2603 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2604 PyUnicode_GET_SIZE(unicode
),
2610 int translate_error(const Py_UNICODE
**source
,
2613 const char *details
)
2615 if ((errors
== NULL
) ||
2616 (strcmp(errors
,"strict") == 0)) {
2617 PyErr_Format(PyExc_UnicodeError
,
2618 "translate error: %.400s",
2622 else if (strcmp(errors
,"ignore") == 0) {
2625 else if (strcmp(errors
,"replace") == 0) {
2631 PyErr_Format(PyExc_ValueError
,
2633 "unknown error handling code: %.400s",
2639 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2647 if (mapping
== NULL
) {
2648 PyErr_BadArgument();
2652 /* Output will never be longer than input */
2653 v
= _PyUnicode_New(size
);
2658 p
= PyUnicode_AS_UNICODE(v
);
2659 while (size
-- > 0) {
2660 Py_UNICODE ch
= *s
++;
2664 w
= PyInt_FromLong(ch
);
2667 x
= PyObject_GetItem(mapping
, w
);
2670 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2671 /* No mapping found: default to 1-1 mapping */
2681 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2682 else if (x
== Py_None
) {
2683 /* undefined mapping */
2684 if (translate_error(&s
, &p
, errors
,
2685 "character maps to <undefined>")) {
2690 else if (PyUnicode_Check(x
)) {
2691 if (PyUnicode_GET_SIZE(x
) != 1) {
2693 PyErr_SetString(PyExc_NotImplementedError
,
2694 "1-n mappings are currently not implemented");
2698 *p
++ = *PyUnicode_AS_UNICODE(x
);
2701 /* wrong return value */
2702 PyErr_SetString(PyExc_TypeError
,
2703 "translate mapping must return integer, None or unicode");
2709 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2710 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2714 return (PyObject
*)v
;
2721 PyObject
*PyUnicode_Translate(PyObject
*str
,
2727 str
= PyUnicode_FromObject(str
);
2730 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2731 PyUnicode_GET_SIZE(str
),
2742 /* --- Decimal Encoder ---------------------------------------------------- */
2744 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2749 Py_UNICODE
*p
, *end
;
2751 if (output
== NULL
) {
2752 PyErr_BadArgument();
2759 register Py_UNICODE ch
= *p
++;
2762 if (Py_UNICODE_ISSPACE(ch
)) {
2766 decimal
= Py_UNICODE_TODECIMAL(ch
);
2768 *output
++ = '0' + decimal
;
2771 if (0 < ch
&& ch
< 256) {
2772 *output
++ = (char)ch
;
2775 /* All other characters are considered invalid */
2776 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2777 PyErr_SetString(PyExc_ValueError
,
2778 "invalid decimal Unicode string");
2781 else if (strcmp(errors
, "ignore") == 0)
2783 else if (strcmp(errors
, "replace") == 0) {
2788 /* 0-terminate the output string */
2796 /* --- Helpers ------------------------------------------------------------ */
2799 int count(PyUnicodeObject
*self
,
2802 PyUnicodeObject
*substring
)
2807 start
+= self
->length
;
2810 if (end
> self
->length
)
2813 end
+= self
->length
;
2817 if (substring
->length
== 0)
2818 return (end
- start
+ 1);
2820 end
-= substring
->length
;
2822 while (start
<= end
)
2823 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2825 start
+= substring
->length
;
2832 int PyUnicode_Count(PyObject
*str
,
2839 str
= PyUnicode_FromObject(str
);
2842 substr
= PyUnicode_FromObject(substr
);
2843 if (substr
== NULL
) {
2848 result
= count((PyUnicodeObject
*)str
,
2850 (PyUnicodeObject
*)substr
);
2858 int findstring(PyUnicodeObject
*self
,
2859 PyUnicodeObject
*substring
,
2865 start
+= self
->length
;
2869 if (substring
->length
== 0)
2872 if (end
> self
->length
)
2875 end
+= self
->length
;
2879 end
-= substring
->length
;
2881 if (direction
< 0) {
2882 for (; end
>= start
; end
--)
2883 if (Py_UNICODE_MATCH(self
, end
, substring
))
2886 for (; start
<= end
; start
++)
2887 if (Py_UNICODE_MATCH(self
, start
, substring
))
2894 int PyUnicode_Find(PyObject
*str
,
2902 str
= PyUnicode_FromObject(str
);
2905 substr
= PyUnicode_FromObject(substr
);
2906 if (substr
== NULL
) {
2911 result
= findstring((PyUnicodeObject
*)str
,
2912 (PyUnicodeObject
*)substr
,
2913 start
, end
, direction
);
2920 int tailmatch(PyUnicodeObject
*self
,
2921 PyUnicodeObject
*substring
,
2927 start
+= self
->length
;
2931 if (substring
->length
== 0)
2934 if (end
> self
->length
)
2937 end
+= self
->length
;
2941 end
-= substring
->length
;
2945 if (direction
> 0) {
2946 if (Py_UNICODE_MATCH(self
, end
, substring
))
2949 if (Py_UNICODE_MATCH(self
, start
, substring
))
2956 int PyUnicode_Tailmatch(PyObject
*str
,
2964 str
= PyUnicode_FromObject(str
);
2967 substr
= PyUnicode_FromObject(substr
);
2968 if (substr
== NULL
) {
2973 result
= tailmatch((PyUnicodeObject
*)str
,
2974 (PyUnicodeObject
*)substr
,
2975 start
, end
, direction
);
2982 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
2986 /* like wcschr, but doesn't stop at NUL characters */
2988 while (size
-- > 0) {
2997 /* Apply fixfct filter to the Unicode object self and return a
2998 reference to the modified object */
3001 PyObject
*fixup(PyUnicodeObject
*self
,
3002 int (*fixfct
)(PyUnicodeObject
*s
))
3007 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
3011 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
3013 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
3014 /* fixfct should return TRUE if it modified the buffer. If
3015 FALSE, return a reference to the original buffer instead
3016 (to save space, not time) */
3019 return (PyObject
*) self
;
3021 return (PyObject
*) u
;
3025 int fixupper(PyUnicodeObject
*self
)
3027 int len
= self
->length
;
3028 Py_UNICODE
*s
= self
->str
;
3032 register Py_UNICODE ch
;
3034 ch
= Py_UNICODE_TOUPPER(*s
);
3046 int fixlower(PyUnicodeObject
*self
)
3048 int len
= self
->length
;
3049 Py_UNICODE
*s
= self
->str
;
3053 register Py_UNICODE ch
;
3055 ch
= Py_UNICODE_TOLOWER(*s
);
3067 int fixswapcase(PyUnicodeObject
*self
)
3069 int len
= self
->length
;
3070 Py_UNICODE
*s
= self
->str
;
3074 if (Py_UNICODE_ISUPPER(*s
)) {
3075 *s
= Py_UNICODE_TOLOWER(*s
);
3077 } else if (Py_UNICODE_ISLOWER(*s
)) {
3078 *s
= Py_UNICODE_TOUPPER(*s
);
3088 int fixcapitalize(PyUnicodeObject
*self
)
3090 int len
= self
->length
;
3091 Py_UNICODE
*s
= self
->str
;
3096 if (Py_UNICODE_ISLOWER(*s
)) {
3097 *s
= Py_UNICODE_TOUPPER(*s
);
3102 if (Py_UNICODE_ISUPPER(*s
)) {
3103 *s
= Py_UNICODE_TOLOWER(*s
);
3112 int fixtitle(PyUnicodeObject
*self
)
3114 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3115 register Py_UNICODE
*e
;
3116 int previous_is_cased
;
3118 /* Shortcut for single character strings */
3119 if (PyUnicode_GET_SIZE(self
) == 1) {
3120 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
3129 e
= p
+ PyUnicode_GET_SIZE(self
);
3130 previous_is_cased
= 0;
3131 for (; p
< e
; p
++) {
3132 register const Py_UNICODE ch
= *p
;
3134 if (previous_is_cased
)
3135 *p
= Py_UNICODE_TOLOWER(ch
);
3137 *p
= Py_UNICODE_TOTITLE(ch
);
3139 if (Py_UNICODE_ISLOWER(ch
) ||
3140 Py_UNICODE_ISUPPER(ch
) ||
3141 Py_UNICODE_ISTITLE(ch
))
3142 previous_is_cased
= 1;
3144 previous_is_cased
= 0;
3149 PyObject
*PyUnicode_Join(PyObject
*separator
,
3154 PyUnicodeObject
*res
= NULL
;
3161 it
= PyObject_GetIter(seq
);
3165 if (separator
== NULL
) {
3166 Py_UNICODE blank
= ' ';
3171 separator
= PyUnicode_FromObject(separator
);
3172 if (separator
== NULL
)
3174 sep
= PyUnicode_AS_UNICODE(separator
);
3175 seplen
= PyUnicode_GET_SIZE(separator
);
3178 res
= _PyUnicode_New(sz
);
3181 p
= PyUnicode_AS_UNICODE(res
);
3184 for (i
= 0; ; ++i
) {
3186 PyObject
*item
= PyIter_Next(it
);
3188 if (PyErr_Occurred())
3192 if (!PyUnicode_Check(item
)) {
3194 if (!PyString_Check(item
)) {
3195 PyErr_Format(PyExc_TypeError
,
3196 "sequence item %i: expected string or Unicode,"
3198 i
, item
->ob_type
->tp_name
);
3202 v
= PyUnicode_FromObject(item
);
3208 itemlen
= PyUnicode_GET_SIZE(item
);
3209 while (reslen
+ itemlen
+ seplen
>= sz
) {
3210 if (_PyUnicode_Resize(&res
, sz
*2)) {
3215 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
3218 Py_UNICODE_COPY(p
, sep
, seplen
);
3222 Py_UNICODE_COPY(p
, PyUnicode_AS_UNICODE(item
), itemlen
);
3227 if (_PyUnicode_Resize(&res
, reslen
))
3230 Py_XDECREF(separator
);
3232 return (PyObject
*)res
;
3235 Py_XDECREF(separator
);
3242 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
3254 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
3259 u
= _PyUnicode_New(left
+ self
->length
+ right
);
3262 Py_UNICODE_FILL(u
->str
, fill
, left
);
3263 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
3265 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
3271 #define SPLIT_APPEND(data, left, right) \
3272 str = PyUnicode_FromUnicode(data + left, right - left); \
3275 if (PyList_Append(list, str)) { \
3283 PyObject
*split_whitespace(PyUnicodeObject
*self
,
3289 int len
= self
->length
;
3292 for (i
= j
= 0; i
< len
; ) {
3294 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
3297 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
3300 if (maxcount
-- <= 0)
3302 SPLIT_APPEND(self
->str
, j
, i
);
3303 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
3309 SPLIT_APPEND(self
->str
, j
, len
);
3318 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
3328 string
= PyUnicode_FromObject(string
);
3331 data
= PyUnicode_AS_UNICODE(string
);
3332 len
= PyUnicode_GET_SIZE(string
);
3334 list
= PyList_New(0);
3338 for (i
= j
= 0; i
< len
; ) {
3341 /* Find a line and append it */
3342 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
3345 /* Skip the line break reading CRLF as one line break */
3348 if (data
[i
] == '\r' && i
+ 1 < len
&&
3356 SPLIT_APPEND(data
, j
, eol
);
3360 SPLIT_APPEND(data
, j
, len
);
3373 PyObject
*split_char(PyUnicodeObject
*self
,
3380 int len
= self
->length
;
3383 for (i
= j
= 0; i
< len
; ) {
3384 if (self
->str
[i
] == ch
) {
3385 if (maxcount
-- <= 0)
3387 SPLIT_APPEND(self
->str
, j
, i
);
3393 SPLIT_APPEND(self
->str
, j
, len
);
3403 PyObject
*split_substring(PyUnicodeObject
*self
,
3405 PyUnicodeObject
*substring
,
3410 int len
= self
->length
;
3411 int sublen
= substring
->length
;
3414 for (i
= j
= 0; i
<= len
- sublen
; ) {
3415 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
3416 if (maxcount
-- <= 0)
3418 SPLIT_APPEND(self
->str
, j
, i
);
3424 SPLIT_APPEND(self
->str
, j
, len
);
3436 PyObject
*split(PyUnicodeObject
*self
,
3437 PyUnicodeObject
*substring
,
3445 list
= PyList_New(0);
3449 if (substring
== NULL
)
3450 return split_whitespace(self
,list
,maxcount
);
3452 else if (substring
->length
== 1)
3453 return split_char(self
,list
,substring
->str
[0],maxcount
);
3455 else if (substring
->length
== 0) {
3457 PyErr_SetString(PyExc_ValueError
, "empty separator");
3461 return split_substring(self
,list
,substring
,maxcount
);
3465 PyObject
*strip(PyUnicodeObject
*self
,
3469 Py_UNICODE
*p
= self
->str
;
3471 int end
= self
->length
;
3474 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
3478 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
3481 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
3482 /* couldn't strip anything off, return original string */
3484 return (PyObject
*) self
;
3487 return (PyObject
*) PyUnicode_FromUnicode(
3494 PyObject
*replace(PyUnicodeObject
*self
,
3495 PyUnicodeObject
*str1
,
3496 PyUnicodeObject
*str2
,
3504 if (str1
->length
== 1 && str2
->length
== 1) {
3507 /* replace characters */
3508 if (!findchar(self
->str
, self
->length
, str1
->str
[0]) &&
3509 PyUnicode_CheckExact(self
)) {
3510 /* nothing to replace, return original string */
3514 Py_UNICODE u1
= str1
->str
[0];
3515 Py_UNICODE u2
= str2
->str
[0];
3517 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3522 Py_UNICODE_COPY(u
->str
, self
->str
,
3524 for (i
= 0; i
< u
->length
; i
++)
3525 if (u
->str
[i
] == u1
) {
3537 /* replace strings */
3538 n
= count(self
, 0, self
->length
, str1
);
3541 if (n
== 0 && PyUnicode_CheckExact(self
)) {
3542 /* nothing to replace, return original string */
3547 self
->length
+ n
* (str2
->length
- str1
->length
));
3551 while (i
<= self
->length
- str1
->length
)
3552 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3553 /* replace string segment */
3554 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3558 /* copy remaining part */
3559 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3563 *p
++ = self
->str
[i
++];
3568 return (PyObject
*) u
;
3571 /* --- Unicode Object Methods --------------------------------------------- */
3573 static char title__doc__
[] =
3574 "S.title() -> unicode\n\
3576 Return a titlecased version of S, i.e. words start with title case\n\
3577 characters, all remaining cased characters have lower case.";
3580 unicode_title(PyUnicodeObject
*self
)
3582 return fixup(self
, fixtitle
);
3585 static char capitalize__doc__
[] =
3586 "S.capitalize() -> unicode\n\
3588 Return a capitalized version of S, i.e. make the first character\n\
3592 unicode_capitalize(PyUnicodeObject
*self
)
3594 return fixup(self
, fixcapitalize
);
3598 static char capwords__doc__
[] =
3599 "S.capwords() -> unicode\n\
3601 Apply .capitalize() to all words in S and return the result with\n\
3602 normalized whitespace (all whitespace strings are replaced by ' ').";
3605 unicode_capwords(PyUnicodeObject
*self
)
3611 /* Split into words */
3612 list
= split(self
, NULL
, -1);
3616 /* Capitalize each word */
3617 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3618 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3622 Py_DECREF(PyList_GET_ITEM(list
, i
));
3623 PyList_SET_ITEM(list
, i
, item
);
3626 /* Join the words to form a new string */
3627 item
= PyUnicode_Join(NULL
, list
);
3631 return (PyObject
*)item
;
3635 static char center__doc__
[] =
3636 "S.center(width) -> unicode\n\
3638 Return S centered in a Unicode string of length width. Padding is done\n\
3642 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3647 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3650 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
3652 return (PyObject
*) self
;
3655 marg
= width
- self
->length
;
3656 left
= marg
/ 2 + (marg
& width
& 1);
3658 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3663 /* This code should go into some future Unicode collation support
3664 module. The basic comparison should compare ordinals on a naive
3665 basis (this is what Java does and thus JPython too). */
3667 /* speedy UTF-16 code point order comparison */
3669 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3671 static short utf16Fixup
[32] =
3673 0, 0, 0, 0, 0, 0, 0, 0,
3674 0, 0, 0, 0, 0, 0, 0, 0,
3675 0, 0, 0, 0, 0, 0, 0, 0,
3676 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3680 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3684 Py_UNICODE
*s1
= str1
->str
;
3685 Py_UNICODE
*s2
= str2
->str
;
3687 len1
= str1
->length
;
3688 len2
= str2
->length
;
3690 while (len1
> 0 && len2
> 0) {
3696 if (c1
> (1<<11) * 26)
3697 c1
+= utf16Fixup
[c1
>>11];
3698 if (c2
> (1<<11) * 26)
3699 c2
+= utf16Fixup
[c2
>>11];
3700 /* now c1 and c2 are in UTF-32-compatible order */
3703 return (c1
< c2
) ? -1 : 1;
3708 return (len1
< len2
) ? -1 : (len1
!= len2
);
3714 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3716 register int len1
, len2
;
3718 Py_UNICODE
*s1
= str1
->str
;
3719 Py_UNICODE
*s2
= str2
->str
;
3721 len1
= str1
->length
;
3722 len2
= str2
->length
;
3724 while (len1
> 0 && len2
> 0) {
3731 return (c1
< c2
) ? -1 : 1;
3736 return (len1
< len2
) ? -1 : (len1
!= len2
);
3741 int PyUnicode_Compare(PyObject
*left
,
3744 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3747 /* Coerce the two arguments */
3748 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3751 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3755 /* Shortcut for empty or interned objects */
3762 result
= unicode_compare(u
, v
);
3774 int PyUnicode_Contains(PyObject
*container
,
3777 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3779 register const Py_UNICODE
*p
, *e
;
3780 register Py_UNICODE ch
;
3782 /* Coerce the two arguments */
3783 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3785 PyErr_SetString(PyExc_TypeError
,
3786 "'in <string>' requires character as left operand");
3789 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3796 if (PyUnicode_GET_SIZE(v
) != 1) {
3797 PyErr_SetString(PyExc_TypeError
,
3798 "'in <string>' requires character as left operand");
3801 ch
= *PyUnicode_AS_UNICODE(v
);
3802 p
= PyUnicode_AS_UNICODE(u
);
3803 e
= p
+ PyUnicode_GET_SIZE(u
);
3822 /* Concat to string or Unicode object giving a new Unicode object. */
3824 PyObject
*PyUnicode_Concat(PyObject
*left
,
3827 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3829 /* Coerce the two arguments */
3830 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3833 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3838 if (v
== unicode_empty
) {
3840 return (PyObject
*)u
;
3842 if (u
== unicode_empty
) {
3844 return (PyObject
*)v
;
3847 /* Concat the two Unicode strings */
3848 w
= _PyUnicode_New(u
->length
+ v
->length
);
3851 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3852 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3856 return (PyObject
*)w
;
3864 static char count__doc__
[] =
3865 "S.count(sub[, start[, end]]) -> int\n\
3867 Return the number of occurrences of substring sub in Unicode string\n\
3868 S[start:end]. Optional arguments start and end are\n\
3869 interpreted as in slice notation.";
3872 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3874 PyUnicodeObject
*substring
;
3879 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3880 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3883 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3884 (PyObject
*)substring
);
3885 if (substring
== NULL
)
3889 start
+= self
->length
;
3892 if (end
> self
->length
)
3895 end
+= self
->length
;
3899 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3901 Py_DECREF(substring
);
3905 static char encode__doc__
[] =
3906 "S.encode([encoding[,errors]]) -> string\n\
3908 Return an encoded string version of S. Default encoding is the current\n\
3909 default string encoding. errors may be given to set a different error\n\
3910 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3911 a ValueError. Other possible values are 'ignore' and 'replace'.";
3914 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3916 char *encoding
= NULL
;
3917 char *errors
= NULL
;
3918 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3920 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3923 static char expandtabs__doc__
[] =
3924 "S.expandtabs([tabsize]) -> unicode\n\
3926 Return a copy of S where all tab characters are expanded using spaces.\n\
3927 If tabsize is not given, a tab size of 8 characters is assumed.";
3930 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3939 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3942 /* First pass: determine size of output string */
3944 e
= self
->str
+ self
->length
;
3945 for (p
= self
->str
; p
< e
; p
++)
3948 j
+= tabsize
- (j
% tabsize
);
3952 if (*p
== '\n' || *p
== '\r') {
3958 /* Second pass: create output string and fill it */
3959 u
= _PyUnicode_New(i
+ j
);
3966 for (p
= self
->str
; p
< e
; p
++)
3969 i
= tabsize
- (j
% tabsize
);
3978 if (*p
== '\n' || *p
== '\r')
3982 return (PyObject
*) u
;
3985 static char find__doc__
[] =
3986 "S.find(sub [,start [,end]]) -> int\n\
3988 Return the lowest index in S where substring sub is found,\n\
3989 such that sub is contained within s[start,end]. Optional\n\
3990 arguments start and end are interpreted as in slice notation.\n\
3992 Return -1 on failure.";
3995 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
3997 PyUnicodeObject
*substring
;
4002 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
4003 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4005 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4006 (PyObject
*)substring
);
4007 if (substring
== NULL
)
4010 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
4012 Py_DECREF(substring
);
4017 unicode_getitem(PyUnicodeObject
*self
, int index
)
4019 if (index
< 0 || index
>= self
->length
) {
4020 PyErr_SetString(PyExc_IndexError
, "string index out of range");
4024 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
4028 unicode_hash(PyUnicodeObject
*self
)
4030 /* Since Unicode objects compare equal to their ASCII string
4031 counterparts, they should use the individual character values
4032 as basis for their hash value. This is needed to assure that
4033 strings and Unicode objects behave in the same way as
4037 register Py_UNICODE
*p
;
4040 if (self
->hash
!= -1)
4042 len
= PyUnicode_GET_SIZE(self
);
4043 p
= PyUnicode_AS_UNICODE(self
);
4046 x
= (1000003*x
) ^ *p
++;
4047 x
^= PyUnicode_GET_SIZE(self
);
4054 static char index__doc__
[] =
4055 "S.index(sub [,start [,end]]) -> int\n\
4057 Like S.find() but raise ValueError when the substring is not found.";
4060 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
4063 PyUnicodeObject
*substring
;
4067 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
4068 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4071 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4072 (PyObject
*)substring
);
4073 if (substring
== NULL
)
4076 result
= findstring(self
, substring
, start
, end
, 1);
4078 Py_DECREF(substring
);
4080 PyErr_SetString(PyExc_ValueError
, "substring not found");
4083 return PyInt_FromLong(result
);
4086 static char islower__doc__
[] =
4087 "S.islower() -> int\n\
4089 Return 1 if all cased characters in S are lowercase and there is\n\
4090 at least one cased character in S, 0 otherwise.";
4093 unicode_islower(PyUnicodeObject
*self
)
4095 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4096 register const Py_UNICODE
*e
;
4099 /* Shortcut for single character strings */
4100 if (PyUnicode_GET_SIZE(self
) == 1)
4101 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
4103 /* Special case for empty strings */
4104 if (PyString_GET_SIZE(self
) == 0)
4105 return PyInt_FromLong(0);
4107 e
= p
+ PyUnicode_GET_SIZE(self
);
4109 for (; p
< e
; p
++) {
4110 register const Py_UNICODE ch
= *p
;
4112 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
4113 return PyInt_FromLong(0);
4114 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
4117 return PyInt_FromLong(cased
);
4120 static char isupper__doc__
[] =
4121 "S.isupper() -> int\n\
4123 Return 1 if all cased characters in S are uppercase and there is\n\
4124 at least one cased character in S, 0 otherwise.";
4127 unicode_isupper(PyUnicodeObject
*self
)
4129 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4130 register const Py_UNICODE
*e
;
4133 /* Shortcut for single character strings */
4134 if (PyUnicode_GET_SIZE(self
) == 1)
4135 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
4137 /* Special case for empty strings */
4138 if (PyString_GET_SIZE(self
) == 0)
4139 return PyInt_FromLong(0);
4141 e
= p
+ PyUnicode_GET_SIZE(self
);
4143 for (; p
< e
; p
++) {
4144 register const Py_UNICODE ch
= *p
;
4146 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
4147 return PyInt_FromLong(0);
4148 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
4151 return PyInt_FromLong(cased
);
4154 static char istitle__doc__
[] =
4155 "S.istitle() -> int\n\
4157 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4158 may only follow uncased characters and lowercase characters only cased\n\
4159 ones. Return 0 otherwise.";
4162 unicode_istitle(PyUnicodeObject
*self
)
4164 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4165 register const Py_UNICODE
*e
;
4166 int cased
, previous_is_cased
;
4168 /* Shortcut for single character strings */
4169 if (PyUnicode_GET_SIZE(self
) == 1)
4170 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
4171 (Py_UNICODE_ISUPPER(*p
) != 0));
4173 /* Special case for empty strings */
4174 if (PyString_GET_SIZE(self
) == 0)
4175 return PyInt_FromLong(0);
4177 e
= p
+ PyUnicode_GET_SIZE(self
);
4179 previous_is_cased
= 0;
4180 for (; p
< e
; p
++) {
4181 register const Py_UNICODE ch
= *p
;
4183 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
4184 if (previous_is_cased
)
4185 return PyInt_FromLong(0);
4186 previous_is_cased
= 1;
4189 else if (Py_UNICODE_ISLOWER(ch
)) {
4190 if (!previous_is_cased
)
4191 return PyInt_FromLong(0);
4192 previous_is_cased
= 1;
4196 previous_is_cased
= 0;
4198 return PyInt_FromLong(cased
);
4201 static char isspace__doc__
[] =
4202 "S.isspace() -> int\n\
4204 Return 1 if there are only whitespace characters in S,\n\
4208 unicode_isspace(PyUnicodeObject
*self
)
4210 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4211 register const Py_UNICODE
*e
;
4213 /* Shortcut for single character strings */
4214 if (PyUnicode_GET_SIZE(self
) == 1 &&
4215 Py_UNICODE_ISSPACE(*p
))
4216 return PyInt_FromLong(1);
4218 /* Special case for empty strings */
4219 if (PyString_GET_SIZE(self
) == 0)
4220 return PyInt_FromLong(0);
4222 e
= p
+ PyUnicode_GET_SIZE(self
);
4223 for (; p
< e
; p
++) {
4224 if (!Py_UNICODE_ISSPACE(*p
))
4225 return PyInt_FromLong(0);
4227 return PyInt_FromLong(1);
4230 static char isalpha__doc__
[] =
4231 "S.isalpha() -> int\n\
4233 Return 1 if all characters in S are alphabetic\n\
4234 and there is at least one character in S, 0 otherwise.";
4237 unicode_isalpha(PyUnicodeObject
*self
)
4239 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4240 register const Py_UNICODE
*e
;
4242 /* Shortcut for single character strings */
4243 if (PyUnicode_GET_SIZE(self
) == 1 &&
4244 Py_UNICODE_ISALPHA(*p
))
4245 return PyInt_FromLong(1);
4247 /* Special case for empty strings */
4248 if (PyString_GET_SIZE(self
) == 0)
4249 return PyInt_FromLong(0);
4251 e
= p
+ PyUnicode_GET_SIZE(self
);
4252 for (; p
< e
; p
++) {
4253 if (!Py_UNICODE_ISALPHA(*p
))
4254 return PyInt_FromLong(0);
4256 return PyInt_FromLong(1);
4259 static char isalnum__doc__
[] =
4260 "S.isalnum() -> int\n\
4262 Return 1 if all characters in S are alphanumeric\n\
4263 and there is at least one character in S, 0 otherwise.";
4266 unicode_isalnum(PyUnicodeObject
*self
)
4268 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4269 register const Py_UNICODE
*e
;
4271 /* Shortcut for single character strings */
4272 if (PyUnicode_GET_SIZE(self
) == 1 &&
4273 Py_UNICODE_ISALNUM(*p
))
4274 return PyInt_FromLong(1);
4276 /* Special case for empty strings */
4277 if (PyString_GET_SIZE(self
) == 0)
4278 return PyInt_FromLong(0);
4280 e
= p
+ PyUnicode_GET_SIZE(self
);
4281 for (; p
< e
; p
++) {
4282 if (!Py_UNICODE_ISALNUM(*p
))
4283 return PyInt_FromLong(0);
4285 return PyInt_FromLong(1);
4288 static char isdecimal__doc__
[] =
4289 "S.isdecimal() -> int\n\
4291 Return 1 if there are only decimal characters in S,\n\
4295 unicode_isdecimal(PyUnicodeObject
*self
)
4297 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4298 register const Py_UNICODE
*e
;
4300 /* Shortcut for single character strings */
4301 if (PyUnicode_GET_SIZE(self
) == 1 &&
4302 Py_UNICODE_ISDECIMAL(*p
))
4303 return PyInt_FromLong(1);
4305 /* Special case for empty strings */
4306 if (PyString_GET_SIZE(self
) == 0)
4307 return PyInt_FromLong(0);
4309 e
= p
+ PyUnicode_GET_SIZE(self
);
4310 for (; p
< e
; p
++) {
4311 if (!Py_UNICODE_ISDECIMAL(*p
))
4312 return PyInt_FromLong(0);
4314 return PyInt_FromLong(1);
4317 static char isdigit__doc__
[] =
4318 "S.isdigit() -> int\n\
4320 Return 1 if there are only digit characters in S,\n\
4324 unicode_isdigit(PyUnicodeObject
*self
)
4326 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4327 register const Py_UNICODE
*e
;
4329 /* Shortcut for single character strings */
4330 if (PyUnicode_GET_SIZE(self
) == 1 &&
4331 Py_UNICODE_ISDIGIT(*p
))
4332 return PyInt_FromLong(1);
4334 /* Special case for empty strings */
4335 if (PyString_GET_SIZE(self
) == 0)
4336 return PyInt_FromLong(0);
4338 e
= p
+ PyUnicode_GET_SIZE(self
);
4339 for (; p
< e
; p
++) {
4340 if (!Py_UNICODE_ISDIGIT(*p
))
4341 return PyInt_FromLong(0);
4343 return PyInt_FromLong(1);
4346 static char isnumeric__doc__
[] =
4347 "S.isnumeric() -> int\n\
4349 Return 1 if there are only numeric characters in S,\n\
4353 unicode_isnumeric(PyUnicodeObject
*self
)
4355 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4356 register const Py_UNICODE
*e
;
4358 /* Shortcut for single character strings */
4359 if (PyUnicode_GET_SIZE(self
) == 1 &&
4360 Py_UNICODE_ISNUMERIC(*p
))
4361 return PyInt_FromLong(1);
4363 /* Special case for empty strings */
4364 if (PyString_GET_SIZE(self
) == 0)
4365 return PyInt_FromLong(0);
4367 e
= p
+ PyUnicode_GET_SIZE(self
);
4368 for (; p
< e
; p
++) {
4369 if (!Py_UNICODE_ISNUMERIC(*p
))
4370 return PyInt_FromLong(0);
4372 return PyInt_FromLong(1);
4375 static char join__doc__
[] =
4376 "S.join(sequence) -> unicode\n\
4378 Return a string which is the concatenation of the strings in the\n\
4379 sequence. The separator between elements is S.";
4382 unicode_join(PyObject
*self
, PyObject
*data
)
4384 return PyUnicode_Join(self
, data
);
4388 unicode_length(PyUnicodeObject
*self
)
4390 return self
->length
;
4393 static char ljust__doc__
[] =
4394 "S.ljust(width) -> unicode\n\
4396 Return S left justified in a Unicode string of length width. Padding is\n\
4397 done using spaces.";
4400 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
4403 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
4406 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4408 return (PyObject
*) self
;
4411 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
4414 static char lower__doc__
[] =
4415 "S.lower() -> unicode\n\
4417 Return a copy of the string S converted to lowercase.";
4420 unicode_lower(PyUnicodeObject
*self
)
4422 return fixup(self
, fixlower
);
4425 static char lstrip__doc__
[] =
4426 "S.lstrip() -> unicode\n\
4428 Return a copy of the string S with leading whitespace removed.";
4431 unicode_lstrip(PyUnicodeObject
*self
)
4433 return strip(self
, 1, 0);
4437 unicode_repeat(PyUnicodeObject
*str
, int len
)
4447 if (len
== 1 && PyUnicode_CheckExact(str
)) {
4448 /* no repeat, return original string */
4450 return (PyObject
*) str
;
4453 /* ensure # of chars needed doesn't overflow int and # of bytes
4454 * needed doesn't overflow size_t
4456 nchars
= len
* str
->length
;
4457 if (len
&& nchars
/ len
!= str
->length
) {
4458 PyErr_SetString(PyExc_OverflowError
,
4459 "repeated string is too long");
4462 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
4463 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
4464 PyErr_SetString(PyExc_OverflowError
,
4465 "repeated string is too long");
4468 u
= _PyUnicode_New(nchars
);
4475 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4479 return (PyObject
*) u
;
4482 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4492 self
= PyUnicode_FromObject(obj
);
4495 str1
= PyUnicode_FromObject(subobj
);
4500 str2
= PyUnicode_FromObject(replobj
);
4506 result
= replace((PyUnicodeObject
*)self
,
4507 (PyUnicodeObject
*)str1
,
4508 (PyUnicodeObject
*)str2
,
4516 static char replace__doc__
[] =
4517 "S.replace (old, new[, maxsplit]) -> unicode\n\
4519 Return a copy of S with all occurrences of substring\n\
4520 old replaced by new. If the optional argument maxsplit is\n\
4521 given, only the first maxsplit occurrences are replaced.";
4524 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4526 PyUnicodeObject
*str1
;
4527 PyUnicodeObject
*str2
;
4531 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4533 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4536 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4540 result
= replace(self
, str1
, str2
, maxcount
);
4548 PyObject
*unicode_repr(PyObject
*unicode
)
4550 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4551 PyUnicode_GET_SIZE(unicode
),
4555 static char rfind__doc__
[] =
4556 "S.rfind(sub [,start [,end]]) -> int\n\
4558 Return the highest index in S where substring sub is found,\n\
4559 such that sub is contained within s[start,end]. Optional\n\
4560 arguments start and end are interpreted as in slice notation.\n\
4562 Return -1 on failure.";
4565 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4567 PyUnicodeObject
*substring
;
4572 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4573 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4575 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4576 (PyObject
*)substring
);
4577 if (substring
== NULL
)
4580 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4582 Py_DECREF(substring
);
4586 static char rindex__doc__
[] =
4587 "S.rindex(sub [,start [,end]]) -> int\n\
4589 Like S.rfind() but raise ValueError when the substring is not found.";
4592 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4595 PyUnicodeObject
*substring
;
4599 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4600 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4602 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4603 (PyObject
*)substring
);
4604 if (substring
== NULL
)
4607 result
= findstring(self
, substring
, start
, end
, -1);
4609 Py_DECREF(substring
);
4611 PyErr_SetString(PyExc_ValueError
, "substring not found");
4614 return PyInt_FromLong(result
);
4617 static char rjust__doc__
[] =
4618 "S.rjust(width) -> unicode\n\
4620 Return S right justified in a Unicode string of length width. Padding is\n\
4621 done using spaces.";
4624 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4627 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4630 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4632 return (PyObject
*) self
;
4635 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4638 static char rstrip__doc__
[] =
4639 "S.rstrip() -> unicode\n\
4641 Return a copy of the string S with trailing whitespace removed.";
4644 unicode_rstrip(PyUnicodeObject
*self
)
4646 return strip(self
, 0, 1);
4650 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4652 /* standard clamping */
4657 if (end
> self
->length
)
4659 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
4660 /* full slice, return original string */
4662 return (PyObject
*) self
;
4667 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4671 PyObject
*PyUnicode_Split(PyObject
*s
,
4677 s
= PyUnicode_FromObject(s
);
4681 sep
= PyUnicode_FromObject(sep
);
4688 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4695 static char split__doc__
[] =
4696 "S.split([sep [,maxsplit]]) -> list of strings\n\
4698 Return a list of the words in S, using sep as the\n\
4699 delimiter string. If maxsplit is given, at most maxsplit\n\
4700 splits are done. If sep is not specified, any whitespace string\n\
4704 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4706 PyObject
*substring
= Py_None
;
4709 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4712 if (substring
== Py_None
)
4713 return split(self
, NULL
, maxcount
);
4714 else if (PyUnicode_Check(substring
))
4715 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4717 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
/* Docstring for the unicode splitlines() method.  The signature line
   shows keepends as a single optional argument. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
4728 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4732 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4735 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4739 PyObject
*unicode_str(PyUnicodeObject
*self
)
4741 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4744 static char strip__doc__
[] =
4745 "S.strip() -> unicode\n\
4747 Return a copy of S with leading and trailing whitespace removed.";
4750 unicode_strip(PyUnicodeObject
*self
)
4752 return strip(self
, 1, 1);
4755 static char swapcase__doc__
[] =
4756 "S.swapcase() -> unicode\n\
4758 Return a copy of S with uppercase characters converted to lowercase\n\
4762 unicode_swapcase(PyUnicodeObject
*self
)
4764 return fixup(self
, fixswapcase
);
4767 static char translate__doc__
[] =
4768 "S.translate(table) -> unicode\n\
4770 Return a copy of the string S, where all characters have been mapped\n\
4771 through the given translation table, which must be a mapping of\n\
4772 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4773 are left untouched. Characters mapped to None are deleted.";
4776 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
4778 return PyUnicode_TranslateCharmap(self
->str
,
4784 static char upper__doc__
[] =
4785 "S.upper() -> unicode\n\
4787 Return a copy of S converted to uppercase.";
4790 unicode_upper(PyUnicodeObject
*self
)
4792 return fixup(self
, fixupper
);
4796 static char zfill__doc__
[] =
4797 "S.zfill(width) -> unicode\n\
4799 Pad a numeric string x with zeros on the left, to fill a field\n\
4800 of the specified width. The string x is never truncated.";
4803 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4809 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4812 if (self
->length
>= width
) {
4814 return (PyObject
*) self
;
4817 fill
= width
- self
->length
;
4819 u
= pad(self
, fill
, 0, '0');
4821 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4822 /* move sign to beginning of string */
4823 u
->str
[0] = u
->str
[fill
];
4827 return (PyObject
*) u
;
4833 unicode_freelistsize(PyUnicodeObject
*self
)
4835 return PyInt_FromLong(unicode_freelist_size
);
4839 static char startswith__doc__
[] =
4840 "S.startswith(prefix[, start[, end]]) -> int\n\
4842 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4843 optional start, test S beginning at that position. With optional end, stop\n\
4844 comparing S at that position.";
4847 unicode_startswith(PyUnicodeObject
*self
,
4850 PyUnicodeObject
*substring
;
4855 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4856 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4858 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4859 (PyObject
*)substring
);
4860 if (substring
== NULL
)
4863 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4865 Py_DECREF(substring
);
4870 static char endswith__doc__
[] =
4871 "S.endswith(suffix[, start[, end]]) -> int\n\
4873 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4874 optional start, test S beginning at that position. With optional end, stop\n\
4875 comparing S at that position.";
4878 unicode_endswith(PyUnicodeObject
*self
,
4881 PyUnicodeObject
*substring
;
4886 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4887 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4889 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4890 (PyObject
*)substring
);
4891 if (substring
== NULL
)
4894 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4896 Py_DECREF(substring
);
4901 static PyMethodDef unicode_methods
[] = {
4903 /* Order is according to common usage: often used methods should
4904 appear first, since lookup is done sequentially. */
4906 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
4907 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
4908 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
4909 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
4910 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
4911 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
4912 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
4913 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
4914 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
4915 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
4916 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
4917 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
4918 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
4919 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_NOARGS
, lstrip__doc__
},
4920 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4921 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
4922 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
4923 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
4924 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_NOARGS
, rstrip__doc__
},
4925 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
4926 {"strip", (PyCFunction
) unicode_strip
, METH_NOARGS
, strip__doc__
},
4927 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
4928 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
4929 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
4930 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
4931 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
4932 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
4933 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
4934 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
4935 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
4936 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
4937 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
4938 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
4939 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
4940 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
4942 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
4943 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
4947 /* This one is just used for debugging the implementation. */
4948 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
4954 static PySequenceMethods unicode_as_sequence
= {
4955 (inquiry
) unicode_length
, /* sq_length */
4956 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4957 (intargfunc
) unicode_repeat
, /* sq_repeat */
4958 (intargfunc
) unicode_getitem
, /* sq_item */
4959 (intintargfunc
) unicode_slice
, /* sq_slice */
4960 0, /* sq_ass_item */
4961 0, /* sq_ass_slice */
4962 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4966 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4971 PyErr_SetString(PyExc_SystemError
,
4972 "accessing non-existent unicode segment");
4975 *ptr
= (void *) self
->str
;
4976 return PyUnicode_GET_DATA_SIZE(self
);
4980 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
4983 PyErr_SetString(PyExc_TypeError
,
4984 "cannot use unicode as modifyable buffer");
4989 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
4993 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
4998 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
5005 PyErr_SetString(PyExc_SystemError
,
5006 "accessing non-existent unicode segment");
5009 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
5012 *ptr
= (void *) PyString_AS_STRING(str
);
5013 return PyString_GET_SIZE(str
);
5016 /* Helpers for PyUnicode_Format() */
5019 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
5021 int argidx
= *p_argidx
;
5022 if (argidx
< arglen
) {
5027 return PyTuple_GetItem(args
, argidx
);
5029 PyErr_SetString(PyExc_TypeError
,
5030 "not enough arguments for format string");
5034 #define F_LJUST (1<<0)
5035 #define F_SIGN (1<<1)
5036 #define F_BLANK (1<<2)
5037 #define F_ALT (1<<3)
5038 #define F_ZERO (1<<4)
5041 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
5047 va_start(va
, format
);
5049 /* First, format the string as char array, then expand to Py_UNICODE
5051 charbuffer
= (char *)buffer
;
5052 len
= vsprintf(charbuffer
, format
, va
);
5053 for (i
= len
- 1; i
>= 0; i
--)
5054 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
5061 formatfloat(Py_UNICODE
*buf
,
5068 /* fmt = '%#.' + `prec` + `type`
5069 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5073 x
= PyFloat_AsDouble(v
);
5074 if (x
== -1.0 && PyErr_Occurred())
5078 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
5080 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
5081 (flags
& F_ALT
) ? "#" : "", prec
, type
);
5082 /* worst case length calc to ensure no buffer overrun:
5084 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5085 for any double rep.)
5086 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5087 If prec=0 the effective precision is 1 (the leading digit is
5088 always given), therefore increase by one to 10+prec. */
5089 if (buflen
<= (size_t)10 + (size_t)prec
) {
5090 PyErr_SetString(PyExc_OverflowError
,
5091 "formatted float is too long (precision too long?)");
5094 return usprintf(buf
, fmt
, x
);
5098 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
5102 PyObject
*str
; /* temporary string object. */
5103 PyUnicodeObject
*result
;
5105 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
5108 result
= _PyUnicode_New(len
);
5109 for (i
= 0; i
< len
; i
++)
5110 result
->str
[i
] = buf
[i
];
5111 result
->str
[len
] = 0;
5113 return (PyObject
*)result
;
5117 formatint(Py_UNICODE
*buf
,
5124 /* fmt = '%#.' + `prec` + 'l' + `type`
5125 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5127 char fmt
[64]; /* plenty big enough! */
5129 int use_native_c_format
= 1;
5131 x
= PyInt_AsLong(v
);
5132 if (x
== -1 && PyErr_Occurred())
5136 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5137 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5138 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
5139 PyErr_SetString(PyExc_OverflowError
,
5140 "formatted integer is too long (precision too long?)");
5143 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5144 * but we want it (for consistency with other %#x conversions, and
5145 * for consistency with Python's hex() function).
5146 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5147 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5148 * So add it only if the platform doesn't already.
5150 if (x
== 0 && (flags
& F_ALT
) && (type
== 'x' || type
== 'X')) {
5151 /* Only way to know what the platform does is to try it. */
5152 PyOS_snprintf(fmt
, sizeof(fmt
), type
== 'x' ? "%#x" : "%#X", 0);
5153 if (fmt
[1] != (char)type
) {
5154 /* Supply our own leading 0x/0X -- needed under std C */
5155 use_native_c_format
= 0;
5156 PyOS_snprintf(fmt
, sizeof(fmt
), "0%c%%#.%dl%c", type
, prec
, type
);
5159 if (use_native_c_format
)
5160 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%dl%c",
5161 (flags
& F_ALT
) ? "#" : "", prec
, type
);
5162 return usprintf(buf
, fmt
, x
);
5166 formatchar(Py_UNICODE
*buf
,
5170 /* presume that the buffer is at least 2 characters long */
5171 if (PyUnicode_Check(v
)) {
5172 if (PyUnicode_GET_SIZE(v
) != 1)
5174 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
5177 else if (PyString_Check(v
)) {
5178 if (PyString_GET_SIZE(v
) != 1)
5180 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
5184 /* Integer input truncated to a character */
5186 x
= PyInt_AsLong(v
);
5187 if (x
== -1 && PyErr_Occurred())
5195 PyErr_SetString(PyExc_TypeError
,
5196 "%c requires int or char");
5200 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5202 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5203 chars are formatted. XXX This is a magic number. Each formatting
5204 routine does bounds checking to ensure no overflow, but a better
5205 solution may be to malloc a buffer of appropriate size for each
5206 format. For now, the current solution is sufficient.
5208 #define FORMATBUFLEN (size_t)120
5210 PyObject
*PyUnicode_Format(PyObject
*format
,
5213 Py_UNICODE
*fmt
, *res
;
5214 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
5216 PyUnicodeObject
*result
= NULL
;
5217 PyObject
*dict
= NULL
;
5220 if (format
== NULL
|| args
== NULL
) {
5221 PyErr_BadInternalCall();
5224 uformat
= PyUnicode_FromObject(format
);
5225 if (uformat
== NULL
)
5227 fmt
= PyUnicode_AS_UNICODE(uformat
);
5228 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
5230 reslen
= rescnt
= fmtcnt
+ 100;
5231 result
= _PyUnicode_New(reslen
);
5234 res
= PyUnicode_AS_UNICODE(result
);
5236 if (PyTuple_Check(args
)) {
5237 arglen
= PyTuple_Size(args
);
5244 if (args
->ob_type
->tp_as_mapping
)
5247 while (--fmtcnt
>= 0) {
5250 rescnt
= fmtcnt
+ 100;
5252 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5254 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
5260 /* Got a format specifier */
5264 Py_UNICODE c
= '\0';
5267 PyObject
*temp
= NULL
;
5271 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
5275 Py_UNICODE
*keystart
;
5281 PyErr_SetString(PyExc_TypeError
,
5282 "format requires a mapping");
5288 /* Skip over balanced parentheses */
5289 while (pcount
> 0 && --fmtcnt
>= 0) {
5292 else if (*fmt
== '(')
5296 keylen
= fmt
- keystart
- 1;
5297 if (fmtcnt
< 0 || pcount
> 0) {
5298 PyErr_SetString(PyExc_ValueError
,
5299 "incomplete format key");
5303 /* keys are converted to strings using UTF-8 and
5304 then looked up since Python uses strings to hold
5305 variables names etc. in its namespaces and we
5306 wouldn't want to break common idioms. */
5307 key
= PyUnicode_EncodeUTF8(keystart
,
5311 key
= PyUnicode_FromUnicode(keystart
, keylen
);
5319 args
= PyObject_GetItem(dict
, key
);
5328 while (--fmtcnt
>= 0) {
5329 switch (c
= *fmt
++) {
5330 case '-': flags
|= F_LJUST
; continue;
5331 case '+': flags
|= F_SIGN
; continue;
5332 case ' ': flags
|= F_BLANK
; continue;
5333 case '#': flags
|= F_ALT
; continue;
5334 case '0': flags
|= F_ZERO
; continue;
5339 v
= getnextarg(args
, arglen
, &argidx
);
5342 if (!PyInt_Check(v
)) {
5343 PyErr_SetString(PyExc_TypeError
,
5347 width
= PyInt_AsLong(v
);
5355 else if (c
>= '0' && c
<= '9') {
5357 while (--fmtcnt
>= 0) {
5359 if (c
< '0' || c
> '9')
5361 if ((width
*10) / 10 != width
) {
5362 PyErr_SetString(PyExc_ValueError
,
5366 width
= width
*10 + (c
- '0');
5374 v
= getnextarg(args
, arglen
, &argidx
);
5377 if (!PyInt_Check(v
)) {
5378 PyErr_SetString(PyExc_TypeError
,
5382 prec
= PyInt_AsLong(v
);
5388 else if (c
>= '0' && c
<= '9') {
5390 while (--fmtcnt
>= 0) {
5391 c
= Py_CHARMASK(*fmt
++);
5392 if (c
< '0' || c
> '9')
5394 if ((prec
*10) / 10 != prec
) {
5395 PyErr_SetString(PyExc_ValueError
,
5399 prec
= prec
*10 + (c
- '0');
5404 if (c
== 'h' || c
== 'l' || c
== 'L') {
5410 PyErr_SetString(PyExc_ValueError
,
5411 "incomplete format");
5415 v
= getnextarg(args
, arglen
, &argidx
);
5425 /* presume that buffer length is at least 1 */
5432 if (PyUnicode_Check(v
) && c
== 's') {
5439 temp
= PyObject_Str(v
);
5441 temp
= PyObject_Repr(v
);
5444 if (!PyString_Check(temp
)) {
5445 /* XXX Note: this should never happen, since
5446 PyObject_Repr() and PyObject_Str() assure
5449 PyErr_SetString(PyExc_TypeError
,
5450 "%s argument has non-string str()");
5453 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
5454 PyString_GET_SIZE(temp
),
5462 pbuf
= PyUnicode_AS_UNICODE(temp
);
5463 len
= PyUnicode_GET_SIZE(temp
);
5464 if (prec
>= 0 && len
> prec
)
5476 if (PyLong_Check(v
)) {
5477 temp
= formatlong(v
, flags
, prec
, c
);
5480 pbuf
= PyUnicode_AS_UNICODE(temp
);
5481 len
= PyUnicode_GET_SIZE(temp
);
5482 /* unbounded ints can always produce
5483 a sign character! */
5488 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5492 /* only d conversion is signed */
5505 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5516 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
5522 PyErr_Format(PyExc_ValueError
,
5523 "unsupported format character '%c' (0x%x) "
5525 (31<=c
&& c
<=126) ? c
: '?',
5526 c
, fmt
-1 - PyUnicode_AS_UNICODE(uformat
));
5530 if (*pbuf
== '-' || *pbuf
== '+') {
5534 else if (flags
& F_SIGN
)
5536 else if (flags
& F_BLANK
)
5543 if (rescnt
< width
+ (sign
!= 0)) {
5545 rescnt
= width
+ fmtcnt
+ 100;
5547 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5549 res
= PyUnicode_AS_UNICODE(result
)
5559 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5560 assert(pbuf
[0] == '0');
5561 assert(pbuf
[1] == c
);
5572 if (width
> len
&& !(flags
& F_LJUST
)) {
5576 } while (--width
> len
);
5581 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5582 assert(pbuf
[0] == '0');
5583 assert(pbuf
[1] == c
);
5588 Py_UNICODE_COPY(res
, pbuf
, len
);
5591 while (--width
>= len
) {
5595 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5596 PyErr_SetString(PyExc_TypeError
,
5597 "not all arguments converted");
5603 if (argidx
< arglen
&& !dict
) {
5604 PyErr_SetString(PyExc_TypeError
,
5605 "not all arguments converted");
5613 if (_PyUnicode_Resize(&result
, reslen
- rescnt
))
5615 return (PyObject
*)result
;
5626 static PyBufferProcs unicode_as_buffer
= {
5627 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5628 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5629 (getsegcountproc
) unicode_buffer_getsegcount
,
5630 (getcharbufferproc
) unicode_buffer_getcharbuf
,
5633 staticforward PyObject
*
5634 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
5637 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5640 static char *kwlist
[] = {"string", "encoding", "errors", 0};
5641 char *encoding
= NULL
;
5642 char *errors
= NULL
;
5644 if (type
!= &PyUnicode_Type
)
5645 return unicode_subtype_new(type
, args
, kwds
);
5646 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
5647 kwlist
, &x
, &encoding
, &errors
))
5650 return (PyObject
*)_PyUnicode_New(0);
5651 if (encoding
== NULL
&& errors
== NULL
)
5652 return PyObject_Unicode(x
);
5654 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
5658 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5660 PyUnicodeObject
*tmp
, *pnew
;
5663 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
5664 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
5667 assert(PyUnicode_Check(tmp
));
5668 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
5671 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
5672 if (pnew
->str
== NULL
) {
5673 _Py_ForgetReference((PyObject
*)pnew
);
5677 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
5679 pnew
->hash
= tmp
->hash
;
5681 return (PyObject
*)pnew
;
5684 static char unicode_doc
[] =
5685 "unicode(string [, encoding[, errors]]) -> object\n\
5687 Create a new Unicode object from the given encoded string.\n\
5688 encoding defaults to the current default string encoding and \n\
5689 errors, defining the error handling, to 'strict'.";
5691 PyTypeObject PyUnicode_Type
= {
5692 PyObject_HEAD_INIT(&PyType_Type
)
5694 "unicode", /* tp_name */
5695 sizeof(PyUnicodeObject
), /* tp_size */
5696 0, /* tp_itemsize */
5698 (destructor
)unicode_dealloc
, /* tp_dealloc */
5702 (cmpfunc
) unicode_compare
, /* tp_compare */
5703 (reprfunc
) unicode_repr
, /* tp_repr */
5704 0, /* tp_as_number */
5705 &unicode_as_sequence
, /* tp_as_sequence */
5706 0, /* tp_as_mapping */
5707 (hashfunc
) unicode_hash
, /* tp_hash*/
5709 (reprfunc
) unicode_str
, /* tp_str */
5710 PyObject_GenericGetAttr
, /* tp_getattro */
5711 0, /* tp_setattro */
5712 &unicode_as_buffer
, /* tp_as_buffer */
5713 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_BASETYPE
, /* tp_flags */
5714 unicode_doc
, /* tp_doc */
5715 0, /* tp_traverse */
5717 0, /* tp_richcompare */
5718 0, /* tp_weaklistoffset */
5720 0, /* tp_iternext */
5721 unicode_methods
, /* tp_methods */
5726 0, /* tp_descr_get */
5727 0, /* tp_descr_set */
5728 0, /* tp_dictoffset */
5731 unicode_new
, /* tp_new */
5732 _PyObject_Del
, /* tp_free */
5735 /* Initialize the Unicode implementation */
5737 void _PyUnicode_Init(void)
5741 /* Init the implementation */
5742 unicode_freelist
= NULL
;
5743 unicode_freelist_size
= 0;
5744 unicode_empty
= _PyUnicode_New(0);
5745 strcpy(unicode_default_encoding
, "ascii");
5746 for (i
= 0; i
< 256; i
++)
5747 unicode_latin1
[i
] = NULL
;
5750 /* Finalize the Unicode implementation */
5753 _PyUnicode_Fini(void)
5758 Py_XDECREF(unicode_empty
);
5759 unicode_empty
= NULL
;
5761 for (i
= 0; i
< 256; i
++) {
5762 if (unicode_latin1
[i
]) {
5763 Py_DECREF(unicode_latin1
[i
]);
5764 unicode_latin1
[i
] = NULL
;
5768 for (u
= unicode_freelist
; u
!= NULL
;) {
5769 PyUnicodeObject
*v
= u
;
5770 u
= *(PyUnicodeObject
**)u
;
5773 Py_XDECREF(v
->defenc
);
5776 unicode_freelist
= NULL
;
5777 unicode_freelist_size
= 0;