3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Copyright (c) Corporation for National Research Initiatives.
9 --------------------------------------------------------------------
10 The original string type implementation is:
12 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
15 By obtaining, using, and/or copying this software and/or its
16 associated documentation, you agree that you have read, understood,
17 and will comply with the following terms and conditions:
19 Permission to use, copy, modify, and distribute this software and its
20 associated documentation for any purpose and without fee is hereby
21 granted, provided that the above copyright notice appears in all
22 copies, and that both that copyright notice and this permission notice
23 appear in supporting documentation, and that the name of Secret Labs
24 AB or the author not be used in advertising or publicity pertaining to
25 distribution of the software without specific, written prior
28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35 --------------------------------------------------------------------
41 #include "unicodeobject.h"
48 /* Limit for the Unicode object free list */
50 #define MAX_UNICODE_FREELIST_SIZE 1024
52 /* Limit for the Unicode object free list stay alive optimization.
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
58 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
59 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
60 malloc()-overhead) bytes of unused garbage.
62 Setting the limit to 0 effectively turns the feature off.
64 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
69 #define KEEPALIVE_SIZE_LIMIT 9
71 /* Endianness switches; defaults to little endian */
73 #ifdef WORDS_BIGENDIAN
74 # define BYTEORDER_IS_BIG_ENDIAN
76 # define BYTEORDER_IS_LITTLE_ENDIAN
79 /* --- Globals ------------------------------------------------------------
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
86 /* Free list for Unicode objects */
87 static PyUnicodeObject
*unicode_freelist
;
88 static int unicode_freelist_size
;
90 /* The empty Unicode object is shared to improve performance. */
91 static PyUnicodeObject
*unicode_empty
;
93 /* Single character Unicode strings in the Latin-1 range are being
95 static PyUnicodeObject
*unicode_latin1
[256];
/* Name of the encoding that is used and assumed whenever an API is
   handed NULL as its encoding argument.  _PyUnicode_Init() initializes
   the buffer.

   Do not touch this global directly: go through
   PyUnicode_GetDefaultEncoding() and PyUnicode_SetDefaultEncoding(). */
static char unicode_default_encoding[100];
107 PyUnicode_GetMax(void)
109 #ifdef Py_UNICODE_WIDE
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
118 /* --- Unicode Object ----------------------------------------------------- */
121 int unicode_resize(register PyUnicodeObject
*unicode
,
126 /* Shortcut if there's nothing much to do. */
127 if (unicode
->length
== length
)
130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
133 if (unicode
== unicode_empty
||
134 (unicode
->length
== 1 &&
135 unicode
->str
[0] < 256 &&
136 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
137 PyErr_SetString(PyExc_SystemError
,
138 "can't resize shared unicode objects");
142 /* We allocate one extra Py_UNICODE element (not just one byte) so that
143 the string is always U+0000 terminated -- XXX is this needed ? */
144 oldstr
= unicode
->str
;
145 PyMem_RESIZE(unicode
->str
, Py_UNICODE
, length
+ 1);
147 unicode
->str
= oldstr
;
151 unicode
->str
[length
] = 0;
152 unicode
->length
= length
;
155 /* Reset the object caches */
156 if (unicode
->defenc
) {
157 Py_DECREF(unicode
->defenc
);
158 unicode
->defenc
= NULL
;
165 /* We allocate one extra Py_UNICODE element (not just one byte) so that
166 the string is always U+0000 terminated -- XXX is this needed ?
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
174 PyUnicodeObject
*_PyUnicode_New(int length
)
176 register PyUnicodeObject
*unicode
;
178 /* Optimization for empty strings */
179 if (length
== 0 && unicode_empty
!= NULL
) {
180 Py_INCREF(unicode_empty
);
181 return unicode_empty
;
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist
) {
186 unicode
= unicode_freelist
;
187 unicode_freelist
= *(PyUnicodeObject
**)unicode
;
188 unicode_freelist_size
--;
190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode
->length
< length
) &&
193 unicode_resize(unicode
, length
)) {
194 PyMem_DEL(unicode
->str
);
199 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
201 PyObject_INIT(unicode
, &PyUnicode_Type
);
204 unicode
= PyObject_NEW(PyUnicodeObject
, &PyUnicode_Type
);
207 unicode
->str
= PyMem_NEW(Py_UNICODE
, length
+ 1);
214 unicode
->str
[length
] = 0;
215 unicode
->length
= length
;
217 unicode
->defenc
= NULL
;
221 _Py_ForgetReference((PyObject
*)unicode
);
222 PyObject_DEL(unicode
);
227 void unicode_dealloc(register PyUnicodeObject
*unicode
)
229 if (PyUnicode_CheckExact(unicode
) &&
230 unicode_freelist_size
< MAX_UNICODE_FREELIST_SIZE
) {
231 /* Keep-Alive optimization */
232 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
233 PyMem_DEL(unicode
->str
);
237 if (unicode
->defenc
) {
238 Py_DECREF(unicode
->defenc
);
239 unicode
->defenc
= NULL
;
241 /* Add to free list */
242 *(PyUnicodeObject
**)unicode
= unicode_freelist
;
243 unicode_freelist
= unicode
;
244 unicode_freelist_size
++;
247 PyMem_DEL(unicode
->str
);
248 Py_XDECREF(unicode
->defenc
);
249 unicode
->ob_type
->tp_free((PyObject
*)unicode
);
253 int PyUnicode_Resize(PyObject
**unicode
,
256 register PyUnicodeObject
*v
;
258 /* Argument checks */
259 if (unicode
== NULL
) {
260 PyErr_BadInternalCall();
263 v
= (PyUnicodeObject
*)*unicode
;
264 if (v
== NULL
|| !PyUnicode_Check(v
) || v
->ob_refcnt
!= 1) {
265 PyErr_BadInternalCall();
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v
->length
!= length
&&
273 (v
== unicode_empty
|| v
->length
== 1)) {
274 PyUnicodeObject
*w
= _PyUnicode_New(length
);
277 Py_UNICODE_COPY(w
->str
, v
->str
,
278 length
< v
->length
? length
: v
->length
);
279 *unicode
= (PyObject
*)w
;
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v
, length
);
288 /* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper around PyUnicode_Resize() for callers that hold a
   PyUnicodeObject* variable: pass the ADDRESS of that variable and it is
   cast to the PyObject** the public API expects.  Yields whatever
   PyUnicode_Resize() returns. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
292 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
295 PyUnicodeObject
*unicode
;
297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
301 /* Optimization for empty strings */
302 if (size
== 0 && unicode_empty
!= NULL
) {
303 Py_INCREF(unicode_empty
);
304 return (PyObject
*)unicode_empty
;
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size
== 1 && *u
< 256) {
310 unicode
= unicode_latin1
[*u
];
312 unicode
= _PyUnicode_New(1);
315 unicode
->str
[0] = *u
;
316 unicode_latin1
[*u
] = unicode
;
319 return (PyObject
*)unicode
;
323 unicode
= _PyUnicode_New(size
);
327 /* Copy the Unicode data into the new object */
329 Py_UNICODE_COPY(unicode
->str
, u
, size
);
331 return (PyObject
*)unicode
;
336 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
339 PyUnicodeObject
*unicode
;
342 PyErr_BadInternalCall();
346 unicode
= _PyUnicode_New(size
);
350 /* Copy the wchar_t data into the new object */
351 #ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
355 register Py_UNICODE
*u
;
357 u
= PyUnicode_AS_UNICODE(unicode
);
358 for (i
= size
; i
>= 0; i
--)
363 return (PyObject
*)unicode
;
366 int PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
370 if (unicode
== NULL
) {
371 PyErr_BadInternalCall();
374 if (size
> PyUnicode_GET_SIZE(unicode
))
375 size
= PyUnicode_GET_SIZE(unicode
);
376 #ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
380 register Py_UNICODE
*u
;
382 u
= PyUnicode_AS_UNICODE(unicode
);
383 for (i
= size
; i
>= 0; i
--)
393 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj
)) {
401 if (PyUnicode_Check(obj
)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
405 PyUnicode_GET_SIZE(obj
));
407 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
410 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
411 const char *encoding
,
414 const char *s
= NULL
;
420 PyErr_BadInternalCall();
425 /* For b/w compatibility we also accept Unicode objects provided
426 that no encoding is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
430 NOTE: This API should really only be used for objects which
431 represent *encoded* Unicode !
434 if (PyUnicode_Check(obj
)) {
436 PyErr_SetString(PyExc_TypeError
,
437 "decoding Unicode is not supported");
440 return PyObject_Unicode(obj
);
443 if (PyUnicode_Check(obj
)) {
444 PyErr_SetString(PyExc_TypeError
,
445 "decoding Unicode is not supported");
451 if (PyString_Check(obj
)) {
452 s
= PyString_AS_STRING(obj
);
453 len
= PyString_GET_SIZE(obj
);
455 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError
))
459 PyErr_Format(PyExc_TypeError
,
460 "coercing to Unicode: need string or buffer, "
462 obj
->ob_type
->tp_name
);
466 /* Convert to Unicode */
468 Py_INCREF(unicode_empty
);
469 v
= (PyObject
*)unicode_empty
;
472 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
486 PyObject
*PyUnicode_Decode(const char *s
,
488 const char *encoding
,
491 PyObject
*buffer
= NULL
, *unicode
;
493 if (encoding
== NULL
)
494 encoding
= PyUnicode_GetDefaultEncoding();
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding
, "utf-8") == 0)
498 return PyUnicode_DecodeUTF8(s
, size
, errors
);
499 else if (strcmp(encoding
, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s
, size
, errors
);
501 else if (strcmp(encoding
, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s
, size
, errors
);
504 /* Decode via the codec registry */
505 buffer
= PyBuffer_FromMemory((void *)s
, size
);
508 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
511 if (!PyUnicode_Check(unicode
)) {
512 PyErr_Format(PyExc_TypeError
,
513 "decoder did not return an unicode object (type=%.400s)",
514 unicode
->ob_type
->tp_name
);
526 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
528 const char *encoding
,
531 PyObject
*v
, *unicode
;
533 unicode
= PyUnicode_FromUnicode(s
, size
);
536 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
541 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
542 const char *encoding
,
547 if (!PyUnicode_Check(unicode
)) {
552 if (encoding
== NULL
)
553 encoding
= PyUnicode_GetDefaultEncoding();
555 /* Shortcuts for common default encodings */
556 if (errors
== NULL
) {
557 if (strcmp(encoding
, "utf-8") == 0)
558 return PyUnicode_AsUTF8String(unicode
);
559 else if (strcmp(encoding
, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode
);
561 else if (strcmp(encoding
, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode
);
565 /* Encode via the codec registry */
566 v
= PyCodec_Encode(unicode
, encoding
, errors
);
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v
)) {
571 PyErr_Format(PyExc_TypeError
,
572 "encoder did not return a string object (type=%.400s)",
573 v
->ob_type
->tp_name
);
583 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
586 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
590 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
591 if (v
&& errors
== NULL
)
592 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
596 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
598 if (!PyUnicode_Check(unicode
)) {
602 return PyUnicode_AS_UNICODE(unicode
);
608 int PyUnicode_GetSize(PyObject
*unicode
)
610 if (!PyUnicode_Check(unicode
)) {
614 return PyUnicode_GET_SIZE(unicode
);
620 const char *PyUnicode_GetDefaultEncoding(void)
622 return unicode_default_encoding
;
625 int PyUnicode_SetDefaultEncoding(const char *encoding
)
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v
= _PyCodec_Lookup(encoding
);
635 strncpy(unicode_default_encoding
,
637 sizeof(unicode_default_encoding
));
644 /* --- UTF-7 Codec -------------------------------------------------------- */
646 /* see RFC2152 for details */
649 char utf7_special
[128] = {
650 /* indicate whether a UTF-7 character is special i.e. cannot be directly
654 2 - whitespace (optional)
655 3 - RFC2152 Set O (optional) */
656 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
657 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
658 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
659 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
660 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
661 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
662 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
663 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
/* SPECIAL(c, encodeO, encodeWS): non-zero when code unit c cannot be
   written literally in UTF-7 output and must go through a base64 shift
   sequence.

   c        -- code unit to classify.  Anything above 127 is always
               special; otherwise c is used as an index into
               utf7_special[], so it must not be negative.
   encodeO  -- when true, characters of class 3 (RFC 2152 Set O) are
               treated as special as well.
   encodeWS -- when true, characters of class 2 (whitespace) are treated
               as special as well.

   Characters of class 1 are special regardless of the flags.

   The flag arguments are parenthesized so that compound expressions
   (e.g. `a || b`) group as the caller intended.  c is evaluated more
   than once, so pass an expression without side effects. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || utf7_special[(c)] == 1 || \
     ((encodeWS) && utf7_special[(c)] == 2) || \
     ((encodeO) && utf7_special[(c)] == 3))
/* Base64 helpers for the UTF-7 codec (alphabet: A-Z a-z 0-9 + /).

   B64(n)     -- alphabet character for the low six bits of n.
   B64CHAR(c) -- non-zero iff c is one of the 64 alphabet characters.
   UB64(c)    -- six-bit value of alphabet character c; the result is
                 only meaningful when B64CHAR(c) holds.

   B64CHAR deliberately does not use isalnum(): that function depends on
   the current locale and is undefined for arguments that are neither EOF
   nor representable as unsigned char, while this codec passes Py_UNICODE
   values.  The explicit range tests assume an ASCII-compatible execution
   character set, as the arithmetic in UB64 already does.

   B64CHAR and UB64 evaluate their argument more than once; pass an
   expression without side effects. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
#define UB64(c) \
    ((c) == '+' ? 62 : \
     (c) == '/' ? 63 : \
     (c) >= 'a' ? (c) - 71 : \
     (c) >= 'A' ? (c) - 65 : \
     (c) + 4)
677 #define ENCODE(out, ch, bits) \
678 while (bits >= 6) { \
679 *out++ = B64(ch >> (bits-6)); \
683 #define DECODE(out, ch, bits, surrogate) \
684 while (bits >= 16) { \
685 Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
688 /* We have already generated an error for the high surrogate
689 so let's not bother seeing if the low surrogate is correct or not */\
691 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
692 /* This is a surrogate pair. Unfortunately we can't represent \
693 it in a 16-bit character */ \
695 errmsg = "code pairs are not supported"; \
703 int utf7_decoding_error(Py_UNICODE
**dest
,
707 if ((errors
== NULL
) ||
708 (strcmp(errors
,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError
,
710 "UTF-7 decoding error: %.400s",
714 else if (strcmp(errors
,"ignore") == 0) {
717 else if (strcmp(errors
,"replace") == 0) {
719 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
725 PyErr_Format(PyExc_ValueError
,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
732 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
737 PyUnicodeObject
*unicode
;
739 const char *errmsg
= "";
741 unsigned int bitsleft
= 0;
742 unsigned long charsleft
= 0;
745 unicode
= _PyUnicode_New(size
);
749 return (PyObject
*)unicode
;
758 if ((ch
== '-') || !B64CHAR(ch
)) {
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
768 errmsg
= "partial character in shift sequence";
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft
&& charsleft
<< (sizeof(charsleft
) * 8 - bitsleft
)) {
777 errmsg
= "non-zero padding bits in shift sequence";
782 if ((s
< e
) && (*(s
) == '-')) {
786 } else if (SPECIAL(ch
,0,0)) {
787 errmsg
= "unexpected special character";
793 charsleft
= (charsleft
<< 6) | UB64(ch
);
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p
, charsleft
, bitsleft
, surrogate
);
799 else if ( ch
== '+' ) {
801 if (s
< e
&& *s
== '-') {
810 else if (SPECIAL(ch
,0,0)) {
811 errmsg
= "unexpected special character";
821 if (utf7_decoding_error(&p
, errors
, errmsg
))
826 if (utf7_decoding_error(&p
, errors
, "unterminated shift sequence"))
830 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
833 return (PyObject
*)unicode
;
841 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
844 int encodeWhiteSpace
,
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated
= 5 * size
;
852 unsigned int bitsleft
= 0;
853 unsigned long charsleft
= 0;
858 return PyString_FromStringAndSize(NULL
, 0);
860 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
864 start
= out
= PyString_AS_STRING(v
);
865 for (;i
< size
; ++i
) {
866 Py_UNICODE ch
= s
[i
];
872 } else if (SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
876 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
877 inShift
= bitsleft
> 0;
882 if (!SPECIAL(ch
, encodeSetO
, encodeWhiteSpace
)) {
883 *out
++ = B64(charsleft
<< (6-bitsleft
));
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch
) || ch
== '-') {
895 charsleft
= (charsleft
<< 16) | ch
;
896 /* out, charsleft, bitsleft = */ ENCODE(out
, charsleft
, bitsleft
);
898 /* If the next character is special then we don't need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
905 Py_UNICODE ch2
= s
[i
+1];
907 if (SPECIAL(ch2
, encodeSetO
, encodeWhiteSpace
)) {
909 } else if (B64CHAR(ch2
) || ch2
== '-') {
926 *out
++= B64(charsleft
<< (6-bitsleft
) );
930 if (_PyString_Resize(&v
, out
- start
)) {
944 /* --- UTF-8 Codec -------------------------------------------------------- */
947 char utf8_code_length
[256] = {
948 /* Map UTF-8 encoded prefix byte to sequence length. zero means
949 illegal prefix. see RFC 2279 for details */
950 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
951 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
952 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
953 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
954 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
955 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
956 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
957 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
958 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
959 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
960 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
961 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
962 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
963 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
964 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
965 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
969 int utf8_decoding_error(const char **source
,
974 if ((errors
== NULL
) ||
975 (strcmp(errors
,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError
,
977 "UTF-8 decoding error: %.400s",
981 else if (strcmp(errors
,"ignore") == 0) {
985 else if (strcmp(errors
,"replace") == 0) {
987 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
992 PyErr_Format(PyExc_ValueError
,
993 "UTF-8 decoding error; unknown error handling code: %.400s",
999 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1005 PyUnicodeObject
*unicode
;
1007 const char *errmsg
= "";
1009 /* Note: size will always be longer than the resulting Unicode
1011 unicode
= _PyUnicode_New(size
);
1015 return (PyObject
*)unicode
;
1017 /* Unpack UTF-8 encoded data */
1022 Py_UCS4 ch
= (unsigned char)*s
;
1025 *p
++ = (Py_UNICODE
)ch
;
1030 n
= utf8_code_length
[ch
];
1033 errmsg
= "unexpected end of data";
1040 errmsg
= "unexpected code byte";
1044 errmsg
= "internal error";
1048 if ((s
[1] & 0xc0) != 0x80) {
1049 errmsg
= "invalid data";
1052 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1054 errmsg
= "illegal encoding";
1058 *p
++ = (Py_UNICODE
)ch
;
1062 if ((s
[1] & 0xc0) != 0x80 ||
1063 (s
[2] & 0xc0) != 0x80) {
1064 errmsg
= "invalid data";
1067 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1068 if (ch
< 0x800 || (ch
>= 0xd800 && ch
< 0xe000)) {
1069 errmsg
= "illegal encoding";
1073 *p
++ = (Py_UNICODE
)ch
;
1077 if ((s
[1] & 0xc0) != 0x80 ||
1078 (s
[2] & 0xc0) != 0x80 ||
1079 (s
[3] & 0xc0) != 0x80) {
1080 errmsg
= "invalid data";
1083 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
1084 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
1085 /* validate and convert to UTF-16 */
1086 if ((ch
< 0x10000) /* minimum value allowed for 4
1088 || (ch
> 0x10ffff)) /* maximum value allowed for
1091 errmsg
= "illegal encoding";
1094 #ifdef Py_UNICODE_WIDE
1095 *p
++ = (Py_UNICODE
)ch
;
1097 /* compute and append the two surrogates: */
1099 /* translate from 10000..10FFFF to 0..FFFF */
1102 /* high surrogate = top 10 bits added to D800 */
1103 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
1105 /* low surrogate = bottom 10 bits added to DC00 */
1106 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
1111 /* Other sizes are only needed for UCS-4 */
1112 errmsg
= "unsupported Unicode code range";
1119 if (utf8_decoding_error(&s
, &p
, errors
, errmsg
))
1124 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1127 return (PyObject
*)unicode
;
1134 /* Not used anymore, now that the encoder supports UTF-16
1138 int utf8_encoding_error(const Py_UNICODE
**source
,
1141 const char *details
)
1143 if ((errors
== NULL
) ||
1144 (strcmp(errors
,"strict") == 0)) {
1145 PyErr_Format(PyExc_UnicodeError
,
1146 "UTF-8 encoding error: %.400s",
1150 else if (strcmp(errors
,"ignore") == 0) {
1153 else if (strcmp(errors
,"replace") == 0) {
1159 PyErr_Format(PyExc_ValueError
,
1160 "UTF-8 encoding error; "
1161 "unknown error handling code: %.400s",
1168 PyObject
*PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
1176 unsigned int cbAllocated
= 3 * size
;
1179 v
= PyString_FromStringAndSize(NULL
, cbAllocated
);
1185 p
= q
= PyString_AS_STRING(v
);
1187 Py_UCS4 ch
= s
[i
++];
1191 else if (ch
< 0x0800) {
1192 *p
++ = 0xc0 | (ch
>> 6);
1193 *p
++ = 0x80 | (ch
& 0x3f);
1196 else if (ch
< 0x10000) {
1197 /* Check for high surrogate */
1198 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1201 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1203 if ((Py_uintptr_t
)(p
- q
) >= (cbAllocated
- 4)) {
1204 /* Provide enough room for some more
1206 cbAllocated
+= 4*10;
1207 if (_PyString_Resize(&v
, cbAllocated
))
1209 p
= PyString_AS_STRING(v
) + (p
- q
);
1210 q
= PyString_AS_STRING(v
);
1213 /* combine the two values */
1214 ch
= ((ch
- 0xD800)<<10 | (ch2
-0xDC00))+0x10000;
1216 *p
++ = (char)((ch
>> 18) | 0xf0);
1217 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
1223 *p
++ = (char)(0xe0 | (ch
>> 12));
1225 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
1226 *p
++ = (char)(0x80 | (ch
& 0x3f));
1229 if ((Py_uintptr_t
)(p
- q
) >= (cbAllocated
- 4)) {
1230 /* Provide enough room for some more
1232 cbAllocated
+= 4*10;
1233 if (_PyString_Resize(&v
, cbAllocated
))
1235 p
= PyString_AS_STRING(v
) + (p
- q
);
1236 q
= PyString_AS_STRING(v
);
1239 *p
++ = 0xf0 | (ch
>>18);
1240 *p
++ = 0x80 | ((ch
>>12) & 0x3f);
1241 *p
++ = 0x80 | ((ch
>>6) & 0x3f);
1242 *p
++ = 0x80 | (ch
& 0x3f);
1246 if (_PyString_Resize(&v
, p
- q
))
1255 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
1257 if (!PyUnicode_Check(unicode
)) {
1258 PyErr_BadArgument();
1261 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
1262 PyUnicode_GET_SIZE(unicode
),
1266 /* --- UTF-16 Codec ------------------------------------------------------- */
1269 int utf16_decoding_error(Py_UNICODE
**dest
,
1271 const char *details
)
1273 if ((errors
== NULL
) ||
1274 (strcmp(errors
,"strict") == 0)) {
1275 PyErr_Format(PyExc_UnicodeError
,
1276 "UTF-16 decoding error: %.400s",
1280 else if (strcmp(errors
,"ignore") == 0) {
1283 else if (strcmp(errors
,"replace") == 0) {
1285 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1291 PyErr_Format(PyExc_ValueError
,
1292 "UTF-16 decoding error; "
1293 "unknown error handling code: %.400s",
1300 PyUnicode_DecodeUTF16(const char *s
,
1305 PyUnicodeObject
*unicode
;
1307 const unsigned char *q
, *e
;
1308 int bo
= 0; /* assume native ordering by default */
1309 const char *errmsg
= "";
1310 /* Offsets from q for retrieving byte pairs in the right order. */
1311 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1312 int ihi
= 1, ilo
= 0;
1314 int ihi
= 0, ilo
= 1;
1317 /* size should be an even number */
1319 if (utf16_decoding_error(NULL
, errors
, "truncated data"))
1321 --size
; /* else ignore the oddball byte */
1324 /* Note: size will always be longer than the resulting Unicode
1326 unicode
= _PyUnicode_New(size
);
1330 return (PyObject
*)unicode
;
1332 /* Unpack UTF-16 encoded data */
1334 q
= (unsigned char *)s
;
1340 /* Check for BOM marks (U+FEFF) in the input and adjust current
1341 byte order setting accordingly. In native mode, the leading BOM
1342 mark is skipped, in all other modes, it is copied to the output
1343 stream as-is (giving a ZWNBSP character). */
1345 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
1346 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1347 if (bom
== 0xFEFF) {
1351 else if (bom
== 0xFFFE) {
1356 if (bom
== 0xFEFF) {
1360 else if (bom
== 0xFFFE) {
1379 Py_UNICODE ch
= (q
[ihi
] << 8) | q
[ilo
];
1382 if (ch
< 0xD800 || ch
> 0xDFFF) {
1387 /* UTF-16 code pair: */
1389 errmsg
= "unexpected end of data";
1392 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
1393 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
1395 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
1396 #ifndef Py_UNICODE_WIDE
1400 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
1405 errmsg
= "illegal UTF-16 surrogate";
1410 errmsg
= "illegal encoding";
1411 /* Fall through to report the error */
1414 if (utf16_decoding_error(&p
, errors
, errmsg
))
1422 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
))
1425 return (PyObject
*)unicode
;
1433 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
1441 /* Offsets from p for storing byte pairs in the right order. */
1442 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1443 int ihi
= 1, ilo
= 0;
1445 int ihi
= 0, ilo
= 1;
1448 #define STORECHAR(CH) \
1450 p[ihi] = ((CH) >> 8) & 0xff; \
1451 p[ilo] = (CH) & 0xff; \
1455 for (i
= pairs
= 0; i
< size
; i
++)
1456 if (s
[i
] >= 0x10000)
1458 v
= PyString_FromStringAndSize(NULL
,
1459 2 * (size
+ pairs
+ (byteorder
== 0)));
1463 p
= (unsigned char *)PyString_AS_STRING(v
);
1469 if (byteorder
== -1) {
1474 else if (byteorder
== 1) {
1480 while (size
-- > 0) {
1481 Py_UNICODE ch
= *s
++;
1483 if (ch
>= 0x10000) {
1484 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1485 ch
= 0xD800 | ((ch
-0x10000) >> 10);
1495 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
1497 if (!PyUnicode_Check(unicode
)) {
1498 PyErr_BadArgument();
1501 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
1502 PyUnicode_GET_SIZE(unicode
),
1507 /* --- Unicode Escape Codec ----------------------------------------------- */
1510 int unicodeescape_decoding_error(Py_UNICODE
**x
,
1512 const char *details
)
1514 if ((errors
== NULL
) ||
1515 (strcmp(errors
,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError
,
1517 "Unicode-Escape decoding error: %.400s",
1521 else if (strcmp(errors
,"ignore") == 0) {
1524 else if (strcmp(errors
,"replace") == 0) {
1525 **x
= Py_UNICODE_REPLACEMENT_CHARACTER
;
1530 PyErr_Format(PyExc_ValueError
,
1531 "Unicode-Escape decoding error; "
1532 "unknown error handling code: %.400s",
1538 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
1540 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
1545 Py_UNICODE
*p
, *buf
;
1548 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
1550 /* Escaped strings will always be longer than the resulting
1551 Unicode string, so we start with size here and then reduce the
1552 length after conversion to the true value. */
1553 v
= _PyUnicode_New(size
);
1557 return (PyObject
*)v
;
1559 p
= buf
= PyUnicode_AS_UNICODE(v
);
1567 /* Non-escape characters are interpreted as Unicode ordinals */
1569 *p
++ = (unsigned char) *s
++;
1579 case '\\': *p
++ = '\\'; break;
1580 case '\'': *p
++ = '\''; break;
1581 case '\"': *p
++ = '\"'; break;
1582 case 'b': *p
++ = '\b'; break;
1583 case 'f': *p
++ = '\014'; break; /* FF */
1584 case 't': *p
++ = '\t'; break;
1585 case 'n': *p
++ = '\n'; break;
1586 case 'r': *p
++ = '\r'; break;
1587 case 'v': *p
++ = '\013'; break; /* VT */
1588 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
1590 /* \OOO (octal) escapes */
1591 case '0': case '1': case '2': case '3':
1592 case '4': case '5': case '6': case '7':
1594 if ('0' <= *s
&& *s
<= '7') {
1595 x
= (x
<<3) + *s
++ - '0';
1596 if ('0' <= *s
&& *s
<= '7')
1597 x
= (x
<<3) + *s
++ - '0';
1606 message
= "truncated \\xXX escape";
1612 message
= "truncated \\uXXXX escape";
1618 message
= "truncated \\UXXXXXXXX escape";
1621 for (i
= 0; i
< digits
; i
++) {
1622 c
= (unsigned char) s
[i
];
1624 if (unicodeescape_decoding_error(&p
, errors
, message
))
1630 chr
= (chr
<<4) & ~0xF;
1631 if (c
>= '0' && c
<= '9')
1633 else if (c
>= 'a' && c
<= 'f')
1634 chr
+= 10 + c
- 'a';
1636 chr
+= 10 + c
- 'A';
1639 if (chr
== 0xffffffff)
1640 /* _decoding_error will have already written into the
1644 /* when we get here, chr is a 32-bit unicode character */
1646 /* UCS-2 character */
1647 *p
++ = (Py_UNICODE
) chr
;
1648 else if (chr
<= 0x10ffff) {
1649 /* UCS-4 character. Either store directly, or as
1651 #ifdef Py_UNICODE_WIDE
1655 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
1656 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
1659 if (unicodeescape_decoding_error(
1661 "illegal Unicode character")
1669 message
= "malformed \\N character escape";
1670 if (ucnhash_CAPI
== NULL
) {
1671 /* load the unicode data module */
1673 m
= PyImport_ImportModule("unicodedata");
1676 v
= PyObject_GetAttrString(m
, "ucnhash_CAPI");
1680 ucnhash_CAPI
= PyCObject_AsVoidPtr(v
);
1682 if (ucnhash_CAPI
== NULL
)
1686 const char *start
= s
+1;
1687 /* look for the closing brace */
1688 while (*s
!= '}' && s
< end
)
1690 if (s
> start
&& s
< end
&& *s
== '}') {
1691 /* found a name. look it up in the unicode database */
1692 message
= "unknown Unicode character name";
1694 if (ucnhash_CAPI
->getcode(start
, s
-start
-1, &chr
))
1698 if (unicodeescape_decoding_error(&p
, errors
, message
))
1704 if (unicodeescape_decoding_error(&p
, errors
, "\\ at end of string"))
1709 *p
++ = (unsigned char)s
[-1];
1714 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1716 return (PyObject
*)v
;
1721 "\\N escapes not supported (can't load unicodedata module)"
1730 /* Return a Unicode-Escape string version of the Unicode object.
1732 If quotes is true, the string is enclosed in u"" or u'' quotes as
1737 static const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
1742 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
1749 static const char *hexdigit
= "0123456789abcdef";
1751 repr
= PyString_FromStringAndSize(NULL
, 2 + 6*size
+ 1);
1755 p
= PyString_AS_STRING(repr
);
1759 *p
++ = (findchar(s
, size
, '\'') &&
1760 !findchar(s
, size
, '"')) ? '"' : '\'';
1762 while (size
-- > 0) {
1763 Py_UNICODE ch
= *s
++;
1767 (ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1] || ch
== '\\')) {
1773 #ifdef Py_UNICODE_WIDE
1774 /* Map 21-bit characters to '\U00xxxxxx' */
1775 else if (ch
>= 0x10000) {
1776 int offset
= p
- PyString_AS_STRING(repr
);
1778 /* Resize the string if necessary */
1779 if (offset
+ 12 > PyString_GET_SIZE(repr
)) {
1780 if (_PyString_Resize(&repr
, PyString_GET_SIZE(repr
) + 100))
1782 p
= PyString_AS_STRING(repr
) + offset
;
1787 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
1788 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
1789 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
1790 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
1791 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
1792 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
1793 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
1794 *p
++ = hexdigit
[ch
& 0x0000000F];
1798 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1799 else if (ch
>= 0xD800 && ch
< 0xDC00) {
1805 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
1806 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
1809 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
1810 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
1811 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
1812 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
1813 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
1814 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
1815 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
1816 *p
++ = hexdigit
[ucs
& 0x0000000F];
1819 /* Fall through: isolated surrogates are copied as-is */
1824 /* Map 16-bit characters to '\uxxxx' */
1828 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
1829 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
1830 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1831 *p
++ = hexdigit
[ch
& 0x000F];
1834 /* Map special whitespace to '\t', '\n', '\r' */
1835 else if (ch
== '\t') {
1839 else if (ch
== '\n') {
1843 else if (ch
== '\r') {
1848 /* Map non-printable US ASCII to '\xhh' */
1849 else if (ch
< ' ' || ch
>= 0x7F) {
1852 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
1853 *p
++ = hexdigit
[ch
& 0x000F];
1856 /* Copy everything else as-is */
1861 *p
++ = PyString_AS_STRING(repr
)[1];
1864 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
1874 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
1877 return unicodeescape_string(s
, size
, 0);
1880 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
1882 if (!PyUnicode_Check(unicode
)) {
1883 PyErr_BadArgument();
1886 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
1887 PyUnicode_GET_SIZE(unicode
));
1890 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1892 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
1897 Py_UNICODE
*p
, *buf
;
1901 /* Escaped strings will always be longer than the resulting
1902 Unicode string, so we start with size here and then reduce the
1903 length after conversion to the true value. */
1904 v
= _PyUnicode_New(size
);
1908 return (PyObject
*)v
;
1909 p
= buf
= PyUnicode_AS_UNICODE(v
);
1916 /* Non-escape characters are interpreted as Unicode ordinals */
1918 *p
++ = (unsigned char)*s
++;
1922 /* \u-escapes are only interpreted iff the number of leading
1923 backslashes is odd */
1928 *p
++ = (unsigned char)*s
++;
1930 if (((s
- bs
) & 1) == 0 ||
1938 /* \uXXXX with 4 hex digits */
1939 for (x
= 0, i
= 0; i
< 4; i
++) {
1940 c
= (unsigned char)s
[i
];
1942 if (unicodeescape_decoding_error(&p
, errors
,
1943 "truncated \\uXXXX"))
1950 if (c
>= '0' && c
<= '9')
1952 else if (c
>= 'a' && c
<= 'f')
1958 if (x
!= 0xffffffff)
1961 if (_PyUnicode_Resize(&v
, (int)(p
- buf
)))
1963 return (PyObject
*)v
;
1970 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
1977 static const char *hexdigit
= "0123456789abcdef";
1979 repr
= PyString_FromStringAndSize(NULL
, 6 * size
);
1985 p
= q
= PyString_AS_STRING(repr
);
1986 while (size
-- > 0) {
1987 Py_UNICODE ch
= *s
++;
1988 /* Map 16-bit characters to '\uxxxx' */
1992 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
1993 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
1994 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
1995 *p
++ = hexdigit
[ch
& 15];
1997 /* Copy everything else as-is */
2002 if (_PyString_Resize(&repr
, p
- q
))
2012 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
2014 if (!PyUnicode_Check(unicode
)) {
2015 PyErr_BadArgument();
2018 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
2019 PyUnicode_GET_SIZE(unicode
));
2022 /* --- Latin-1 Codec ------------------------------------------------------ */
2024 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
2031 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2032 if (size
== 1 && *(unsigned char*)s
< 256) {
2033 Py_UNICODE r
= *(unsigned char*)s
;
2034 return PyUnicode_FromUnicode(&r
, 1);
2037 v
= _PyUnicode_New(size
);
2041 return (PyObject
*)v
;
2042 p
= PyUnicode_AS_UNICODE(v
);
2044 *p
++ = (unsigned char)*s
++;
2045 return (PyObject
*)v
;
2053 int latin1_encoding_error(const Py_UNICODE
**source
,
2056 const char *details
)
2058 if ((errors
== NULL
) ||
2059 (strcmp(errors
,"strict") == 0)) {
2060 PyErr_Format(PyExc_UnicodeError
,
2061 "Latin-1 encoding error: %.400s",
2065 else if (strcmp(errors
,"ignore") == 0) {
2068 else if (strcmp(errors
,"replace") == 0) {
2074 PyErr_Format(PyExc_ValueError
,
2075 "Latin-1 encoding error; "
2076 "unknown error handling code: %.400s",
2082 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
2089 repr
= PyString_FromStringAndSize(NULL
, size
);
2095 s
= PyString_AS_STRING(repr
);
2097 while (size
-- > 0) {
2098 Py_UNICODE ch
= *p
++;
2100 if (latin1_encoding_error(&p
, &s
, errors
,
2101 "ordinal not in range(256)"))
2107 /* Resize if error handling skipped some characters */
2108 if (s
- start
< PyString_GET_SIZE(repr
))
2109 if (_PyString_Resize(&repr
, s
- start
))
2118 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
2120 if (!PyUnicode_Check(unicode
)) {
2121 PyErr_BadArgument();
2124 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
2125 PyUnicode_GET_SIZE(unicode
),
2129 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2132 int ascii_decoding_error(const char **source
,
2135 const char *details
)
2137 if ((errors
== NULL
) ||
2138 (strcmp(errors
,"strict") == 0)) {
2139 PyErr_Format(PyExc_UnicodeError
,
2140 "ASCII decoding error: %.400s",
2144 else if (strcmp(errors
,"ignore") == 0) {
2147 else if (strcmp(errors
,"replace") == 0) {
2148 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2153 PyErr_Format(PyExc_ValueError
,
2154 "ASCII decoding error; "
2155 "unknown error handling code: %.400s",
2161 PyObject
*PyUnicode_DecodeASCII(const char *s
,
2168 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2169 if (size
== 1 && *(unsigned char*)s
< 128) {
2170 Py_UNICODE r
= *(unsigned char*)s
;
2171 return PyUnicode_FromUnicode(&r
, 1);
2174 v
= _PyUnicode_New(size
);
2178 return (PyObject
*)v
;
2179 p
= PyUnicode_AS_UNICODE(v
);
2180 while (size
-- > 0) {
2181 register unsigned char c
;
2183 c
= (unsigned char)*s
++;
2186 else if (ascii_decoding_error(&s
, &p
, errors
,
2187 "ordinal not in range(128)"))
2190 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
2191 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2193 return (PyObject
*)v
;
2201 int ascii_encoding_error(const Py_UNICODE
**source
,
2204 const char *details
)
2206 if ((errors
== NULL
) ||
2207 (strcmp(errors
,"strict") == 0)) {
2208 PyErr_Format(PyExc_UnicodeError
,
2209 "ASCII encoding error: %.400s",
2213 else if (strcmp(errors
,"ignore") == 0) {
2216 else if (strcmp(errors
,"replace") == 0) {
2222 PyErr_Format(PyExc_ValueError
,
2223 "ASCII encoding error; "
2224 "unknown error handling code: %.400s",
2230 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
2237 repr
= PyString_FromStringAndSize(NULL
, size
);
2243 s
= PyString_AS_STRING(repr
);
2245 while (size
-- > 0) {
2246 Py_UNICODE ch
= *p
++;
2248 if (ascii_encoding_error(&p
, &s
, errors
,
2249 "ordinal not in range(128)"))
2255 /* Resize if error handling skipped some characters */
2256 if (s
- start
< PyString_GET_SIZE(repr
))
2257 if (_PyString_Resize(&repr
, s
- start
))
2266 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
2268 if (!PyUnicode_Check(unicode
)) {
2269 PyErr_BadArgument();
2272 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
2273 PyUnicode_GET_SIZE(unicode
),
2277 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2279 /* --- MBCS codecs for Windows -------------------------------------------- */
2281 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
2288 /* First get the size of the result */
2289 DWORD usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
2290 if (size
> 0 && usize
==0)
2291 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2293 v
= _PyUnicode_New(usize
);
2297 return (PyObject
*)v
;
2298 p
= PyUnicode_AS_UNICODE(v
);
2299 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
2301 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2304 return (PyObject
*)v
;
2307 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
2315 /* If there are no characters, bail now! */
2317 return PyString_FromString("");
2319 /* First get the size of the result */
2320 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
2322 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2324 repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
2330 /* Do the conversion */
2331 s
= PyString_AS_STRING(repr
);
2332 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
2334 return PyErr_SetFromWindowsErrWithFilename(0, NULL
);
2339 #endif /* MS_WIN32 */
2341 /* --- Character Mapping Codec -------------------------------------------- */
2344 int charmap_decoding_error(const char **source
,
2347 const char *details
)
2349 if ((errors
== NULL
) ||
2350 (strcmp(errors
,"strict") == 0)) {
2351 PyErr_Format(PyExc_UnicodeError
,
2352 "charmap decoding error: %.400s",
2356 else if (strcmp(errors
,"ignore") == 0) {
2359 else if (strcmp(errors
,"replace") == 0) {
2360 **dest
= Py_UNICODE_REPLACEMENT_CHARACTER
;
2365 PyErr_Format(PyExc_ValueError
,
2366 "charmap decoding error; "
2367 "unknown error handling code: %.400s",
2373 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
2382 /* Default to Latin-1 */
2383 if (mapping
== NULL
)
2384 return PyUnicode_DecodeLatin1(s
, size
, errors
);
2386 v
= _PyUnicode_New(size
);
2390 return (PyObject
*)v
;
2391 p
= PyUnicode_AS_UNICODE(v
);
2392 while (size
-- > 0) {
2393 unsigned char ch
= *s
++;
2396 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2397 w
= PyInt_FromLong((long)ch
);
2400 x
= PyObject_GetItem(mapping
, w
);
2403 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2404 /* No mapping found means: mapping is undefined. */
2413 if (PyInt_Check(x
)) {
2414 long value
= PyInt_AS_LONG(x
);
2415 if (value
< 0 || value
> 65535) {
2416 PyErr_SetString(PyExc_TypeError
,
2417 "character mapping must be in range(65536)");
2421 *p
++ = (Py_UNICODE
)value
;
2423 else if (x
== Py_None
) {
2424 /* undefined mapping */
2425 if (charmap_decoding_error(&s
, &p
, errors
,
2426 "character maps to <undefined>")) {
2431 else if (PyUnicode_Check(x
)) {
2432 int targetsize
= PyUnicode_GET_SIZE(x
);
2434 if (targetsize
== 1)
2436 *p
++ = *PyUnicode_AS_UNICODE(x
);
2438 else if (targetsize
> 1) {
2440 if (targetsize
> extrachars
) {
2442 int oldpos
= (int)(p
- PyUnicode_AS_UNICODE(v
));
2443 int needed
= (targetsize
- extrachars
) + \
2445 extrachars
+= needed
;
2446 if (_PyUnicode_Resize(&v
,
2447 PyUnicode_GET_SIZE(v
) + needed
)) {
2451 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
2454 PyUnicode_AS_UNICODE(x
),
2457 extrachars
-= targetsize
;
2459 /* 1-0 mapping: skip the character */
2462 /* wrong return value */
2463 PyErr_SetString(PyExc_TypeError
,
2464 "character mapping must return integer, None or unicode");
2470 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2471 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2473 return (PyObject
*)v
;
2481 int charmap_encoding_error(const Py_UNICODE
**source
,
2484 const char *details
)
2486 if ((errors
== NULL
) ||
2487 (strcmp(errors
,"strict") == 0)) {
2488 PyErr_Format(PyExc_UnicodeError
,
2489 "charmap encoding error: %.400s",
2493 else if (strcmp(errors
,"ignore") == 0) {
2496 else if (strcmp(errors
,"replace") == 0) {
2502 PyErr_Format(PyExc_ValueError
,
2503 "charmap encoding error; "
2504 "unknown error handling code: %.400s",
2510 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
2519 /* Default to Latin-1 */
2520 if (mapping
== NULL
)
2521 return PyUnicode_EncodeLatin1(p
, size
, errors
);
2523 v
= PyString_FromStringAndSize(NULL
, size
);
2528 s
= PyString_AS_STRING(v
);
2529 while (size
-- > 0) {
2530 Py_UNICODE ch
= *p
++;
2533 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2534 w
= PyInt_FromLong((long)ch
);
2537 x
= PyObject_GetItem(mapping
, w
);
2540 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2541 /* No mapping found means: mapping is undefined. */
2550 if (PyInt_Check(x
)) {
2551 long value
= PyInt_AS_LONG(x
);
2552 if (value
< 0 || value
> 255) {
2553 PyErr_SetString(PyExc_TypeError
,
2554 "character mapping must be in range(256)");
2560 else if (x
== Py_None
) {
2561 /* undefined mapping */
2562 if (charmap_encoding_error(&p
, &s
, errors
,
2563 "character maps to <undefined>")) {
2568 else if (PyString_Check(x
)) {
2569 int targetsize
= PyString_GET_SIZE(x
);
2571 if (targetsize
== 1)
2573 *s
++ = *PyString_AS_STRING(x
);
2575 else if (targetsize
> 1) {
2577 if (targetsize
> extrachars
) {
2579 int oldpos
= (int)(s
- PyString_AS_STRING(v
));
2580 int needed
= (targetsize
- extrachars
) + \
2582 extrachars
+= needed
;
2583 if (_PyString_Resize(&v
, PyString_GET_SIZE(v
) + needed
)) {
2587 s
= PyString_AS_STRING(v
) + oldpos
;
2589 memcpy(s
, PyString_AS_STRING(x
), targetsize
);
2591 extrachars
-= targetsize
;
2593 /* 1-0 mapping: skip the character */
2596 /* wrong return value */
2597 PyErr_SetString(PyExc_TypeError
,
2598 "character mapping must return integer, None or unicode");
2604 if (s
- PyString_AS_STRING(v
) < PyString_GET_SIZE(v
))
2605 if (_PyString_Resize(&v
, (int)(s
- PyString_AS_STRING(v
))))
2614 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
2617 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
2618 PyErr_BadArgument();
2621 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
2622 PyUnicode_GET_SIZE(unicode
),
2628 int translate_error(const Py_UNICODE
**source
,
2631 const char *details
)
2633 if ((errors
== NULL
) ||
2634 (strcmp(errors
,"strict") == 0)) {
2635 PyErr_Format(PyExc_UnicodeError
,
2636 "translate error: %.400s",
2640 else if (strcmp(errors
,"ignore") == 0) {
2643 else if (strcmp(errors
,"replace") == 0) {
2649 PyErr_Format(PyExc_ValueError
,
2651 "unknown error handling code: %.400s",
2657 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*s
,
2665 if (mapping
== NULL
) {
2666 PyErr_BadArgument();
2670 /* Output will never be longer than input */
2671 v
= _PyUnicode_New(size
);
2676 p
= PyUnicode_AS_UNICODE(v
);
2677 while (size
-- > 0) {
2678 Py_UNICODE ch
= *s
++;
2682 w
= PyInt_FromLong(ch
);
2685 x
= PyObject_GetItem(mapping
, w
);
2688 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
2689 /* No mapping found: default to 1-1 mapping */
2699 *p
++ = (Py_UNICODE
)PyInt_AS_LONG(x
);
2700 else if (x
== Py_None
) {
2701 /* undefined mapping */
2702 if (translate_error(&s
, &p
, errors
,
2703 "character maps to <undefined>")) {
2708 else if (PyUnicode_Check(x
)) {
2709 if (PyUnicode_GET_SIZE(x
) != 1) {
2711 PyErr_SetString(PyExc_NotImplementedError
,
2712 "1-n mappings are currently not implemented");
2716 *p
++ = *PyUnicode_AS_UNICODE(x
);
2719 /* wrong return value */
2720 PyErr_SetString(PyExc_TypeError
,
2721 "translate mapping must return integer, None or unicode");
2727 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
2728 if (_PyUnicode_Resize(&v
, (int)(p
- PyUnicode_AS_UNICODE(v
))))
2732 return (PyObject
*)v
;
2739 PyObject
*PyUnicode_Translate(PyObject
*str
,
2745 str
= PyUnicode_FromObject(str
);
2748 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
2749 PyUnicode_GET_SIZE(str
),
2760 /* --- Decimal Encoder ---------------------------------------------------- */
2762 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
2767 Py_UNICODE
*p
, *end
;
2769 if (output
== NULL
) {
2770 PyErr_BadArgument();
2777 register Py_UNICODE ch
= *p
++;
2780 if (Py_UNICODE_ISSPACE(ch
)) {
2784 decimal
= Py_UNICODE_TODECIMAL(ch
);
2786 *output
++ = '0' + decimal
;
2789 if (0 < ch
&& ch
< 256) {
2790 *output
++ = (char)ch
;
2793 /* All other characters are considered invalid */
2794 if (errors
== NULL
|| strcmp(errors
, "strict") == 0) {
2795 PyErr_SetString(PyExc_ValueError
,
2796 "invalid decimal Unicode string");
2799 else if (strcmp(errors
, "ignore") == 0)
2801 else if (strcmp(errors
, "replace") == 0) {
2806 /* 0-terminate the output string */
2814 /* --- Helpers ------------------------------------------------------------ */
2817 int count(PyUnicodeObject
*self
,
2820 PyUnicodeObject
*substring
)
2825 start
+= self
->length
;
2828 if (end
> self
->length
)
2831 end
+= self
->length
;
2835 if (substring
->length
== 0)
2836 return (end
- start
+ 1);
2838 end
-= substring
->length
;
2840 while (start
<= end
)
2841 if (Py_UNICODE_MATCH(self
, start
, substring
)) {
2843 start
+= substring
->length
;
2850 int PyUnicode_Count(PyObject
*str
,
2857 str
= PyUnicode_FromObject(str
);
2860 substr
= PyUnicode_FromObject(substr
);
2861 if (substr
== NULL
) {
2866 result
= count((PyUnicodeObject
*)str
,
2868 (PyUnicodeObject
*)substr
);
2876 int findstring(PyUnicodeObject
*self
,
2877 PyUnicodeObject
*substring
,
2883 start
+= self
->length
;
2887 if (substring
->length
== 0)
2890 if (end
> self
->length
)
2893 end
+= self
->length
;
2897 end
-= substring
->length
;
2899 if (direction
< 0) {
2900 for (; end
>= start
; end
--)
2901 if (Py_UNICODE_MATCH(self
, end
, substring
))
2904 for (; start
<= end
; start
++)
2905 if (Py_UNICODE_MATCH(self
, start
, substring
))
2912 int PyUnicode_Find(PyObject
*str
,
2920 str
= PyUnicode_FromObject(str
);
2923 substr
= PyUnicode_FromObject(substr
);
2924 if (substr
== NULL
) {
2929 result
= findstring((PyUnicodeObject
*)str
,
2930 (PyUnicodeObject
*)substr
,
2931 start
, end
, direction
);
2938 int tailmatch(PyUnicodeObject
*self
,
2939 PyUnicodeObject
*substring
,
2945 start
+= self
->length
;
2949 if (substring
->length
== 0)
2952 if (end
> self
->length
)
2955 end
+= self
->length
;
2959 end
-= substring
->length
;
2963 if (direction
> 0) {
2964 if (Py_UNICODE_MATCH(self
, end
, substring
))
2967 if (Py_UNICODE_MATCH(self
, start
, substring
))
2974 int PyUnicode_Tailmatch(PyObject
*str
,
2982 str
= PyUnicode_FromObject(str
);
2985 substr
= PyUnicode_FromObject(substr
);
2986 if (substr
== NULL
) {
2991 result
= tailmatch((PyUnicodeObject
*)str
,
2992 (PyUnicodeObject
*)substr
,
2993 start
, end
, direction
);
3000 const Py_UNICODE
*findchar(const Py_UNICODE
*s
,
3004 /* like wcschr, but doesn't stop at NUL characters */
3006 while (size
-- > 0) {
3015 /* Apply fixfct filter to the Unicode object self and return a
3016 reference to the modified object */
3019 PyObject
*fixup(PyUnicodeObject
*self
,
3020 int (*fixfct
)(PyUnicodeObject
*s
))
3025 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
3029 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
3031 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
3032 /* fixfct should return TRUE if it modified the buffer. If
3033 FALSE, return a reference to the original buffer instead
3034 (to save space, not time) */
3037 return (PyObject
*) self
;
3039 return (PyObject
*) u
;
3043 int fixupper(PyUnicodeObject
*self
)
3045 int len
= self
->length
;
3046 Py_UNICODE
*s
= self
->str
;
3050 register Py_UNICODE ch
;
3052 ch
= Py_UNICODE_TOUPPER(*s
);
3064 int fixlower(PyUnicodeObject
*self
)
3066 int len
= self
->length
;
3067 Py_UNICODE
*s
= self
->str
;
3071 register Py_UNICODE ch
;
3073 ch
= Py_UNICODE_TOLOWER(*s
);
3085 int fixswapcase(PyUnicodeObject
*self
)
3087 int len
= self
->length
;
3088 Py_UNICODE
*s
= self
->str
;
3092 if (Py_UNICODE_ISUPPER(*s
)) {
3093 *s
= Py_UNICODE_TOLOWER(*s
);
3095 } else if (Py_UNICODE_ISLOWER(*s
)) {
3096 *s
= Py_UNICODE_TOUPPER(*s
);
3106 int fixcapitalize(PyUnicodeObject
*self
)
3108 int len
= self
->length
;
3109 Py_UNICODE
*s
= self
->str
;
3114 if (Py_UNICODE_ISLOWER(*s
)) {
3115 *s
= Py_UNICODE_TOUPPER(*s
);
3120 if (Py_UNICODE_ISUPPER(*s
)) {
3121 *s
= Py_UNICODE_TOLOWER(*s
);
3130 int fixtitle(PyUnicodeObject
*self
)
3132 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
3133 register Py_UNICODE
*e
;
3134 int previous_is_cased
;
3136 /* Shortcut for single character strings */
3137 if (PyUnicode_GET_SIZE(self
) == 1) {
3138 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
3147 e
= p
+ PyUnicode_GET_SIZE(self
);
3148 previous_is_cased
= 0;
3149 for (; p
< e
; p
++) {
3150 register const Py_UNICODE ch
= *p
;
3152 if (previous_is_cased
)
3153 *p
= Py_UNICODE_TOLOWER(ch
);
3155 *p
= Py_UNICODE_TOTITLE(ch
);
3157 if (Py_UNICODE_ISLOWER(ch
) ||
3158 Py_UNICODE_ISUPPER(ch
) ||
3159 Py_UNICODE_ISTITLE(ch
))
3160 previous_is_cased
= 1;
3162 previous_is_cased
= 0;
3167 PyObject
*PyUnicode_Join(PyObject
*separator
,
3172 PyUnicodeObject
*res
= NULL
;
3179 it
= PyObject_GetIter(seq
);
3183 if (separator
== NULL
) {
3184 Py_UNICODE blank
= ' ';
3189 separator
= PyUnicode_FromObject(separator
);
3190 if (separator
== NULL
)
3192 sep
= PyUnicode_AS_UNICODE(separator
);
3193 seplen
= PyUnicode_GET_SIZE(separator
);
3196 res
= _PyUnicode_New(sz
);
3199 p
= PyUnicode_AS_UNICODE(res
);
3202 for (i
= 0; ; ++i
) {
3204 PyObject
*item
= PyIter_Next(it
);
3206 if (PyErr_Occurred())
3210 if (!PyUnicode_Check(item
)) {
3212 if (!PyString_Check(item
)) {
3213 PyErr_Format(PyExc_TypeError
,
3214 "sequence item %i: expected string or Unicode,"
3216 i
, item
->ob_type
->tp_name
);
3220 v
= PyUnicode_FromObject(item
);
3226 itemlen
= PyUnicode_GET_SIZE(item
);
3227 while (reslen
+ itemlen
+ seplen
>= sz
) {
3228 if (_PyUnicode_Resize(&res
, sz
*2)) {
3233 p
= PyUnicode_AS_UNICODE(res
) + reslen
;
3236 Py_UNICODE_COPY(p
, sep
, seplen
);
3240 Py_UNICODE_COPY(p
, PyUnicode_AS_UNICODE(item
), itemlen
);
3245 if (_PyUnicode_Resize(&res
, reslen
))
3248 Py_XDECREF(separator
);
3250 return (PyObject
*)res
;
3253 Py_XDECREF(separator
);
3260 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
3272 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
3277 u
= _PyUnicode_New(left
+ self
->length
+ right
);
3280 Py_UNICODE_FILL(u
->str
, fill
, left
);
3281 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
3283 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
3289 #define SPLIT_APPEND(data, left, right) \
3290 str = PyUnicode_FromUnicode(data + left, right - left); \
3293 if (PyList_Append(list, str)) { \
3301 PyObject
*split_whitespace(PyUnicodeObject
*self
,
3307 int len
= self
->length
;
3310 for (i
= j
= 0; i
< len
; ) {
3312 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
3315 while (i
< len
&& !Py_UNICODE_ISSPACE(self
->str
[i
]))
3318 if (maxcount
-- <= 0)
3320 SPLIT_APPEND(self
->str
, j
, i
);
3321 while (i
< len
&& Py_UNICODE_ISSPACE(self
->str
[i
]))
3327 SPLIT_APPEND(self
->str
, j
, len
);
3336 PyObject
*PyUnicode_Splitlines(PyObject
*string
,
3346 string
= PyUnicode_FromObject(string
);
3349 data
= PyUnicode_AS_UNICODE(string
);
3350 len
= PyUnicode_GET_SIZE(string
);
3352 list
= PyList_New(0);
3356 for (i
= j
= 0; i
< len
; ) {
3359 /* Find a line and append it */
3360 while (i
< len
&& !Py_UNICODE_ISLINEBREAK(data
[i
]))
3363 /* Skip the line break reading CRLF as one line break */
3366 if (data
[i
] == '\r' && i
+ 1 < len
&&
3374 SPLIT_APPEND(data
, j
, eol
);
3378 SPLIT_APPEND(data
, j
, len
);
3391 PyObject
*split_char(PyUnicodeObject
*self
,
3398 int len
= self
->length
;
3401 for (i
= j
= 0; i
< len
; ) {
3402 if (self
->str
[i
] == ch
) {
3403 if (maxcount
-- <= 0)
3405 SPLIT_APPEND(self
->str
, j
, i
);
3411 SPLIT_APPEND(self
->str
, j
, len
);
3421 PyObject
*split_substring(PyUnicodeObject
*self
,
3423 PyUnicodeObject
*substring
,
3428 int len
= self
->length
;
3429 int sublen
= substring
->length
;
3432 for (i
= j
= 0; i
<= len
- sublen
; ) {
3433 if (Py_UNICODE_MATCH(self
, i
, substring
)) {
3434 if (maxcount
-- <= 0)
3436 SPLIT_APPEND(self
->str
, j
, i
);
3442 SPLIT_APPEND(self
->str
, j
, len
);
3454 PyObject
*split(PyUnicodeObject
*self
,
3455 PyUnicodeObject
*substring
,
3463 list
= PyList_New(0);
3467 if (substring
== NULL
)
3468 return split_whitespace(self
,list
,maxcount
);
3470 else if (substring
->length
== 1)
3471 return split_char(self
,list
,substring
->str
[0],maxcount
);
3473 else if (substring
->length
== 0) {
3475 PyErr_SetString(PyExc_ValueError
, "empty separator");
3479 return split_substring(self
,list
,substring
,maxcount
);
3483 PyObject
*strip(PyUnicodeObject
*self
,
3487 Py_UNICODE
*p
= self
->str
;
3489 int end
= self
->length
;
3492 while (start
< end
&& Py_UNICODE_ISSPACE(p
[start
]))
3496 while (end
> start
&& Py_UNICODE_ISSPACE(p
[end
-1]))
3499 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
3500 /* couldn't strip anything off, return original string */
3502 return (PyObject
*) self
;
3505 return (PyObject
*) PyUnicode_FromUnicode(
3512 PyObject
*replace(PyUnicodeObject
*self
,
3513 PyUnicodeObject
*str1
,
3514 PyUnicodeObject
*str2
,
3522 if (str1
->length
== 1 && str2
->length
== 1) {
3525 /* replace characters */
3526 if (!findchar(self
->str
, self
->length
, str1
->str
[0]) &&
3527 PyUnicode_CheckExact(self
)) {
3528 /* nothing to replace, return original string */
3532 Py_UNICODE u1
= str1
->str
[0];
3533 Py_UNICODE u2
= str2
->str
[0];
3535 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(
3540 Py_UNICODE_COPY(u
->str
, self
->str
,
3542 for (i
= 0; i
< u
->length
; i
++)
3543 if (u
->str
[i
] == u1
) {
3555 /* replace strings */
3556 n
= count(self
, 0, self
->length
, str1
);
3559 if (n
== 0 && PyUnicode_CheckExact(self
)) {
3560 /* nothing to replace, return original string */
3565 self
->length
+ n
* (str2
->length
- str1
->length
));
3569 while (i
<= self
->length
- str1
->length
)
3570 if (Py_UNICODE_MATCH(self
, i
, str1
)) {
3571 /* replace string segment */
3572 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
3576 /* copy remaining part */
3577 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
3581 *p
++ = self
->str
[i
++];
3586 return (PyObject
*) u
;
3589 /* --- Unicode Object Methods --------------------------------------------- */
3591 static char title__doc__
[] =
3592 "S.title() -> unicode\n\
3594 Return a titlecased version of S, i.e. words start with title case\n\
3595 characters, all remaining cased characters have lower case.";
3598 unicode_title(PyUnicodeObject
*self
)
3600 return fixup(self
, fixtitle
);
3603 static char capitalize__doc__
[] =
3604 "S.capitalize() -> unicode\n\
3606 Return a capitalized version of S, i.e. make the first character\n\
3610 unicode_capitalize(PyUnicodeObject
*self
)
3612 return fixup(self
, fixcapitalize
);
3616 static char capwords__doc__
[] =
3617 "S.capwords() -> unicode\n\
3619 Apply .capitalize() to all words in S and return the result with\n\
3620 normalized whitespace (all whitespace strings are replaced by ' ').";
3623 unicode_capwords(PyUnicodeObject
*self
)
3629 /* Split into words */
3630 list
= split(self
, NULL
, -1);
3634 /* Capitalize each word */
3635 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
3636 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
3640 Py_DECREF(PyList_GET_ITEM(list
, i
));
3641 PyList_SET_ITEM(list
, i
, item
);
3644 /* Join the words to form a new string */
3645 item
= PyUnicode_Join(NULL
, list
);
3649 return (PyObject
*)item
;
3653 static char center__doc__
[] =
3654 "S.center(width) -> unicode\n\
3656 Return S centered in a Unicode string of length width. Padding is done\n\
3660 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
3665 if (!PyArg_ParseTuple(args
, "i:center", &width
))
3668 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
3670 return (PyObject
*) self
;
3673 marg
= width
- self
->length
;
3674 left
= marg
/ 2 + (marg
& width
& 1);
3676 return (PyObject
*) pad(self
, left
, marg
- left
, ' ');
3681 /* This code should go into some future Unicode collation support
3682 module. The basic comparison should compare ordinals on a naive
3683 basis (this is what Java does and thus JPython too). */
3685 /* speedy UTF-16 code point order comparison */
3687 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3689 static short utf16Fixup
[32] =
3691 0, 0, 0, 0, 0, 0, 0, 0,
3692 0, 0, 0, 0, 0, 0, 0, 0,
3693 0, 0, 0, 0, 0, 0, 0, 0,
3694 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3698 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3702 Py_UNICODE
*s1
= str1
->str
;
3703 Py_UNICODE
*s2
= str2
->str
;
3705 len1
= str1
->length
;
3706 len2
= str2
->length
;
3708 while (len1
> 0 && len2
> 0) {
3714 if (c1
> (1<<11) * 26)
3715 c1
+= utf16Fixup
[c1
>>11];
3716 if (c2
> (1<<11) * 26)
3717 c2
+= utf16Fixup
[c2
>>11];
3718 /* now c1 and c2 are in UTF-32-compatible order */
3721 return (c1
< c2
) ? -1 : 1;
3726 return (len1
< len2
) ? -1 : (len1
!= len2
);
3732 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
3734 register int len1
, len2
;
3736 Py_UNICODE
*s1
= str1
->str
;
3737 Py_UNICODE
*s2
= str2
->str
;
3739 len1
= str1
->length
;
3740 len2
= str2
->length
;
3742 while (len1
> 0 && len2
> 0) {
3749 return (c1
< c2
) ? -1 : 1;
3754 return (len1
< len2
) ? -1 : (len1
!= len2
);
3759 int PyUnicode_Compare(PyObject
*left
,
3762 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3765 /* Coerce the two arguments */
3766 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3769 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3773 /* Shortcut for empty or interned objects */
3780 result
= unicode_compare(u
, v
);
3792 int PyUnicode_Contains(PyObject
*container
,
3795 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
3797 register const Py_UNICODE
*p
, *e
;
3798 register Py_UNICODE ch
;
3800 /* Coerce the two arguments */
3801 v
= (PyUnicodeObject
*)PyUnicode_FromObject(element
);
3803 PyErr_SetString(PyExc_TypeError
,
3804 "'in <string>' requires character as left operand");
3807 u
= (PyUnicodeObject
*)PyUnicode_FromObject(container
);
3814 if (PyUnicode_GET_SIZE(v
) != 1) {
3815 PyErr_SetString(PyExc_TypeError
,
3816 "'in <string>' requires character as left operand");
3819 ch
= *PyUnicode_AS_UNICODE(v
);
3820 p
= PyUnicode_AS_UNICODE(u
);
3821 e
= p
+ PyUnicode_GET_SIZE(u
);
3840 /* Concat to string or Unicode object giving a new Unicode object. */
3842 PyObject
*PyUnicode_Concat(PyObject
*left
,
3845 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
3847 /* Coerce the two arguments */
3848 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
3851 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
3856 if (v
== unicode_empty
) {
3858 return (PyObject
*)u
;
3860 if (u
== unicode_empty
) {
3862 return (PyObject
*)v
;
3865 /* Concat the two Unicode strings */
3866 w
= _PyUnicode_New(u
->length
+ v
->length
);
3869 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
3870 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
3874 return (PyObject
*)w
;
3882 static char count__doc__
[] =
3883 "S.count(sub[, start[, end]]) -> int\n\
3885 Return the number of occurrences of substring sub in Unicode string\n\
3886 S[start:end]. Optional arguments start and end are\n\
3887 interpreted as in slice notation.";
3890 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
3892 PyUnicodeObject
*substring
;
3897 if (!PyArg_ParseTuple(args
, "O|O&O&:count", &substring
,
3898 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
3901 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
3902 (PyObject
*)substring
);
3903 if (substring
== NULL
)
3907 start
+= self
->length
;
3910 if (end
> self
->length
)
3913 end
+= self
->length
;
3917 result
= PyInt_FromLong((long) count(self
, start
, end
, substring
));
3919 Py_DECREF(substring
);
3923 static char encode__doc__
[] =
3924 "S.encode([encoding[,errors]]) -> string\n\
3926 Return an encoded string version of S. Default encoding is the current\n\
3927 default string encoding. errors may be given to set a different error\n\
3928 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3929 a ValueError. Other possible values are 'ignore' and 'replace'.";
3932 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
)
3934 char *encoding
= NULL
;
3935 char *errors
= NULL
;
3936 if (!PyArg_ParseTuple(args
, "|ss:encode", &encoding
, &errors
))
3938 return PyUnicode_AsEncodedString((PyObject
*)self
, encoding
, errors
);
3941 static char expandtabs__doc__
[] =
3942 "S.expandtabs([tabsize]) -> unicode\n\
3944 Return a copy of S where all tab characters are expanded using spaces.\n\
3945 If tabsize is not given, a tab size of 8 characters is assumed.";
3948 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
3957 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
3960 /* First pass: determine size of output string */
3962 e
= self
->str
+ self
->length
;
3963 for (p
= self
->str
; p
< e
; p
++)
3966 j
+= tabsize
- (j
% tabsize
);
3970 if (*p
== '\n' || *p
== '\r') {
3976 /* Second pass: create output string and fill it */
3977 u
= _PyUnicode_New(i
+ j
);
3984 for (p
= self
->str
; p
< e
; p
++)
3987 i
= tabsize
- (j
% tabsize
);
3996 if (*p
== '\n' || *p
== '\r')
4000 return (PyObject
*) u
;
/* Docstring for unicode.find().  The searched range is written in slice
   notation, S[start:end], matching the wording of count__doc__. */
static char find__doc__[] =
"S.find(sub [,start [,end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4013 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
4015 PyUnicodeObject
*substring
;
4020 if (!PyArg_ParseTuple(args
, "O|O&O&:find", &substring
,
4021 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4023 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4024 (PyObject
*)substring
);
4025 if (substring
== NULL
)
4028 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, 1));
4030 Py_DECREF(substring
);
4035 unicode_getitem(PyUnicodeObject
*self
, int index
)
4037 if (index
< 0 || index
>= self
->length
) {
4038 PyErr_SetString(PyExc_IndexError
, "string index out of range");
4042 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
4046 unicode_hash(PyUnicodeObject
*self
)
4048 /* Since Unicode objects compare equal to their ASCII string
4049 counterparts, they should use the individual character values
4050 as basis for their hash value. This is needed to assure that
4051 strings and Unicode objects behave in the same way as
4055 register Py_UNICODE
*p
;
4058 if (self
->hash
!= -1)
4060 len
= PyUnicode_GET_SIZE(self
);
4061 p
= PyUnicode_AS_UNICODE(self
);
4064 x
= (1000003*x
) ^ *p
++;
4065 x
^= PyUnicode_GET_SIZE(self
);
4072 static char index__doc__
[] =
4073 "S.index(sub [,start [,end]]) -> int\n\
4075 Like S.find() but raise ValueError when the substring is not found.";
4078 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
4081 PyUnicodeObject
*substring
;
4085 if (!PyArg_ParseTuple(args
, "O|O&O&:index", &substring
,
4086 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4089 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4090 (PyObject
*)substring
);
4091 if (substring
== NULL
)
4094 result
= findstring(self
, substring
, start
, end
, 1);
4096 Py_DECREF(substring
);
4098 PyErr_SetString(PyExc_ValueError
, "substring not found");
4101 return PyInt_FromLong(result
);
4104 static char islower__doc__
[] =
4105 "S.islower() -> int\n\
4107 Return 1 if all cased characters in S are lowercase and there is\n\
4108 at least one cased character in S, 0 otherwise.";
4111 unicode_islower(PyUnicodeObject
*self
)
4113 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4114 register const Py_UNICODE
*e
;
4117 /* Shortcut for single character strings */
4118 if (PyUnicode_GET_SIZE(self
) == 1)
4119 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p
) != 0);
4121 /* Special case for empty strings */
4122 if (PyString_GET_SIZE(self
) == 0)
4123 return PyInt_FromLong(0);
4125 e
= p
+ PyUnicode_GET_SIZE(self
);
4127 for (; p
< e
; p
++) {
4128 register const Py_UNICODE ch
= *p
;
4130 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
4131 return PyInt_FromLong(0);
4132 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
4135 return PyInt_FromLong(cased
);
4138 static char isupper__doc__
[] =
4139 "S.isupper() -> int\n\
4141 Return 1 if all cased characters in S are uppercase and there is\n\
4142 at least one cased character in S, 0 otherwise.";
4145 unicode_isupper(PyUnicodeObject
*self
)
4147 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4148 register const Py_UNICODE
*e
;
4151 /* Shortcut for single character strings */
4152 if (PyUnicode_GET_SIZE(self
) == 1)
4153 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
4155 /* Special case for empty strings */
4156 if (PyString_GET_SIZE(self
) == 0)
4157 return PyInt_FromLong(0);
4159 e
= p
+ PyUnicode_GET_SIZE(self
);
4161 for (; p
< e
; p
++) {
4162 register const Py_UNICODE ch
= *p
;
4164 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
4165 return PyInt_FromLong(0);
4166 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
4169 return PyInt_FromLong(cased
);
4172 static char istitle__doc__
[] =
4173 "S.istitle() -> int\n\
4175 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4176 may only follow uncased characters and lowercase characters only cased\n\
4177 ones. Return 0 otherwise.";
4180 unicode_istitle(PyUnicodeObject
*self
)
4182 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4183 register const Py_UNICODE
*e
;
4184 int cased
, previous_is_cased
;
4186 /* Shortcut for single character strings */
4187 if (PyUnicode_GET_SIZE(self
) == 1)
4188 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
4189 (Py_UNICODE_ISUPPER(*p
) != 0));
4191 /* Special case for empty strings */
4192 if (PyString_GET_SIZE(self
) == 0)
4193 return PyInt_FromLong(0);
4195 e
= p
+ PyUnicode_GET_SIZE(self
);
4197 previous_is_cased
= 0;
4198 for (; p
< e
; p
++) {
4199 register const Py_UNICODE ch
= *p
;
4201 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
4202 if (previous_is_cased
)
4203 return PyInt_FromLong(0);
4204 previous_is_cased
= 1;
4207 else if (Py_UNICODE_ISLOWER(ch
)) {
4208 if (!previous_is_cased
)
4209 return PyInt_FromLong(0);
4210 previous_is_cased
= 1;
4214 previous_is_cased
= 0;
4216 return PyInt_FromLong(cased
);
4219 static char isspace__doc__
[] =
4220 "S.isspace() -> int\n\
4222 Return 1 if there are only whitespace characters in S,\n\
4226 unicode_isspace(PyUnicodeObject
*self
)
4228 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4229 register const Py_UNICODE
*e
;
4231 /* Shortcut for single character strings */
4232 if (PyUnicode_GET_SIZE(self
) == 1 &&
4233 Py_UNICODE_ISSPACE(*p
))
4234 return PyInt_FromLong(1);
4236 /* Special case for empty strings */
4237 if (PyString_GET_SIZE(self
) == 0)
4238 return PyInt_FromLong(0);
4240 e
= p
+ PyUnicode_GET_SIZE(self
);
4241 for (; p
< e
; p
++) {
4242 if (!Py_UNICODE_ISSPACE(*p
))
4243 return PyInt_FromLong(0);
4245 return PyInt_FromLong(1);
4248 static char isalpha__doc__
[] =
4249 "S.isalpha() -> int\n\
4251 Return 1 if all characters in S are alphabetic\n\
4252 and there is at least one character in S, 0 otherwise.";
4255 unicode_isalpha(PyUnicodeObject
*self
)
4257 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4258 register const Py_UNICODE
*e
;
4260 /* Shortcut for single character strings */
4261 if (PyUnicode_GET_SIZE(self
) == 1 &&
4262 Py_UNICODE_ISALPHA(*p
))
4263 return PyInt_FromLong(1);
4265 /* Special case for empty strings */
4266 if (PyString_GET_SIZE(self
) == 0)
4267 return PyInt_FromLong(0);
4269 e
= p
+ PyUnicode_GET_SIZE(self
);
4270 for (; p
< e
; p
++) {
4271 if (!Py_UNICODE_ISALPHA(*p
))
4272 return PyInt_FromLong(0);
4274 return PyInt_FromLong(1);
4277 static char isalnum__doc__
[] =
4278 "S.isalnum() -> int\n\
4280 Return 1 if all characters in S are alphanumeric\n\
4281 and there is at least one character in S, 0 otherwise.";
4284 unicode_isalnum(PyUnicodeObject
*self
)
4286 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4287 register const Py_UNICODE
*e
;
4289 /* Shortcut for single character strings */
4290 if (PyUnicode_GET_SIZE(self
) == 1 &&
4291 Py_UNICODE_ISALNUM(*p
))
4292 return PyInt_FromLong(1);
4294 /* Special case for empty strings */
4295 if (PyString_GET_SIZE(self
) == 0)
4296 return PyInt_FromLong(0);
4298 e
= p
+ PyUnicode_GET_SIZE(self
);
4299 for (; p
< e
; p
++) {
4300 if (!Py_UNICODE_ISALNUM(*p
))
4301 return PyInt_FromLong(0);
4303 return PyInt_FromLong(1);
4306 static char isdecimal__doc__
[] =
4307 "S.isdecimal() -> int\n\
4309 Return 1 if there are only decimal characters in S,\n\
4313 unicode_isdecimal(PyUnicodeObject
*self
)
4315 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4316 register const Py_UNICODE
*e
;
4318 /* Shortcut for single character strings */
4319 if (PyUnicode_GET_SIZE(self
) == 1 &&
4320 Py_UNICODE_ISDECIMAL(*p
))
4321 return PyInt_FromLong(1);
4323 /* Special case for empty strings */
4324 if (PyString_GET_SIZE(self
) == 0)
4325 return PyInt_FromLong(0);
4327 e
= p
+ PyUnicode_GET_SIZE(self
);
4328 for (; p
< e
; p
++) {
4329 if (!Py_UNICODE_ISDECIMAL(*p
))
4330 return PyInt_FromLong(0);
4332 return PyInt_FromLong(1);
4335 static char isdigit__doc__
[] =
4336 "S.isdigit() -> int\n\
4338 Return 1 if there are only digit characters in S,\n\
4342 unicode_isdigit(PyUnicodeObject
*self
)
4344 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4345 register const Py_UNICODE
*e
;
4347 /* Shortcut for single character strings */
4348 if (PyUnicode_GET_SIZE(self
) == 1 &&
4349 Py_UNICODE_ISDIGIT(*p
))
4350 return PyInt_FromLong(1);
4352 /* Special case for empty strings */
4353 if (PyString_GET_SIZE(self
) == 0)
4354 return PyInt_FromLong(0);
4356 e
= p
+ PyUnicode_GET_SIZE(self
);
4357 for (; p
< e
; p
++) {
4358 if (!Py_UNICODE_ISDIGIT(*p
))
4359 return PyInt_FromLong(0);
4361 return PyInt_FromLong(1);
4364 static char isnumeric__doc__
[] =
4365 "S.isnumeric() -> int\n\
4367 Return 1 if there are only numeric characters in S,\n\
4371 unicode_isnumeric(PyUnicodeObject
*self
)
4373 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
4374 register const Py_UNICODE
*e
;
4376 /* Shortcut for single character strings */
4377 if (PyUnicode_GET_SIZE(self
) == 1 &&
4378 Py_UNICODE_ISNUMERIC(*p
))
4379 return PyInt_FromLong(1);
4381 /* Special case for empty strings */
4382 if (PyString_GET_SIZE(self
) == 0)
4383 return PyInt_FromLong(0);
4385 e
= p
+ PyUnicode_GET_SIZE(self
);
4386 for (; p
< e
; p
++) {
4387 if (!Py_UNICODE_ISNUMERIC(*p
))
4388 return PyInt_FromLong(0);
4390 return PyInt_FromLong(1);
4393 static char join__doc__
[] =
4394 "S.join(sequence) -> unicode\n\
4396 Return a string which is the concatenation of the strings in the\n\
4397 sequence. The separator between elements is S.";
4400 unicode_join(PyObject
*self
, PyObject
*data
)
4402 return PyUnicode_Join(self
, data
);
4406 unicode_length(PyUnicodeObject
*self
)
4408 return self
->length
;
4411 static char ljust__doc__
[] =
4412 "S.ljust(width) -> unicode\n\
4414 Return S left justified in a Unicode string of length width. Padding is\n\
4415 done using spaces.";
4418 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
4421 if (!PyArg_ParseTuple(args
, "i:ljust", &width
))
4424 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4426 return (PyObject
*) self
;
4429 return (PyObject
*) pad(self
, 0, width
- self
->length
, ' ');
4432 static char lower__doc__
[] =
4433 "S.lower() -> unicode\n\
4435 Return a copy of the string S converted to lowercase.";
4438 unicode_lower(PyUnicodeObject
*self
)
4440 return fixup(self
, fixlower
);
4443 static char lstrip__doc__
[] =
4444 "S.lstrip() -> unicode\n\
4446 Return a copy of the string S with leading whitespace removed.";
4449 unicode_lstrip(PyUnicodeObject
*self
)
4451 return strip(self
, 1, 0);
4455 unicode_repeat(PyUnicodeObject
*str
, int len
)
4465 if (len
== 1 && PyUnicode_CheckExact(str
)) {
4466 /* no repeat, return original string */
4468 return (PyObject
*) str
;
4471 /* ensure # of chars needed doesn't overflow int and # of bytes
4472 * needed doesn't overflow size_t
4474 nchars
= len
* str
->length
;
4475 if (len
&& nchars
/ len
!= str
->length
) {
4476 PyErr_SetString(PyExc_OverflowError
,
4477 "repeated string is too long");
4480 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
4481 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
4482 PyErr_SetString(PyExc_OverflowError
,
4483 "repeated string is too long");
4486 u
= _PyUnicode_New(nchars
);
4493 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
4497 return (PyObject
*) u
;
4500 PyObject
*PyUnicode_Replace(PyObject
*obj
,
4510 self
= PyUnicode_FromObject(obj
);
4513 str1
= PyUnicode_FromObject(subobj
);
4518 str2
= PyUnicode_FromObject(replobj
);
4524 result
= replace((PyUnicodeObject
*)self
,
4525 (PyUnicodeObject
*)str1
,
4526 (PyUnicodeObject
*)str2
,
4534 static char replace__doc__
[] =
4535 "S.replace (old, new[, maxsplit]) -> unicode\n\
4537 Return a copy of S with all occurrences of substring\n\
4538 old replaced by new. If the optional argument maxsplit is\n\
4539 given, only the first maxsplit occurrences are replaced.";
4542 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
4544 PyUnicodeObject
*str1
;
4545 PyUnicodeObject
*str2
;
4549 if (!PyArg_ParseTuple(args
, "OO|i:replace", &str1
, &str2
, &maxcount
))
4551 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
4554 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
4558 result
= replace(self
, str1
, str2
, maxcount
);
4566 PyObject
*unicode_repr(PyObject
*unicode
)
4568 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
4569 PyUnicode_GET_SIZE(unicode
),
/* Docstring for unicode.rfind().  The searched range is written in slice
   notation, S[start:end], matching the wording of count__doc__. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4583 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
4585 PyUnicodeObject
*substring
;
4590 if (!PyArg_ParseTuple(args
, "O|O&O&:rfind", &substring
,
4591 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4593 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4594 (PyObject
*)substring
);
4595 if (substring
== NULL
)
4598 result
= PyInt_FromLong(findstring(self
, substring
, start
, end
, -1));
4600 Py_DECREF(substring
);
4604 static char rindex__doc__
[] =
4605 "S.rindex(sub [,start [,end]]) -> int\n\
4607 Like S.rfind() but raise ValueError when the substring is not found.";
4610 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
4613 PyUnicodeObject
*substring
;
4617 if (!PyArg_ParseTuple(args
, "O|O&O&:rindex", &substring
,
4618 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4620 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4621 (PyObject
*)substring
);
4622 if (substring
== NULL
)
4625 result
= findstring(self
, substring
, start
, end
, -1);
4627 Py_DECREF(substring
);
4629 PyErr_SetString(PyExc_ValueError
, "substring not found");
4632 return PyInt_FromLong(result
);
4635 static char rjust__doc__
[] =
4636 "S.rjust(width) -> unicode\n\
4638 Return S right justified in a Unicode string of length width. Padding is\n\
4639 done using spaces.";
4642 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
4645 if (!PyArg_ParseTuple(args
, "i:rjust", &width
))
4648 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
4650 return (PyObject
*) self
;
4653 return (PyObject
*) pad(self
, width
- self
->length
, 0, ' ');
4656 static char rstrip__doc__
[] =
4657 "S.rstrip() -> unicode\n\
4659 Return a copy of the string S with trailing whitespace removed.";
4662 unicode_rstrip(PyUnicodeObject
*self
)
4664 return strip(self
, 0, 1);
4668 unicode_slice(PyUnicodeObject
*self
, int start
, int end
)
4670 /* standard clamping */
4675 if (end
> self
->length
)
4677 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
4678 /* full slice, return original string */
4680 return (PyObject
*) self
;
4685 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
4689 PyObject
*PyUnicode_Split(PyObject
*s
,
4695 s
= PyUnicode_FromObject(s
);
4699 sep
= PyUnicode_FromObject(sep
);
4706 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
4713 static char split__doc__
[] =
4714 "S.split([sep [,maxsplit]]) -> list of strings\n\
4716 Return a list of the words in S, using sep as the\n\
4717 delimiter string. If maxsplit is given, at most maxsplit\n\
4718 splits are done. If sep is not specified, any whitespace string\n\
4722 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
4724 PyObject
*substring
= Py_None
;
4727 if (!PyArg_ParseTuple(args
, "|Oi:split", &substring
, &maxcount
))
4730 if (substring
== Py_None
)
4731 return split(self
, NULL
, maxcount
);
4732 else if (PyUnicode_Check(substring
))
4733 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
4735 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
/* Docstring for unicode.splitlines(); keepends is the single optional
   argument, so the signature carries exactly one bracket pair. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
4746 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
4750 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
4753 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
4757 PyObject
*unicode_str(PyUnicodeObject
*self
)
4759 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
4762 static char strip__doc__
[] =
4763 "S.strip() -> unicode\n\
4765 Return a copy of S with leading and trailing whitespace removed.";
4768 unicode_strip(PyUnicodeObject
*self
)
4770 return strip(self
, 1, 1);
4773 static char swapcase__doc__
[] =
4774 "S.swapcase() -> unicode\n\
4776 Return a copy of S with uppercase characters converted to lowercase\n\
4780 unicode_swapcase(PyUnicodeObject
*self
)
4782 return fixup(self
, fixswapcase
);
4785 static char translate__doc__
[] =
4786 "S.translate(table) -> unicode\n\
4788 Return a copy of the string S, where all characters have been mapped\n\
4789 through the given translation table, which must be a mapping of\n\
4790 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4791 are left untouched. Characters mapped to None are deleted.";
4794 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
4796 return PyUnicode_TranslateCharmap(self
->str
,
4802 static char upper__doc__
[] =
4803 "S.upper() -> unicode\n\
4805 Return a copy of S converted to uppercase.";
4808 unicode_upper(PyUnicodeObject
*self
)
4810 return fixup(self
, fixupper
);
4814 static char zfill__doc__
[] =
4815 "S.zfill(width) -> unicode\n\
4817 Pad a numeric string x with zeros on the left, to fill a field\n\
4818 of the specified width. The string x is never truncated.";
4821 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
4827 if (!PyArg_ParseTuple(args
, "i:zfill", &width
))
4830 if (self
->length
>= width
) {
4832 return (PyObject
*) self
;
4835 fill
= width
- self
->length
;
4837 u
= pad(self
, fill
, 0, '0');
4839 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
4840 /* move sign to beginning of string */
4841 u
->str
[0] = u
->str
[fill
];
4845 return (PyObject
*) u
;
4851 unicode_freelistsize(PyUnicodeObject
*self
)
4853 return PyInt_FromLong(unicode_freelist_size
);
4857 static char startswith__doc__
[] =
4858 "S.startswith(prefix[, start[, end]]) -> int\n\
4860 Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4861 optional start, test S beginning at that position. With optional end, stop\n\
4862 comparing S at that position.";
4865 unicode_startswith(PyUnicodeObject
*self
,
4868 PyUnicodeObject
*substring
;
4873 if (!PyArg_ParseTuple(args
, "O|O&O&:startswith", &substring
,
4874 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4876 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4877 (PyObject
*)substring
);
4878 if (substring
== NULL
)
4881 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, -1));
4883 Py_DECREF(substring
);
4888 static char endswith__doc__
[] =
4889 "S.endswith(suffix[, start[, end]]) -> int\n\
4891 Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4892 optional start, test S beginning at that position. With optional end, stop\n\
4893 comparing S at that position.";
4896 unicode_endswith(PyUnicodeObject
*self
,
4899 PyUnicodeObject
*substring
;
4904 if (!PyArg_ParseTuple(args
, "O|O&O&:endswith", &substring
,
4905 _PyEval_SliceIndex
, &start
, _PyEval_SliceIndex
, &end
))
4907 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
4908 (PyObject
*)substring
);
4909 if (substring
== NULL
)
4912 result
= PyInt_FromLong(tailmatch(self
, substring
, start
, end
, +1));
4914 Py_DECREF(substring
);
4919 static PyMethodDef unicode_methods
[] = {
4921 /* Order is according to common usage: often used methods should
4922 appear first, since lookup is done sequentially. */
4924 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
, encode__doc__
},
4925 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
4926 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
4927 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
4928 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
4929 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
4930 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
4931 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
4932 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
4933 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
4934 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
4935 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
4936 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
4937 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_NOARGS
, lstrip__doc__
},
4938 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4939 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
4940 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
4941 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
4942 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_NOARGS
, rstrip__doc__
},
4943 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
4944 {"strip", (PyCFunction
) unicode_strip
, METH_NOARGS
, strip__doc__
},
4945 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
4946 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
4947 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
4948 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
4949 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
4950 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
4951 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
4952 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
4953 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
4954 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
4955 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
4956 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
4957 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
4958 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
4960 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
4961 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
4965 /* This one is just used for debugging the implementation. */
4966 {"freelistsize", (PyCFunction
) unicode_freelistsize
, METH_NOARGS
},
4972 static PySequenceMethods unicode_as_sequence
= {
4973 (inquiry
) unicode_length
, /* sq_length */
4974 (binaryfunc
) PyUnicode_Concat
, /* sq_concat */
4975 (intargfunc
) unicode_repeat
, /* sq_repeat */
4976 (intargfunc
) unicode_getitem
, /* sq_item */
4977 (intintargfunc
) unicode_slice
, /* sq_slice */
4978 0, /* sq_ass_item */
4979 0, /* sq_ass_slice */
4980 (objobjproc
)PyUnicode_Contains
, /*sq_contains*/
4984 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
4989 PyErr_SetString(PyExc_SystemError
,
4990 "accessing non-existent unicode segment");
4993 *ptr
= (void *) self
->str
;
4994 return PyUnicode_GET_DATA_SIZE(self
);
4998 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, int index
,
5001 PyErr_SetString(PyExc_TypeError
,
5002 "cannot use unicode as modifyable buffer");
5007 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
5011 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
5016 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
5023 PyErr_SetString(PyExc_SystemError
,
5024 "accessing non-existent unicode segment");
5027 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
5030 *ptr
= (void *) PyString_AS_STRING(str
);
5031 return PyString_GET_SIZE(str
);
5034 /* Helpers for PyUnicode_Format() */
5037 getnextarg(PyObject
*args
, int arglen
, int *p_argidx
)
5039 int argidx
= *p_argidx
;
5040 if (argidx
< arglen
) {
5045 return PyTuple_GetItem(args
, argidx
);
5047 PyErr_SetString(PyExc_TypeError
,
5048 "not enough arguments for format string");
5052 #define F_LJUST (1<<0)
5053 #define F_SIGN (1<<1)
5054 #define F_BLANK (1<<2)
5055 #define F_ALT (1<<3)
5056 #define F_ZERO (1<<4)
5059 int usprintf(register Py_UNICODE
*buffer
, char *format
, ...)
5065 va_start(va
, format
);
5067 /* First, format the string as char array, then expand to Py_UNICODE
5069 charbuffer
= (char *)buffer
;
5070 len
= vsprintf(charbuffer
, format
, va
);
5071 for (i
= len
- 1; i
>= 0; i
--)
5072 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
5079 formatfloat(Py_UNICODE
*buf
,
5086 /* fmt = '%#.' + `prec` + `type`
5087 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5091 x
= PyFloat_AsDouble(v
);
5092 if (x
== -1.0 && PyErr_Occurred())
5096 if (type
== 'f' && (fabs(x
) / 1e25
) >= 1e25
)
5098 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%d%c",
5099 (flags
& F_ALT
) ? "#" : "", prec
, type
);
5100 /* worst case length calc to ensure no buffer overrun:
5102 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5103 for any double rep.)
5104 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5105 If prec=0 the effective precision is 1 (the leading digit is
5106 always given), therefore increase by one to 10+prec. */
5107 if (buflen
<= (size_t)10 + (size_t)prec
) {
5108 PyErr_SetString(PyExc_OverflowError
,
5109 "formatted float is too long (precision too long?)");
5112 return usprintf(buf
, fmt
, x
);
5116 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
5120 PyObject
*str
; /* temporary string object. */
5121 PyUnicodeObject
*result
;
5123 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
5126 result
= _PyUnicode_New(len
);
5127 for (i
= 0; i
< len
; i
++)
5128 result
->str
[i
] = buf
[i
];
5129 result
->str
[len
] = 0;
5131 return (PyObject
*)result
;
5135 formatint(Py_UNICODE
*buf
,
5142 /* fmt = '%#.' + `prec` + 'l' + `type`
5143 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5145 char fmt
[64]; /* plenty big enough! */
5147 int use_native_c_format
= 1;
5149 x
= PyInt_AsLong(v
);
5150 if (x
== -1 && PyErr_Occurred())
5154 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5155 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5156 if (buflen
<= 13 || buflen
<= (size_t)2+(size_t)prec
) {
5157 PyErr_SetString(PyExc_OverflowError
,
5158 "formatted integer is too long (precision too long?)");
5161 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5162 * but we want it (for consistency with other %#x conversions, and
5163 * for consistency with Python's hex() function).
5164 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5165 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5166 * So add it only if the platform doesn't already.
5168 if (x
== 0 && (flags
& F_ALT
) && (type
== 'x' || type
== 'X')) {
5169 /* Only way to know what the platform does is to try it. */
5170 PyOS_snprintf(fmt
, sizeof(fmt
), type
== 'x' ? "%#x" : "%#X", 0);
5171 if (fmt
[1] != (char)type
) {
5172 /* Supply our own leading 0x/0X -- needed under std C */
5173 use_native_c_format
= 0;
5174 PyOS_snprintf(fmt
, sizeof(fmt
), "0%c%%#.%dl%c", type
, prec
, type
);
5177 if (use_native_c_format
)
5178 PyOS_snprintf(fmt
, sizeof(fmt
), "%%%s.%dl%c",
5179 (flags
& F_ALT
) ? "#" : "", prec
, type
);
5180 return usprintf(buf
, fmt
, x
);
5184 formatchar(Py_UNICODE
*buf
,
5188 /* presume that the buffer is at least 2 characters long */
5189 if (PyUnicode_Check(v
)) {
5190 if (PyUnicode_GET_SIZE(v
) != 1)
5192 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
5195 else if (PyString_Check(v
)) {
5196 if (PyString_GET_SIZE(v
) != 1)
5198 buf
[0] = (Py_UNICODE
)PyString_AS_STRING(v
)[0];
5202 /* Integer input truncated to a character */
5204 x
= PyInt_AsLong(v
);
5205 if (x
== -1 && PyErr_Occurred())
5213 PyErr_SetString(PyExc_TypeError
,
5214 "%c requires int or char");
5218 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5220 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5221 chars are formatted. XXX This is a magic number. Each formatting
5222 routine does bounds checking to ensure no overflow, but a better
5223 solution may be to malloc a buffer of appropriate size for each
5224 format. For now, the current solution is sufficient.
5226 #define FORMATBUFLEN (size_t)120
/* PyUnicode_Format: apply the % formatting operator to a unicode
   format string.

   Visible behaviour in this listing:
   - format and args must both be non-NULL, otherwise
     PyErr_BadInternalCall() is invoked;
   - format is converted with PyUnicode_FromObject() and scanned one
     Py_UNICODE at a time;
   - args is either a tuple of positional values or an object used
     for key lookups with PyObject_GetItem();
   - the result is built in a unicode object created with
     _PyUnicode_New() and grown with _PyUnicode_Resize(); it is
     trimmed to the written length and returned as a PyObject pointer.

   Errors raised in the visible code: TypeError ("format requires a
   mapping", "%s argument has non-string str()", "not all arguments
   converted") and ValueError ("incomplete format key", "incomplete
   format", "unsupported format character ...").

   NOTE(review): many statements (returns, error exits, several
   branch conditions and declarations) are not visible in this
   listing; the comments below describe only what is shown. */
5228 PyObject
*PyUnicode_Format(PyObject
*format
,
5231 Py_UNICODE
*fmt
, *res
;
5232 int fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
5234 PyUnicodeObject
*result
= NULL
;
5235 PyObject
*dict
= NULL
;
/* Both arguments are required; NULL is treated as an internal error. */
5238 if (format
== NULL
|| args
== NULL
) {
5239 PyErr_BadInternalCall();
/* Work on a unicode version of the format object. */
5242 uformat
= PyUnicode_FromObject(format
);
5243 if (uformat
== NULL
)
5245 fmt
= PyUnicode_AS_UNICODE(uformat
);
5246 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
/* Initial result size: the format length plus 100 spare characters.
   reslen is the allocated length, rescnt the space still unused
   (the write position is always reslen - rescnt). */
5248 reslen
= rescnt
= fmtcnt
+ 100;
5249 result
= _PyUnicode_New(reslen
);
5252 res
= PyUnicode_AS_UNICODE(result
);
/* A tuple supplies the positional arguments; arglen is its size. */
5254 if (PyTuple_Check(args
)) {
5255 arglen
= PyTuple_Size(args
);
/* An argument whose type provides mapping slots is tested for here;
   the statement guarded by this test is not shown in this listing. */
5262 if (args
->ob_type
->tp_as_mapping
)
/* Main loop: fmtcnt counts the format characters still to consume. */
5265 while (--fmtcnt
>= 0) {
/* Growing the result: res is re-derived afterwards because
   _PyUnicode_Resize() receives the address of result and may
   therefore change it. */
5268 rescnt
= fmtcnt
+ 100;
5270 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5272 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
5278 /* Got a format specifier */
5282 Py_UNICODE c
= '\0';
5285 PyObject
*temp
= NULL
;
5289 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{float,int,char}() */
5293 Py_UNICODE
*keystart
;
5299 PyErr_SetString(PyExc_TypeError
,
5300 "format requires a mapping");
5306 /* Skip over balanced parentheses */
5307 while (pcount
> 0 && --fmtcnt
>= 0) {
5310 else if (*fmt
== '(')
/* Length of the key text that starts at keystart; the -1 presumably
   excludes the closing parenthesis that fmt has already passed
   (the pointer increments are not shown; confirm). */
5314 keylen
= fmt
- keystart
- 1;
/* Running out of format text, or leaving a parenthesis open, is an
   error. */
5315 if (fmtcnt
< 0 || pcount
> 0) {
5316 PyErr_SetString(PyExc_ValueError
,
5317 "incomplete format key");
5321 /* keys are converted to strings using UTF-8 and
5322 then looked up since Python uses strings to hold
5323 variables names etc. in its namespaces and we
5324 wouldn't want to break common idioms. */
5325 key
= PyUnicode_EncodeUTF8(keystart
,
/* NOTE(review): a second assignment to key follows; whatever selects
   between the two (probably preprocessor conditionals) is not
   visible in this listing. */
5329 key
= PyUnicode_FromUnicode(keystart
, keylen
);
/* The value found under the key becomes the argument source. */
5337 args
= PyObject_GetItem(dict
, key
);
/* Flag characters: each one sets a bit in flags and scanning
   continues with the next format character. */
5346 while (--fmtcnt
>= 0) {
5347 switch (c
= *fmt
++) {
5348 case '-': flags
|= F_LJUST
; continue;
5349 case '+': flags
|= F_SIGN
; continue;
5350 case ' ': flags
|= F_BLANK
; continue;
5351 case '#': flags
|= F_ALT
; continue;
5352 case '0': flags
|= F_ZERO
; continue;
/* Width taken from the next argument, which must be an int (the
   test that selects this branch is not shown). */
5357 v
= getnextarg(args
, arglen
, &argidx
);
5360 if (!PyInt_Check(v
)) {
5361 PyErr_SetString(PyExc_TypeError
,
5365 width
= PyInt_AsLong(v
);
/* Width written as decimal digits in the format string. */
5373 else if (c
>= '0' && c
<= '9') {
5375 while (--fmtcnt
>= 0) {
5377 if (c
< '0' || c
> '9')
/* Overflow test performed before another digit is appended.
   NOTE(review): if width is a signed integer this multiply/divide
   check depends on wrap-around behaviour; confirm. */
5379 if ((width
*10) / 10 != width
) {
5380 PyErr_SetString(PyExc_ValueError
,
5384 width
= width
*10 + (c
- '0');
/* Precision taken from the next argument, which must be an int (the
   test that selects this branch is not shown). */
5392 v
= getnextarg(args
, arglen
, &argidx
);
5395 if (!PyInt_Check(v
)) {
5396 PyErr_SetString(PyExc_TypeError
,
5400 prec
= PyInt_AsLong(v
);
/* Precision written as decimal digits in the format string. */
5406 else if (c
>= '0' && c
<= '9') {
5408 while (--fmtcnt
>= 0) {
/* NOTE(review): Py_CHARMASK is applied to a Py_UNICODE here.  If the
   macro keeps only the low 8 bits, a non-ASCII character whose low
   byte is an ASCII digit would be accepted as a precision digit;
   confirm against the macro definition. */
5409 c
= Py_CHARMASK(*fmt
++);
5410 if (c
< '0' || c
> '9')
/* Same overflow test as for the width. */
5412 if ((prec
*10) / 10 != prec
) {
5413 PyErr_SetString(PyExc_ValueError
,
5417 prec
= prec
*10 + (c
- '0');
/* The length modifiers h, l and L are recognised here; what is done
   with them is not shown. */
5422 if (c
== 'h' || c
== 'l' || c
== 'L') {
5428 PyErr_SetString(PyExc_ValueError
,
5429 "incomplete format");
/* Fetch the argument for the conversion itself. */
5433 v
= getnextarg(args
, arglen
, &argidx
);
5443 /* presume that buffer length is at least 1 */
/* A unicode argument combined with %s is special-cased (handling not
   shown); other objects go through PyObject_Str() or PyObject_Repr(),
   selected by a test that is not shown either. */
5450 if (PyUnicode_Check(v
) && c
== 's') {
5457 temp
= PyObject_Str(v
);
5459 temp
= PyObject_Repr(v
);
5462 if (!PyString_Check(temp
)) {
5463 /* XXX Note: this should never happen, since
5464 PyObject_Repr() and PyObject_Str() assure
5467 PyErr_SetString(PyExc_TypeError
,
5468 "%s argument has non-string str()");
5471 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
5472 PyString_GET_SIZE(temp
),
5480 pbuf
= PyUnicode_AS_UNICODE(temp
);
5481 len
= PyUnicode_GET_SIZE(temp
);
5482 if (prec
>= 0 && len
> prec
)
5494 if (PyLong_Check(v
)) {
5495 temp
= formatlong(v
, flags
, prec
, c
);
5498 pbuf
= PyUnicode_AS_UNICODE(temp
);
5499 len
= PyUnicode_GET_SIZE(temp
);
5500 /* unbounded ints can always produce
5501 a sign character! */
/* The helpers formatint(), formatfloat() and formatchar() receive
   the capacity of formatbuf counted in Py_UNICODE elements. */
5506 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5510 /* only d conversion is signed */
5523 len
= formatfloat(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
5534 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
/* Unknown conversion character: the message shows the character
   itself when it lies in the range 31..126 and '?' otherwise,
   followed by its code in hex and its offset from the start of the
   format string. */
5540 PyErr_Format(PyExc_ValueError
,
5541 "unsupported format character '%c' (0x%x) "
5543 (31<=c
&& c
<=126) ? (int)c
: '?',
5544 (int)c
, (fmt
-1 - PyUnicode_AS_UNICODE(uformat
)));
/* Sign handling: a leading '-' or '+' in the converted text is
   detected first, then the F_SIGN and F_BLANK flags are consulted
   (the statements in each branch are not shown). */
5548 if (*pbuf
== '-' || *pbuf
== '+') {
5552 else if (flags
& F_SIGN
)
5554 else if (flags
& F_BLANK
)
/* Make sure the result has room for the field (width plus one for a
   sign, if any); otherwise grow it and re-derive res. */
5561 if (rescnt
< width
+ (sign
!= 0)) {
5563 rescnt
= width
+ fmtcnt
+ 100;
5565 if (_PyUnicode_Resize(&result
, reslen
) < 0)
5567 res
= PyUnicode_AS_UNICODE(result
)
/* With the alternate form of %x or %X the converted text is expected
   to begin with '0' followed by the conversion character. */
5577 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5578 assert(pbuf
[0] == '0');
5579 assert(pbuf
[1] == c
);
/* Field wider than the text and no left-justify flag: a do/while
   loop runs until width has been reduced to len (presumably writing
   leading padding; the loop body is not shown). */
5590 if (width
> len
&& !(flags
& F_LJUST
)) {
5594 } while (--width
> len
);
5599 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
5600 assert(pbuf
[0] == '0');
5601 assert(pbuf
[1] == c
);
/* Copy the converted text into the result. */
5606 Py_UNICODE_COPY(res
, pbuf
, len
);
/* Any width still remaining is consumed after the text. */
5609 while (--width
>= len
) {
/* When a mapping is in use, unconsumed positional arguments are an
   error unless the conversion character was '%'. */
5613 if (dict
&& (argidx
< arglen
) && c
!= '%') {
5614 PyErr_SetString(PyExc_TypeError
,
5615 "not all arguments converted");
/* Without a mapping, leftover arguments are an error as well. */
5621 if (argidx
< arglen
&& !dict
) {
5622 PyErr_SetString(PyExc_TypeError
,
5623 "not all arguments converted");
/* Trim the result to the number of characters actually written. */
5631 if (_PyUnicode_Resize(&result
, reslen
- rescnt
))
5633 return (PyObject
*)result
;
/* Buffer interface table of the unicode type.  PyUnicode_Type refers
   to it through its tp_as_buffer slot.  The four entries are, in
   order, the read-buffer, write-buffer, segment-count and
   char-buffer handlers, each cast to the matching slot type. */
5644 static PyBufferProcs unicode_as_buffer
= {
5645 (getreadbufferproc
) unicode_buffer_getreadbuf
,
5646 (getwritebufferproc
) unicode_buffer_getwritebuf
,
5647 (getsegcountproc
) unicode_buffer_getsegcount
,
5648 (getcharbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new() calls unicode_subtype_new(),
   whose definition comes after unicode_new() in this file. */
5651 staticforward PyObject
*
5652 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
/* unicode_new: the tp_new handler installed in PyUnicode_Type.

   type  - the type being instantiated
   args  - positional arguments
   kwds  - keyword arguments; accepted names are "string",
           "encoding" and "errors"

   Returns a new object reference.  NOTE(review): the declaration of
   x and the statements on the failure paths are not visible in this
   listing. */
5655 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5658 static char *kwlist
[] = {"string", "encoding", "errors", 0};
5659 char *encoding
= NULL
;
5660 char *errors
= NULL
;
/* Any type other than the exact unicode type is handed to the
   subtype constructor. */
5662 if (type
!= &PyUnicode_Type
)
5663 return unicode_subtype_new(type
, args
, kwds
);
/* "|Oss": all three arguments are optional; the first is an arbitrary
   object, the other two are C strings. */
5664 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
5665 kwlist
, &x
, &encoding
, &errors
))
/* An empty unicode object is returned on a path whose condition is
   not shown (presumably when no object argument was given; confirm). */
5668 return (PyObject
*)_PyUnicode_New(0);
/* With neither encoding nor errors given the object is converted via
   PyObject_Unicode(); otherwise it is decoded with the requested
   encoding and error handling. */
5669 if (encoding
== NULL
&& errors
== NULL
)
5670 return PyObject_Unicode(x
);
5672 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
/* unicode_subtype_new: construct an instance of a subtype of unicode.

   A temporary object of the exact unicode type is built first by
   calling unicode_new() with the same arguments; its character data
   and hash are then copied into an instance obtained from the
   subtype's tp_alloc.

   NOTE(review): the declaration of n, the NULL checks on tmp and
   pnew, the assignment of the length field and the release of tmp
   are not visible in this listing. */
5676 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
5678 PyUnicodeObject
*tmp
, *pnew
;
5681 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
5682 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
5685 assert(PyUnicode_Check(tmp
));
/* n is set to the length of the temporary as part of the call. */
5686 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
/* Separate character buffer with one extra element beyond the n
   characters. */
5689 pnew
->str
= PyMem_NEW(Py_UNICODE
, n
+1);
/* Allocation failure path; the remaining cleanup is not shown. */
5690 if (pnew
->str
== NULL
) {
5691 _Py_ForgetReference((PyObject
*)pnew
);
/* Copy n+1 elements, i.e. including the element that follows the
   last character of tmp. */
5695 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
/* The hash value stored in the temporary is carried over. */
5697 pnew
->hash
= tmp
->hash
;
5699 return (PyObject
*)pnew
;
/* Documentation string of the unicode type; PyUnicode_Type refers to
   it through its tp_doc slot.  The literal is continued across source
   lines with backslash-newline. */
5702 static char unicode_doc
[] =
5703 "unicode(string [, encoding[, errors]]) -> object\n\
5705 Create a new Unicode object from the given encoded string.\n\
5706 encoding defaults to the current default string encoding and \n\
5707 errors, defining the error handling, to 'strict'.";
/* Type object of the unicode type.

   The slots filled in below wire up deallocation, comparison, repr,
   str and hashing, the sequence and buffer protocol tables, generic
   attribute lookup, the method table, the documentation string and
   the constructor unicode_new().  Py_TPFLAGS_BASETYPE is set in
   tp_flags; unicode_new() passes requests for other types on to
   unicode_subtype_new().

   NOTE(review): some slots of the initializer are not visible in
   this listing. */
5709 PyTypeObject PyUnicode_Type
= {
5710 PyObject_HEAD_INIT(&PyType_Type
)
5712 "unicode", /* tp_name */
5713 sizeof(PyUnicodeObject
), /* tp_basicsize */
5714 0, /* tp_itemsize */
5716 (destructor
)unicode_dealloc
, /* tp_dealloc */
5720 (cmpfunc
) unicode_compare
, /* tp_compare */
5721 (reprfunc
) unicode_repr
, /* tp_repr */
5722 0, /* tp_as_number */
5723 &unicode_as_sequence
, /* tp_as_sequence */
5724 0, /* tp_as_mapping */
5725 (hashfunc
) unicode_hash
, /* tp_hash */
5727 (reprfunc
) unicode_str
, /* tp_str */
5728 PyObject_GenericGetAttr
, /* tp_getattro */
5729 0, /* tp_setattro */
5730 &unicode_as_buffer
, /* tp_as_buffer */
5731 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_BASETYPE
, /* tp_flags */
5732 unicode_doc
, /* tp_doc */
5733 0, /* tp_traverse */
5735 0, /* tp_richcompare */
5736 0, /* tp_weaklistoffset */
5738 0, /* tp_iternext */
5739 unicode_methods
, /* tp_methods */
5744 0, /* tp_descr_get */
5745 0, /* tp_descr_set */
5746 0, /* tp_dictoffset */
5749 unicode_new
, /* tp_new */
5750 _PyObject_Del
, /* tp_free */
5753 /* Initialize the Unicode implementation */
/* Resets the module-level state: empties the free list, creates the
   shared empty unicode object, sets the default encoding name to
   "ascii" and clears the 256-entry unicode_latin1 table.
   NOTE(review): no check of the _PyUnicode_New(0) result is visible
   in this listing. */
5755 void _PyUnicode_Init(void)
5759 /* Init the implementation */
5760 unicode_freelist
= NULL
;
5761 unicode_freelist_size
= 0;
5762 unicode_empty
= _PyUnicode_New(0);
5763 strcpy(unicode_default_encoding
, "ascii");
/* Every entry of the unicode_latin1 table starts out empty. */
5764 for (i
= 0; i
< 256; i
++)
5765 unicode_latin1
[i
] = NULL
;
5768 /* Finalize the Unicode implementation */
/* Drops the reference to the shared empty object, releases every
   entry of the unicode_latin1 table, walks the free list and finally
   resets the free list head and counter.
   NOTE(review): the declarations, part of the per-object release in
   the free list loop, and the closing brace are not visible in this
   listing. */
5771 _PyUnicode_Fini(void)
5776 Py_XDECREF(unicode_empty
);
5777 unicode_empty
= NULL
;
/* Release the entries of the unicode_latin1 table that are set. */
5779 for (i
= 0; i
< 256; i
++) {
5780 if (unicode_latin1
[i
]) {
5781 Py_DECREF(unicode_latin1
[i
]);
5782 unicode_latin1
[i
] = NULL
;
/* Free list walk: the link to the next entry is read from the start
   of each object (see the cast below) and is fetched into u before
   the current entry v is processed. */
5786 for (u
= unicode_freelist
; u
!= NULL
;) {
5787 PyUnicodeObject
*v
= u
;
5788 u
= *(PyUnicodeObject
**)u
;
5791 Py_XDECREF(v
->defenc
);
5794 unicode_freelist
= NULL
;
5795 unicode_freelist_size
= 0;