Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WIN32
  45 #include <windows.h>
  46 #endif
  47
/* Limit for the Unicode object free list: _PyUnicode_Free() only parks a
   released object on the free list while fewer than this many objects
   are already parked there. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off, because
   every object then has a length >= the limit and loses its buffer
   when it is put on the free list.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
  78
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized word of each parked object holds the address of the next
   parked object (see _PyUnicode_New() and _PyUnicode_Free()). */
static PyUnicodeObject *unicode_freelist;
/* Number of objects currently parked on unicode_freelist. */
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Slots are indexed by the character's ordinal and are
   filled lazily by PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax(void)
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          unicode->str[0] < 256 &&
 136          unicode_latin1[unicode->str[0]] == unicode)) {
 137         PyErr_SetString(PyExc_SystemError,
 138                         "can't resize shared unicode objects");
 139         return -1;
 140     }
 141
 142     /* We allocate one more byte to make sure the string is
 143        Ux0000 terminated -- XXX is this needed ? */
 144     oldstr = unicode->str;
 145     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 146     if (!unicode->str) {
 147         unicode->str = oldstr;
 148         PyErr_NoMemory();
 149         return -1;
 150     }
 151     unicode->str[length] = 0;
 152     unicode->length = length;
 153
 154  reset:
 155     /* Reset the object caches */
 156     if (unicode->defenc) {
 157         Py_DECREF(unicode->defenc);
 158         unicode->defenc = NULL;
 159     }
 160     unicode->hash = -1;
 161
 162     return 0;
 163 }
 164
 165 /* We allocate one more byte to make sure the string is
 166    Ux0000 terminated -- XXX is this needed ?
 167
 168    XXX This allocator could further be enhanced by assuring that the
 169        free list never reduces its size below 1.
 170
 171 */
 172
 173 static
 174 PyUnicodeObject *_PyUnicode_New(int length)
 175 {
 176     register PyUnicodeObject *unicode;
 177
 178     /* Optimization for empty strings */
 179     if (length == 0 && unicode_empty != NULL) {
 180         Py_INCREF(unicode_empty);
 181         return unicode_empty;
 182     }
 183
 184     /* Unicode freelist & memory allocation */
 185     if (unicode_freelist) {
 186         unicode = unicode_freelist;
 187         unicode_freelist = *(PyUnicodeObject **)unicode;
 188         unicode_freelist_size--;
 189         if (unicode->str) {
 190             /* Keep-Alive optimization: we only upsize the buffer,
 191                never downsize it. */
 192             if ((unicode->length < length) &&
 193                 unicode_resize(unicode, length)) {
 194                 PyMem_DEL(unicode->str);
 195                 goto onError;
 196             }
 197         }
 198         else {
 199             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 200         }
 201         PyObject_INIT(unicode, &PyUnicode_Type);
 202     }
 203     else {
 204         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 205         if (unicode == NULL)
 206             return NULL;
 207         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 208     }
 209
 210     if (!unicode->str) {
 211         PyErr_NoMemory();
 212         goto onError;
 213     }
 214     unicode->str[length] = 0;
 215     unicode->length = length;
 216     unicode->hash = -1;
 217     unicode->defenc = NULL;
 218     return unicode;
 219
 220  onError:
 221     _Py_ForgetReference((PyObject *)unicode);
 222     PyObject_DEL(unicode);
 223     return NULL;
 224 }
 225
 226 static
 227 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 228 {
 229     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 230         /* Keep-Alive optimization */
 231         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 232             PyMem_DEL(unicode->str);
 233             unicode->str = NULL;
 234             unicode->length = 0;
 235         }
 236         if (unicode->defenc) {
 237             Py_DECREF(unicode->defenc);
 238             unicode->defenc = NULL;
 239         }
 240         /* Add to free list */
 241         *(PyUnicodeObject **)unicode = unicode_freelist;
 242         unicode_freelist = unicode;
 243         unicode_freelist_size++;
 244     }
 245     else {
 246         PyMem_DEL(unicode->str);
 247         Py_XDECREF(unicode->defenc);
 248         PyObject_DEL(unicode);
 249     }
 250 }
 251
 252 int PyUnicode_Resize(PyObject **unicode,
 253                      int length)
 254 {
 255     register PyUnicodeObject *v;
 256
 257     /* Argument checks */
 258     if (unicode == NULL) {
 259         PyErr_BadInternalCall();
 260         return -1;
 261     }
 262     v = (PyUnicodeObject *)*unicode;
 263     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 264         PyErr_BadInternalCall();
 265         return -1;
 266     }
 267
 268     /* Resizing unicode_empty and single character objects is not
 269        possible since these are being shared. We simply return a fresh
 270        copy with the same Unicode content. */
 271     if (v->length != length &&
 272         (v == unicode_empty || v->length == 1)) {
 273         PyUnicodeObject *w = _PyUnicode_New(length);
 274         if (w == NULL)
 275             return -1;
 276         Py_UNICODE_COPY(w->str, v->str,
 277                         length < v->length ? length : v->length);
 278         *unicode = (PyObject *)w;
 279         return 0;
 280     }
 281
 282     /* Note that we don't have to modify *unicode for unshared Unicode
 283        objects, since we can modify them in-place. */
 284     return unicode_resize(v, length);
 285 }
 286
 287 /* Internal API for use in unicodeobject.c only ! */
 288 #define _PyUnicode_Resize(unicodevar, length) \
 289         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 290
 291 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 292                                 int size)
 293 {
 294     PyUnicodeObject *unicode;
 295
 296     /* If the Unicode data is known at construction time, we can apply
 297        some optimizations which share commonly used objects. */
 298     if (u != NULL) {
 299
 300         /* Optimization for empty strings */
 301         if (size == 0 && unicode_empty != NULL) {
 302             Py_INCREF(unicode_empty);
 303             return (PyObject *)unicode_empty;
 304         }
 305
 306         /* Single character Unicode objects in the Latin-1 range are
 307            shared when using this constructor */
 308         if (size == 1 && *u < 256) {
 309             unicode = unicode_latin1[*u];
 310             if (!unicode) {
 311                 unicode = _PyUnicode_New(1);
 312                 if (!unicode)
 313                     return NULL;
 314                 unicode->str[0] = *u;
 315                 unicode_latin1[*u] = unicode;
 316             }
 317             Py_INCREF(unicode);
 318             return (PyObject *)unicode;
 319         }
 320     }
 321
 322     unicode = _PyUnicode_New(size);
 323     if (!unicode)
 324         return NULL;
 325
 326     /* Copy the Unicode data into the new object */
 327     if (u != NULL)
 328         Py_UNICODE_COPY(unicode->str, u, size);
 329
 330     return (PyObject *)unicode;
 331 }
 332
 333 #ifdef HAVE_WCHAR_H
 334
 335 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 336                                  int size)
 337 {
 338     PyUnicodeObject *unicode;
 339
 340     if (w == NULL) {
 341         PyErr_BadInternalCall();
 342         return NULL;
 343     }
 344
 345     unicode = _PyUnicode_New(size);
 346     if (!unicode)
 347         return NULL;
 348
 349     /* Copy the wchar_t data into the new object */
 350 #ifdef HAVE_USABLE_WCHAR_T
 351     memcpy(unicode->str, w, size * sizeof(wchar_t));
 352 #else
 353     {
 354         register Py_UNICODE *u;
 355         register int i;
 356         u = PyUnicode_AS_UNICODE(unicode);
 357         for (i = size; i >= 0; i--)
 358             *u++ = *w++;
 359     }
 360 #endif
 361
 362     return (PyObject *)unicode;
 363 }
 364
 365 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 366                          register wchar_t *w,
 367                          int size)
 368 {
 369     if (unicode == NULL) {
 370         PyErr_BadInternalCall();
 371         return -1;
 372     }
 373     if (size > PyUnicode_GET_SIZE(unicode))
 374         size = PyUnicode_GET_SIZE(unicode);
 375 #ifdef HAVE_USABLE_WCHAR_T
 376     memcpy(w, unicode->str, size * sizeof(wchar_t));
 377 #else
 378     {
 379         register Py_UNICODE *u;
 380         register int i;
 381         u = PyUnicode_AS_UNICODE(unicode);
 382         for (i = size; i >= 0; i--)
 383             *w++ = *u++;
 384     }
 385 #endif
 386
 387     return size;
 388 }
 389
 390 #endif
 391
 392 PyObject *PyUnicode_FromObject(register PyObject *obj)
 393 {
 394     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 395 }
 396
 397 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 398                                       const char *encoding,
 399                                       const char *errors)
 400 {
 401     const char *s = NULL;
 402     int len;
 403     int owned = 0;
 404     PyObject *v;
 405     int reclevel;
 406
 407     if (obj == NULL) {
 408         PyErr_BadInternalCall();
 409         return NULL;
 410     }
 411
 412     /* Coerce object */
 413     for (reclevel = 0; reclevel < 2; reclevel++) {
 414
 415         if (PyUnicode_Check(obj)) {
 416             if (encoding) {
 417                 PyErr_SetString(PyExc_TypeError,
 418                                 "decoding Unicode is not supported");
 419                 goto onError;
 420             }
 421             if (PyUnicode_CheckExact(obj)) {
 422                 Py_INCREF(obj);
 423                 v = obj;
 424             }
 425             else {
 426                 /* For a subclass of unicode, return a true unicode object
 427                    with the same string value. */
 428                 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 429                                           PyUnicode_GET_SIZE(obj));
 430             }
 431             goto done;
 432         }
 433         else if (PyString_Check(obj)) {
 434             s = PyString_AS_STRING(obj);
 435             len = PyString_GET_SIZE(obj);
 436             break;
 437         }
 438         else {
 439             PyObject *w;
 440
 441             /* Try char buffer interface */
 442             if (PyObject_AsCharBuffer(obj, &s, &len))
 443                 PyErr_Clear();
 444             else
 445                 break;
 446
 447             /* Mimic the behaviour of str(object) if everything else
 448                fails (see PyObject_Str()); this also covers instances
 449                which implement __str__. */
 450             if (obj->ob_type->tp_str == NULL)
 451                 w = PyObject_Repr(obj);
 452             else
 453                 w = (*obj->ob_type->tp_str)(obj);
 454             if (w == NULL)
 455                 goto onError;
 456             if (owned) {
 457                 Py_DECREF(obj);
 458             }
 459             obj = w;
 460             owned = 1;
 461         }
 462     }
 463
 464     if (s == NULL) {
 465         PyErr_Format(PyExc_TypeError,
 466                      "coercing to Unicode: __str__ recursion limit exceeded "
 467                      "(last type: %.80s)",
 468                      obj->ob_type->tp_name);
 469         goto onError;
 470     }
 471
 472     /* Convert to Unicode */
 473     if (len == 0) {
 474         Py_INCREF(unicode_empty);
 475         v = (PyObject *)unicode_empty;
 476     }
 477     else
 478         v = PyUnicode_Decode(s, len, encoding, errors);
 479
 480  done:
 481     if (owned) {
 482         Py_DECREF(obj);
 483     }
 484     return v;
 485
 486  onError:
 487     if (owned) {
 488         Py_DECREF(obj);
 489     }
 490     return NULL;
 491 }
 492
 493 PyObject *PyUnicode_Decode(const char *s,
 494                            int size,
 495                            const char *encoding,
 496                            const char *errors)
 497 {
 498     PyObject *buffer = NULL, *unicode;
 499
 500     if (encoding == NULL)
 501         encoding = PyUnicode_GetDefaultEncoding();
 502
 503     /* Shortcuts for common default encodings */
 504     if (strcmp(encoding, "utf-8") == 0)
 505         return PyUnicode_DecodeUTF8(s, size, errors);
 506     else if (strcmp(encoding, "latin-1") == 0)
 507         return PyUnicode_DecodeLatin1(s, size, errors);
 508     else if (strcmp(encoding, "ascii") == 0)
 509         return PyUnicode_DecodeASCII(s, size, errors);
 510
 511     /* Decode via the codec registry */
 512     buffer = PyBuffer_FromMemory((void *)s, size);
 513     if (buffer == NULL)
 514         goto onError;
 515     unicode = PyCodec_Decode(buffer, encoding, errors);
 516     if (unicode == NULL)
 517         goto onError;
 518     if (!PyUnicode_Check(unicode)) {
 519         PyErr_Format(PyExc_TypeError,
 520                      "decoder did not return an unicode object (type=%.400s)",
 521                      unicode->ob_type->tp_name);
 522         Py_DECREF(unicode);
 523         goto onError;
 524     }
 525     Py_DECREF(buffer);
 526     return unicode;
 527
 528  onError:
 529     Py_XDECREF(buffer);
 530     return NULL;
 531 }
 532
 533 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 534                            int size,
 535                            const char *encoding,
 536                            const char *errors)
 537 {
 538     PyObject *v, *unicode;
 539
 540     unicode = PyUnicode_FromUnicode(s, size);
 541     if (unicode == NULL)
 542         return NULL;
 543     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 544     Py_DECREF(unicode);
 545     return v;
 546 }
 547
 548 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 549                                     const char *encoding,
 550                                     const char *errors)
 551 {
 552     PyObject *v;
 553
 554     if (!PyUnicode_Check(unicode)) {
 555         PyErr_BadArgument();
 556         goto onError;
 557     }
 558
 559     if (encoding == NULL)
 560         encoding = PyUnicode_GetDefaultEncoding();
 561
 562     /* Shortcuts for common default encodings */
 563     if (errors == NULL) {
 564         if (strcmp(encoding, "utf-8") == 0)
 565             return PyUnicode_AsUTF8String(unicode);
 566         else if (strcmp(encoding, "latin-1") == 0)
 567             return PyUnicode_AsLatin1String(unicode);
 568         else if (strcmp(encoding, "ascii") == 0)
 569             return PyUnicode_AsASCIIString(unicode);
 570     }
 571
 572     /* Encode via the codec registry */
 573     v = PyCodec_Encode(unicode, encoding, errors);
 574     if (v == NULL)
 575         goto onError;
 576     /* XXX Should we really enforce this ? */
 577     if (!PyString_Check(v)) {
 578         PyErr_Format(PyExc_TypeError,
 579                      "encoder did not return a string object (type=%.400s)",
 580                      v->ob_type->tp_name);
 581         Py_DECREF(v);
 582         goto onError;
 583     }
 584     return v;
 585
 586  onError:
 587     return NULL;
 588 }
 589
 590 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 591                                             const char *errors)
 592 {
 593     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 594
 595     if (v)
 596         return v;
 597     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 598     if (v && errors == NULL)
 599         ((PyUnicodeObject *)unicode)->defenc = v;
 600     return v;
 601 }
 602
 603 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 604 {
 605     if (!PyUnicode_Check(unicode)) {
 606         PyErr_BadArgument();
 607         goto onError;
 608     }
 609     return PyUnicode_AS_UNICODE(unicode);
 610
 611  onError:
 612     return NULL;
 613 }
 614
 615 int PyUnicode_GetSize(PyObject *unicode)
 616 {
 617     if (!PyUnicode_Check(unicode)) {
 618         PyErr_BadArgument();
 619         goto onError;
 620     }
 621     return PyUnicode_GET_SIZE(unicode);
 622
 623  onError:
 624     return -1;
 625 }
 626
 627 const char *PyUnicode_GetDefaultEncoding(void)
 628 {
 629     return unicode_default_encoding;
 630 }
 631
 632 int PyUnicode_SetDefaultEncoding(const char *encoding)
 633 {
 634     PyObject *v;
 635
 636     /* Make sure the encoding is valid. As side effect, this also
 637        loads the encoding into the codec registry cache. */
 638     v = _PyCodec_Lookup(encoding);
 639     if (v == NULL)
 640         goto onError;
 641     Py_DECREF(v);
 642     strncpy(unicode_default_encoding,
 643             encoding,
 644             sizeof(unicode_default_encoding));
 645     return 0;
 646
 647  onError:
 648     return -1;
 649 }
 650
 651 /* --- UTF-7 Codec -------------------------------------------------------- */
 652
 653 /* see RFC2152 for details */
 654
 655 static
 656 char utf7_special[128] = {
 657     /* indicate whether a UTF-7 character is special i.e. cannot be directly
 658        encoded:
 659            0 - not special
 660            1 - special
 661            2 - whitespace (optional)
 662            3 - RFC2152 Set O (optional) */
 663     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
 664     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 665     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
 666     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
 667     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 668     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
 669     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 670     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 671
 672 };
 673
 674 #define SPECIAL(c, encodeO, encodeWS) \
 675         (((c)>127 || utf7_special[(c)] == 1) || \
 676          (encodeWS && (utf7_special[(c)] == 2)) || \
 677      (encodeO && (utf7_special[(c)] == 3)))
 678
 679 #define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
 680 #define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
 681 #define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
 682                         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
 683
 684 #define ENCODE(out, ch, bits) \
 685     while (bits >= 6) { \
 686         *out++ = B64(ch >> (bits-6)); \
 687         bits -= 6; \
 688     }
 689
 690 #define DECODE(out, ch, bits, surrogate) \
 691     while (bits >= 16) { \
 692         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
 693         bits -= 16; \
 694                 if (surrogate) { \
 695                         /* We have already generated an error for the high surrogate
 696                so let's not bother seeing if the low surrogate is correct or not */\
 697                         surrogate = 0; \
 698                 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
 699             /* This is a surrogate pair. Unfortunately we can't represent \
 700                it in a 16-bit character */ \
 701                         surrogate = 1; \
 702             errmsg = "code pairs are not supported"; \
 703                 goto utf7Error; \
 704                 } else { \
 705                                 *out++ = outCh; \
 706                 } \
 707     } \
 708
 709 static
 710 int utf7_decoding_error(Py_UNICODE **dest,
 711                         const char *errors,
 712                         const char *details)
 713 {
 714     if ((errors == NULL) ||
 715         (strcmp(errors,"strict") == 0)) {
 716         PyErr_Format(PyExc_UnicodeError,
 717                      "UTF-7 decoding error: %.400s",
 718                      details);
 719         return -1;
 720     }
 721     else if (strcmp(errors,"ignore") == 0) {
 722         return 0;
 723     }
 724     else if (strcmp(errors,"replace") == 0) {
 725         if (dest != NULL) {
 726             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 727             (*dest)++;
 728         }
 729         return 0;
 730     }
 731     else {
 732         PyErr_Format(PyExc_ValueError,
 733                      "UTF-7 decoding error; unknown error handling code: %.400s",
 734                      errors);
 735         return -1;
 736     }
 737 }
 738
 739 PyObject *PyUnicode_DecodeUTF7(const char *s,
 740                                int size,
 741                                const char *errors)
 742 {
 743     const char *e;
 744     PyUnicodeObject *unicode;
 745     Py_UNICODE *p;
 746     const char *errmsg = "";
 747     int inShift = 0;
 748     unsigned int bitsleft = 0;
 749     unsigned long charsleft = 0;
 750         int surrogate = 0;
 751
 752     unicode = _PyUnicode_New(size);
 753     if (!unicode)
 754         return NULL;
 755     if (size == 0)
 756         return (PyObject *)unicode;
 757
 758     p = unicode->str;
 759     e = s + size;
 760
 761     while (s < e) {
 762         Py_UNICODE ch = *s;
 763
 764         if (inShift) {
 765             if ((ch == '-') || !B64CHAR(ch)) {
 766                 inShift = 0;
 767                 s++;
 768
 769                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 770                 if (bitsleft >= 6) {
 771                     /* The shift sequence has a partial character in it. If
 772                        bitsleft < 6 then we could just classify it as padding
 773                        but that is not the case here */
 774
 775                     errmsg = "partial character in shift sequence";
 776                     goto utf7Error;
 777                 }
 778                 /* According to RFC2152 the remaining bits should be zero. We
 779                    choose to signal an error/insert a replacement character
 780                    here so indicate the potential of a misencoded character. */
 781
 782                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 783                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 784                     errmsg = "non-zero padding bits in shift sequence";
 785                     goto utf7Error;
 786                 }
 787
 788                 if (ch == '-') {
 789                     if ((s < e) && (*(s) == '-')) {
 790                         *p++ = '-';
 791                         inShift = 1;
 792                     }
 793                 } else if (SPECIAL(ch,0,0)) {
 794                     errmsg = "unexpected special character";
 795                         goto utf7Error;
 796                 } else  {
 797                     *p++ = ch;
 798                 }
 799             } else {
 800                 charsleft = (charsleft << 6) | UB64(ch);
 801                 bitsleft += 6;
 802                 s++;
 803                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 804             }
 805         }
 806         else if ( ch == '+' ) {
 807             s++;
 808             if (s < e && *s == '-') {
 809                 s++;
 810                 *p++ = '+';
 811             } else
 812             {
 813                 inShift = 1;
 814                 bitsleft = 0;
 815             }
 816         }
 817         else if (SPECIAL(ch,0,0)) {
 818             errmsg = "unexpected special character";
 819             s++;
 820                 goto utf7Error;
 821         }
 822         else {
 823             *p++ = ch;
 824             s++;
 825         }
 826         continue;
 827     utf7Error:
 828       if (utf7_decoding_error(&p, errors, errmsg))
 829           goto onError;
 830     }
 831
 832     if (inShift) {
 833         if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
 834             goto onError;
 835     }
 836
 837     if (_PyUnicode_Resize(&unicode, p - unicode->str))
 838         goto onError;
 839
 840     return (PyObject *)unicode;
 841
 842 onError:
 843     Py_DECREF(unicode);
 844     return NULL;
 845 }
 846
 847
 848 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
 849                    int size,
 850                    int encodeSetO,
 851                    int encodeWhiteSpace,
 852                    const char *errors)
 853 {
 854     PyObject *v;
 855     /* It might be possible to tighten this worst case */
 856     unsigned int cbAllocated = 5 * size;
 857     int inShift = 0;
 858     int i = 0;
 859     unsigned int bitsleft = 0;
 860     unsigned long charsleft = 0;
 861     char * out;
 862     char * start;
 863
 864     if (size == 0)
 865                 return PyString_FromStringAndSize(NULL, 0);
 866
 867     v = PyString_FromStringAndSize(NULL, cbAllocated);
 868     if (v == NULL)
 869         return NULL;
 870
 871     start = out = PyString_AS_STRING(v);
 872     for (;i < size; ++i) {
 873         Py_UNICODE ch = s[i];
 874
 875         if (!inShift) {
 876                         if (ch == '+') {
 877                                 *out++ = '+';
 878                 *out++ = '-';
 879             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 880                 charsleft = ch;
 881                 bitsleft = 16;
 882                 *out++ = '+';
 883                                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 884                 inShift = bitsleft > 0;
 885                         } else {
 886                                 *out++ = (char) ch;
 887                         }
 888                 } else {
 889             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 890                 *out++ = B64(charsleft << (6-bitsleft));
 891                 charsleft = 0;
 892                 bitsleft = 0;
 893                 /* Characters not in the BASE64 set implicitly unshift the sequence
 894                    so no '-' is required, except if the character is itself a '-' */
 895                 if (B64CHAR(ch) || ch == '-') {
 896                     *out++ = '-';
 897                 }
 898                 inShift = 0;
 899                 *out++ = (char) ch;
 900             } else {
 901                 bitsleft += 16;
 902                 charsleft = (charsleft << 16) | ch;
 903                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 904
 905                 /* If the next character is special then we dont' need to terminate
 906                    the shift sequence. If the next character is not a BASE64 character
 907                    or '-' then the shift sequence will be terminated implicitly and we
 908                    don't have to insert a '-'. */
 909
 910                 if (bitsleft == 0) {
 911                     if (i + 1 < size) {
 912                         Py_UNICODE ch2 = s[i+1];
 913
 914                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
 915
 916                         } else if (B64CHAR(ch2) || ch2 == '-') {
 917                             *out++ = '-';
 918                             inShift = 0;
 919                         } else {
 920                             inShift = 0;
 921                         }
 922
 923                     }
 924                     else {
 925                         *out++ = '-';
 926                         inShift = 0;
 927                     }
 928                 }
 929             }
 930         }
 931         }
 932     if (bitsleft) {
 933         *out++= B64(charsleft << (6-bitsleft) );
 934         *out++ = '-';
 935     }
 936
 937     if (_PyString_Resize(&v, out - start)) {
 938         Py_DECREF(v);
 939         return NULL;
 940     }
 941     return v;
 942 }
 943
/* Remove the helper macros of the preceding codec code (SPECIAL, B64,
   B64CHAR and ENCODE are used by the UTF-7 encoder just above) so that
   the names are free again for the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
 950
 951 /* --- UTF-8 Codec -------------------------------------------------------- */
 952
 953 static
 954 char utf8_code_length[256] = {
 955     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 956        illegal prefix.  see RFC 2279 for details */
 957     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 958     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 959     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 960     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 961     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 962     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 963     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 964     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 965     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 966     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 967     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 968     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 969     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 970     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 971     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 972     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 973 };
 974
 975 static
 976 int utf8_decoding_error(const char **source,
 977                         Py_UNICODE **dest,
 978                         const char *errors,
 979                         const char *details)
 980 {
 981     if ((errors == NULL) ||
 982         (strcmp(errors,"strict") == 0)) {
 983         PyErr_Format(PyExc_UnicodeError,
 984                      "UTF-8 decoding error: %.400s",
 985                      details);
 986         return -1;
 987     }
 988     else if (strcmp(errors,"ignore") == 0) {
 989         (*source)++;
 990         return 0;
 991     }
 992     else if (strcmp(errors,"replace") == 0) {
 993         (*source)++;
 994         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 995         (*dest)++;
 996         return 0;
 997     }
 998     else {
 999         PyErr_Format(PyExc_ValueError,
1000                      "UTF-8 decoding error; unknown error handling code: %.400s",
1001                      errors);
1002         return -1;
1003     }
1004 }
1005
1006 PyObject *PyUnicode_DecodeUTF8(const char *s,
1007                                int size,
1008                                const char *errors)
1009 {
1010     int n;
1011     const char *e;
1012     PyUnicodeObject *unicode;
1013     Py_UNICODE *p;
1014     const char *errmsg = "";
1015
1016     /* Note: size will always be longer than the resulting Unicode
1017        character count */
1018     unicode = _PyUnicode_New(size);
1019     if (!unicode)
1020         return NULL;
1021     if (size == 0)
1022         return (PyObject *)unicode;
1023
1024     /* Unpack UTF-8 encoded data */
1025     p = unicode->str;
1026     e = s + size;
1027
1028     while (s < e) {
1029         Py_UCS4 ch = (unsigned char)*s;
1030
1031         if (ch < 0x80) {
1032             *p++ = (Py_UNICODE)ch;
1033             s++;
1034             continue;
1035         }
1036
1037         n = utf8_code_length[ch];
1038
1039         if (s + n > e) {
1040             errmsg = "unexpected end of data";
1041             goto utf8Error;
1042         }
1043
1044         switch (n) {
1045
1046         case 0:
1047             errmsg = "unexpected code byte";
1048             goto utf8Error;
1049
1050         case 1:
1051             errmsg = "internal error";
1052             goto utf8Error;
1053
1054         case 2:
1055             if ((s[1] & 0xc0) != 0x80) {
1056                 errmsg = "invalid data";
1057                 goto utf8Error;
1058             }
1059             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1060             if (ch < 0x80) {
1061                 errmsg = "illegal encoding";
1062                 goto utf8Error;
1063             }
1064             else
1065                 *p++ = (Py_UNICODE)ch;
1066             break;
1067
1068         case 3:
1069             if ((s[1] & 0xc0) != 0x80 ||
1070                 (s[2] & 0xc0) != 0x80) {
1071                 errmsg = "invalid data";
1072                 goto utf8Error;
1073             }
1074             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1075             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1076                 errmsg = "illegal encoding";
1077                 goto utf8Error;
1078             }
1079             else
1080                                 *p++ = (Py_UNICODE)ch;
1081             break;
1082
1083         case 4:
1084             if ((s[1] & 0xc0) != 0x80 ||
1085                 (s[2] & 0xc0) != 0x80 ||
1086                 (s[3] & 0xc0) != 0x80) {
1087                 errmsg = "invalid data";
1088                 goto utf8Error;
1089             }
1090             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092             /* validate and convert to UTF-16 */
1093             if ((ch < 0x10000)        /* minimum value allowed for 4
1094                                        byte encoding */
1095                 || (ch > 0x10ffff))   /* maximum value allowed for
1096                                        UTF-16 */
1097             {
1098                 errmsg = "illegal encoding";
1099                 goto utf8Error;
1100             }
1101 #ifdef Py_UNICODE_WIDE
1102             *p++ = (Py_UNICODE)ch;
1103 #else
1104             /*  compute and append the two surrogates: */
1105
1106             /*  translate from 10000..10FFFF to 0..FFFF */
1107             ch -= 0x10000;
1108
1109             /*  high surrogate = top 10 bits added to D800 */
1110             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112             /*  low surrogate = bottom 10 bits added to DC00 */
1113             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1114 #endif
1115             break;
1116
1117         default:
1118             /* Other sizes are only needed for UCS-4 */
1119             errmsg = "unsupported Unicode code range";
1120             goto utf8Error;
1121         }
1122         s += n;
1123         continue;
1124
1125     utf8Error:
1126       if (utf8_decoding_error(&s, &p, errors, errmsg))
1127           goto onError;
1128     }
1129
1130     /* Adjust length */
1131     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1132         goto onError;
1133
1134     return (PyObject *)unicode;
1135
1136 onError:
1137     Py_DECREF(unicode);
1138     return NULL;
1139 }
1140
/* Disabled: the encoder handles UTF-16 surrogates itself, so nothing
   calls this helper any longer. */
#if 0
/* Error policy dispatcher for the UTF-8 encoder: "strict" (or NULL)
   sets UnicodeError, "ignore" does nothing, "replace" appends '?' at
   *dest, any other name sets ValueError. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1174
1175 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176                                int size,
1177                                const char *errors)
1178 {
1179     PyObject *v;
1180     char *p;
1181     char *q;
1182     Py_UCS4 ch2;
1183     unsigned int cbAllocated = 3 * size;
1184     unsigned int cbWritten = 0;
1185     int i = 0;
1186
1187     v = PyString_FromStringAndSize(NULL, cbAllocated);
1188     if (v == NULL)
1189         return NULL;
1190     if (size == 0)
1191         return v;
1192
1193     p = q = PyString_AS_STRING(v);
1194     while (i < size) {
1195         Py_UCS4 ch = s[i++];
1196         if (ch < 0x80) {
1197             *p++ = (char) ch;
1198             cbWritten++;
1199         }
1200         else if (ch < 0x0800) {
1201             *p++ = 0xc0 | (ch >> 6);
1202             *p++ = 0x80 | (ch & 0x3f);
1203             cbWritten += 2;
1204         }
1205         else if (ch < 0x10000) {
1206             /* Check for high surrogate */
1207             if (0xD800 <= ch && ch <= 0xDBFF) {
1208                 if (i != size) {
1209                     ch2 = s[i];
1210                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1211
1212                         if (cbWritten >= (cbAllocated - 4)) {
1213                             /* Provide enough room for some more
1214                                surrogates */
1215                             cbAllocated += 4*10;
1216                             if (_PyString_Resize(&v, cbAllocated))
1217                                 goto onError;
1218                         }
1219
1220                         /* combine the two values */
1221                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1222
1223                         *p++ = (char)((ch >> 18) | 0xf0);
1224                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1225                         i++;
1226                         cbWritten += 4;
1227                     }
1228                 }
1229             }
1230             else {
1231                 *p++ = (char)(0xe0 | (ch >> 12));
1232                 cbWritten += 3;
1233             }
1234             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1235             *p++ = (char)(0x80 | (ch & 0x3f));
1236         } else {
1237             *p++ = 0xf0 | (ch>>18);
1238             *p++ = 0x80 | ((ch>>12) & 0x3f);
1239             *p++ = 0x80 | ((ch>>6) & 0x3f);
1240             *p++ = 0x80 | (ch & 0x3f);
1241             cbWritten += 4;
1242         }
1243     }
1244     *p = '\0';
1245     if (_PyString_Resize(&v, p - q))
1246         goto onError;
1247     return v;
1248
1249  onError:
1250     Py_DECREF(v);
1251     return NULL;
1252 }
1253
1254 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1255 {
1256     if (!PyUnicode_Check(unicode)) {
1257         PyErr_BadArgument();
1258         return NULL;
1259     }
1260     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1261                                 PyUnicode_GET_SIZE(unicode),
1262                                 NULL);
1263 }
1264
1265 /* --- UTF-16 Codec ------------------------------------------------------- */
1266
1267 static
1268 int utf16_decoding_error(Py_UNICODE **dest,
1269                          const char *errors,
1270                          const char *details)
1271 {
1272     if ((errors == NULL) ||
1273         (strcmp(errors,"strict") == 0)) {
1274         PyErr_Format(PyExc_UnicodeError,
1275                      "UTF-16 decoding error: %.400s",
1276                      details);
1277         return -1;
1278     }
1279     else if (strcmp(errors,"ignore") == 0) {
1280         return 0;
1281     }
1282     else if (strcmp(errors,"replace") == 0) {
1283         if (dest) {
1284             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1285             (*dest)++;
1286         }
1287         return 0;
1288     }
1289     else {
1290         PyErr_Format(PyExc_ValueError,
1291                      "UTF-16 decoding error; "
1292                      "unknown error handling code: %.400s",
1293                      errors);
1294         return -1;
1295     }
1296 }
1297
1298 PyObject *
1299 PyUnicode_DecodeUTF16(const char *s,
1300                       int size,
1301                       const char *errors,
1302                       int *byteorder)
1303 {
1304     PyUnicodeObject *unicode;
1305     Py_UNICODE *p;
1306     const unsigned char *q, *e;
1307     int bo = 0;       /* assume native ordering by default */
1308     const char *errmsg = "";
1309     /* Offsets from q for retrieving byte pairs in the right order. */
1310 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1311     int ihi = 1, ilo = 0;
1312 #else
1313     int ihi = 0, ilo = 1;
1314 #endif
1315
1316     /* size should be an even number */
1317     if (size & 1) {
1318         if (utf16_decoding_error(NULL, errors, "truncated data"))
1319             return NULL;
1320         --size;  /* else ignore the oddball byte */
1321     }
1322
1323     /* Note: size will always be longer than the resulting Unicode
1324        character count */
1325     unicode = _PyUnicode_New(size);
1326     if (!unicode)
1327         return NULL;
1328     if (size == 0)
1329         return (PyObject *)unicode;
1330
1331     /* Unpack UTF-16 encoded data */
1332     p = unicode->str;
1333     q = (unsigned char *)s;
1334     e = q + size;
1335
1336     if (byteorder)
1337         bo = *byteorder;
1338
1339     /* Check for BOM marks (U+FEFF) in the input and adjust current
1340        byte order setting accordingly. In native mode, the leading BOM
1341        mark is skipped, in all other modes, it is copied to the output
1342        stream as-is (giving a ZWNBSP character). */
1343     if (bo == 0) {
1344         const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1345 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1346         if (bom == 0xFEFF) {
1347             q += 2;
1348             bo = -1;
1349         }
1350         else if (bom == 0xFFFE) {
1351             q += 2;
1352             bo = 1;
1353         }
1354 #else
1355         if (bom == 0xFEFF) {
1356             q += 2;
1357             bo = 1;
1358         }
1359         else if (bom == 0xFFFE) {
1360             q += 2;
1361             bo = -1;
1362         }
1363 #endif
1364     }
1365
1366     if (bo == -1) {
1367         /* force LE */
1368         ihi = 1;
1369         ilo = 0;
1370     }
1371     else if (bo == 1) {
1372         /* force BE */
1373         ihi = 0;
1374         ilo = 1;
1375     }
1376
1377     while (q < e) {
1378         Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1379         q += 2;
1380
1381         if (ch < 0xD800 || ch > 0xDFFF) {
1382             *p++ = ch;
1383             continue;
1384         }
1385
1386         /* UTF-16 code pair: */
1387         if (q >= e) {
1388             errmsg = "unexpected end of data";
1389             goto utf16Error;
1390         }
1391         if (0xD800 <= ch && ch <= 0xDBFF) {
1392             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1393             q += 2;
1394             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1395 #ifndef Py_UNICODE_WIDE
1396                 *p++ = ch;
1397                 *p++ = ch2;
1398 #else
1399                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1400 #endif
1401                 continue;
1402             }
1403             else {
1404                 errmsg = "illegal UTF-16 surrogate";
1405                 goto utf16Error;
1406             }
1407
1408         }
1409         errmsg = "illegal encoding";
1410         /* Fall through to report the error */
1411
1412     utf16Error:
1413         if (utf16_decoding_error(&p, errors, errmsg))
1414             goto onError;
1415     }
1416
1417     if (byteorder)
1418         *byteorder = bo;
1419
1420     /* Adjust length */
1421     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1422         goto onError;
1423
1424     return (PyObject *)unicode;
1425
1426 onError:
1427     Py_DECREF(unicode);
1428     return NULL;
1429 }
1430
1431 PyObject *
1432 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1433                       int size,
1434                       const char *errors,
1435                       int byteorder)
1436 {
1437     PyObject *v;
1438     unsigned char *p;
1439     int i, pairs;
1440     /* Offsets from p for storing byte pairs in the right order. */
1441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442     int ihi = 1, ilo = 0;
1443 #else
1444     int ihi = 0, ilo = 1;
1445 #endif
1446
1447 #define STORECHAR(CH)                   \
1448     do {                                \
1449         p[ihi] = ((CH) >> 8) & 0xff;    \
1450         p[ilo] = (CH) & 0xff;           \
1451         p += 2;                         \
1452     } while(0)
1453
1454     for (i = pairs = 0; i < size; i++)
1455         if (s[i] >= 0x10000)
1456             pairs++;
1457     v = PyString_FromStringAndSize(NULL,
1458                   2 * (size + pairs + (byteorder == 0)));
1459     if (v == NULL)
1460         return NULL;
1461
1462     p = (unsigned char *)PyString_AS_STRING(v);
1463     if (byteorder == 0)
1464         STORECHAR(0xFEFF);
1465     if (size == 0)
1466         return v;
1467
1468     if (byteorder == -1) {
1469         /* force LE */
1470         ihi = 1;
1471         ilo = 0;
1472     }
1473     else if (byteorder == 1) {
1474         /* force BE */
1475         ihi = 0;
1476         ilo = 1;
1477     }
1478
1479     while (size-- > 0) {
1480         Py_UNICODE ch = *s++;
1481         Py_UNICODE ch2 = 0;
1482         if (ch >= 0x10000) {
1483             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1484             ch  = 0xD800 | ((ch-0x10000) >> 10);
1485         }
1486         STORECHAR(ch);
1487         if (ch2)
1488             STORECHAR(ch2);
1489     }
1490     return v;
1491 #undef STORECHAR
1492 }
1493
1494 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1495 {
1496     if (!PyUnicode_Check(unicode)) {
1497         PyErr_BadArgument();
1498         return NULL;
1499     }
1500     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1501                                  PyUnicode_GET_SIZE(unicode),
1502                                  NULL,
1503                                  0);
1504 }
1505
1506 /* --- Unicode Escape Codec ----------------------------------------------- */
1507
1508 static
1509 int unicodeescape_decoding_error(const char **source,
1510                                  Py_UNICODE *x,
1511                                  const char *errors,
1512                                  const char *details)
1513 {
1514     if ((errors == NULL) ||
1515         (strcmp(errors,"strict") == 0)) {
1516         PyErr_Format(PyExc_UnicodeError,
1517                      "Unicode-Escape decoding error: %.400s",
1518                      details);
1519         return -1;
1520     }
1521     else if (strcmp(errors,"ignore") == 0) {
1522         return 0;
1523     }
1524     else if (strcmp(errors,"replace") == 0) {
1525         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1526         return 0;
1527     }
1528     else {
1529         PyErr_Format(PyExc_ValueError,
1530                      "Unicode-Escape decoding error; "
1531                      "unknown error handling code: %.400s",
1532                      errors);
1533         return -1;
1534     }
1535 }
1536
1537 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1538
1539 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1540                                         int size,
1541                                         const char *errors)
1542 {
1543     PyUnicodeObject *v;
1544     Py_UNICODE *p, *buf;
1545     const char *end;
1546     char* message;
1547     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1548
1549     /* Escaped strings will always be longer than the resulting
1550        Unicode string, so we start with size here and then reduce the
1551        length after conversion to the true value. */
1552     v = _PyUnicode_New(size);
1553     if (v == NULL)
1554         goto onError;
1555     if (size == 0)
1556         return (PyObject *)v;
1557
1558     p = buf = PyUnicode_AS_UNICODE(v);
1559     end = s + size;
1560
1561     while (s < end) {
1562         unsigned char c;
1563         Py_UNICODE x;
1564         int i, digits;
1565
1566         /* Non-escape characters are interpreted as Unicode ordinals */
1567         if (*s != '\\') {
1568             *p++ = (unsigned char) *s++;
1569             continue;
1570         }
1571
1572         /* \ - Escapes */
1573         s++;
1574         switch (*s++) {
1575
1576         /* \x escapes */
1577         case '\n': break;
1578         case '\\': *p++ = '\\'; break;
1579         case '\'': *p++ = '\''; break;
1580         case '\"': *p++ = '\"'; break;
1581         case 'b': *p++ = '\b'; break;
1582         case 'f': *p++ = '\014'; break; /* FF */
1583         case 't': *p++ = '\t'; break;
1584         case 'n': *p++ = '\n'; break;
1585         case 'r': *p++ = '\r'; break;
1586         case 'v': *p++ = '\013'; break; /* VT */
1587         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1588
1589         /* \OOO (octal) escapes */
1590         case '0': case '1': case '2': case '3':
1591         case '4': case '5': case '6': case '7':
1592             x = s[-1] - '0';
1593             if ('0' <= *s && *s <= '7') {
1594                 x = (x<<3) + *s++ - '0';
1595                 if ('0' <= *s && *s <= '7')
1596                     x = (x<<3) + *s++ - '0';
1597             }
1598             *p++ = x;
1599             break;
1600
1601         /* hex escapes */
1602         /* \xXX */
1603         case 'x':
1604             digits = 2;
1605             message = "truncated \\xXX escape";
1606             goto hexescape;
1607
1608         /* \uXXXX */
1609         case 'u':
1610             digits = 4;
1611             message = "truncated \\uXXXX escape";
1612             goto hexescape;
1613
1614         /* \UXXXXXXXX */
1615         case 'U':
1616             digits = 8;
1617             message = "truncated \\UXXXXXXXX escape";
1618         hexescape:
1619             chr = 0;
1620             for (i = 0; i < digits; i++) {
1621                 c = (unsigned char) s[i];
1622                 if (!isxdigit(c)) {
1623                     if (unicodeescape_decoding_error(&s, &x, errors, message))
1624                         goto onError;
1625                     chr = x;
1626                     i++;
1627                     break;
1628                 }
1629                 chr = (chr<<4) & ~0xF;
1630                 if (c >= '0' && c <= '9')
1631                     chr += c - '0';
1632                 else if (c >= 'a' && c <= 'f')
1633                     chr += 10 + c - 'a';
1634                 else
1635                     chr += 10 + c - 'A';
1636             }
1637             s += i;
1638         store:
1639             /* when we get here, chr is a 32-bit unicode character */
1640             if (chr <= 0xffff)
1641                 /* UCS-2 character */
1642                 *p++ = (Py_UNICODE) chr;
1643             else if (chr <= 0x10ffff) {
1644                 /* UCS-4 character. Either store directly, or as
1645                    surrogate pair. */
1646 #ifdef Py_UNICODE_WIDE
1647                 *p++ = chr;
1648 #else
1649                 chr -= 0x10000L;
1650                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1651                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1652 #endif
1653             } else {
1654                 if (unicodeescape_decoding_error(
1655                     &s, &x, errors,
1656                     "illegal Unicode character")
1657                     )
1658                     goto onError;
1659                 *p++ = x; /* store replacement character */
1660             }
1661             break;
1662
1663         /* \N{name} */
1664         case 'N':
1665             message = "malformed \\N character escape";
1666             if (ucnhash_CAPI == NULL) {
1667                 /* load the unicode data module */
1668                 PyObject *m, *v;
1669                 m = PyImport_ImportModule("unicodedata");
1670                 if (m == NULL)
1671                     goto ucnhashError;
1672                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1673                 Py_DECREF(m);
1674                 if (v == NULL)
1675                     goto ucnhashError;
1676                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1677                 Py_DECREF(v);
1678                 if (ucnhash_CAPI == NULL)
1679                     goto ucnhashError;
1680             }
1681             if (*s == '{') {
1682                 const char *start = s+1;
1683                 /* look for the closing brace */
1684                 while (*s != '}' && s < end)
1685                     s++;
1686                 if (s > start && s < end && *s == '}') {
1687                     /* found a name.  look it up in the unicode database */
1688                     message = "unknown Unicode character name";
1689                     s++;
1690                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1691                         goto store;
1692                 }
1693             }
1694             if (unicodeescape_decoding_error(&s, &x, errors, message))
1695                 goto onError;
1696             *p++ = x;
1697             break;
1698
1699         default:
1700             *p++ = '\\';
1701             *p++ = (unsigned char)s[-1];
1702             break;
1703         }
1704     }
1705     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1706                 goto onError;
1707     return (PyObject *)v;
1708
1709 ucnhashError:
1710     PyErr_SetString(
1711         PyExc_UnicodeError,
1712         "\\N escapes not supported (can't load unicodedata module)"
1713         );
1714     return NULL;
1715
1716 onError:
1717     Py_XDECREF(v);
1718     return NULL;
1719 }
1720
1721 /* Return a Unicode-Escape string version of the Unicode object.
1722
1723    If quotes is true, the string is enclosed in u"" or u'' quotes as
1724    appropriate.
1725
1726 */
1727
1728 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1729                                   int size,
1730                                   Py_UNICODE ch);
1731
1732 static
1733 PyObject *unicodeescape_string(const Py_UNICODE *s,
1734                                int size,
1735                                int quotes)
1736 {
1737     PyObject *repr;
1738     char *p;
1739
1740     static const char *hexdigit = "0123456789abcdef";
1741
1742     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1743     if (repr == NULL)
1744         return NULL;
1745
1746     p = PyString_AS_STRING(repr);
1747
1748     if (quotes) {
1749         *p++ = 'u';
1750         *p++ = (findchar(s, size, '\'') &&
1751                 !findchar(s, size, '"')) ? '"' : '\'';
1752     }
1753     while (size-- > 0) {
1754         Py_UNICODE ch = *s++;
1755
1756         /* Escape quotes */
1757         if (quotes &&
1758             (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1759             *p++ = '\\';
1760             *p++ = (char) ch;
1761             continue;
1762         }
1763
1764 #ifdef Py_UNICODE_WIDE
1765         /* Map 21-bit characters to '\U00xxxxxx' */
1766         else if (ch >= 0x10000) {
1767             int offset = p - PyString_AS_STRING(repr);
1768
1769             /* Resize the string if necessary */
1770             if (offset + 12 > PyString_GET_SIZE(repr)) {
1771                 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1772                     goto onError;
1773                 p = PyString_AS_STRING(repr) + offset;
1774             }
1775
1776             *p++ = '\\';
1777             *p++ = 'U';
1778             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1779             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1780             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1781             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1782             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1783             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1784             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1785             *p++ = hexdigit[ch & 0x0000000F];
1786             continue;
1787         }
1788 #endif
1789         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1790         else if (ch >= 0xD800 && ch < 0xDC00) {
1791             Py_UNICODE ch2;
1792             Py_UCS4 ucs;
1793
1794             ch2 = *s++;
1795             size--;
1796             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1797                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1798                 *p++ = '\\';
1799                 *p++ = 'U';
1800                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1801                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1802                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1803                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1804                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1805                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1806                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1807                 *p++ = hexdigit[ucs & 0x0000000F];
1808                 continue;
1809             }
1810             /* Fall through: isolated surrogates are copied as-is */
1811             s--;
1812             size++;
1813         }
1814
1815         /* Map 16-bit characters to '\uxxxx' */
1816         if (ch >= 256) {
1817             *p++ = '\\';
1818             *p++ = 'u';
1819             *p++ = hexdigit[(ch >> 12) & 0x000F];
1820             *p++ = hexdigit[(ch >> 8) & 0x000F];
1821             *p++ = hexdigit[(ch >> 4) & 0x000F];
1822             *p++ = hexdigit[ch & 0x000F];
1823         }
1824
1825         /* Map special whitespace to '\t', \n', '\r' */
1826         else if (ch == '\t') {
1827             *p++ = '\\';
1828             *p++ = 't';
1829         }
1830         else if (ch == '\n') {
1831             *p++ = '\\';
1832             *p++ = 'n';
1833         }
1834         else if (ch == '\r') {
1835             *p++ = '\\';
1836             *p++ = 'r';
1837         }
1838
1839         /* Map non-printable US ASCII to '\xhh' */
1840         else if (ch < ' ' || ch >= 128) {
1841             *p++ = '\\';
1842             *p++ = 'x';
1843             *p++ = hexdigit[(ch >> 4) & 0x000F];
1844             *p++ = hexdigit[ch & 0x000F];
1845         }
1846
1847         /* Copy everything else as-is */
1848         else
1849             *p++ = (char) ch;
1850     }
1851     if (quotes)
1852         *p++ = PyString_AS_STRING(repr)[1];
1853
1854     *p = '\0';
1855     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1856         goto onError;
1857
1858     return repr;
1859
1860  onError:
1861     Py_DECREF(repr);
1862     return NULL;
1863 }
1864
1865 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1866                                         int size)
1867 {
1868     return unicodeescape_string(s, size, 0);
1869 }
1870
1871 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1872 {
1873     if (!PyUnicode_Check(unicode)) {
1874         PyErr_BadArgument();
1875         return NULL;
1876     }
1877     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1878                                          PyUnicode_GET_SIZE(unicode));
1879 }
1880
1881 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1882
1883 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1884                                            int size,
1885                                            const char *errors)
1886 {
1887     PyUnicodeObject *v;
1888     Py_UNICODE *p, *buf;
1889     const char *end;
1890     const char *bs;
1891
1892     /* Escaped strings will always be longer than the resulting
1893        Unicode string, so we start with size here and then reduce the
1894        length after conversion to the true value. */
1895     v = _PyUnicode_New(size);
1896     if (v == NULL)
1897         goto onError;
1898     if (size == 0)
1899         return (PyObject *)v;
1900     p = buf = PyUnicode_AS_UNICODE(v);
1901     end = s + size;
1902     while (s < end) {
1903         unsigned char c;
1904         Py_UNICODE x;
1905         int i;
1906
1907         /* Non-escape characters are interpreted as Unicode ordinals */
1908         if (*s != '\\') {
1909             *p++ = (unsigned char)*s++;
1910             continue;
1911         }
1912
1913         /* \u-escapes are only interpreted iff the number of leading
1914            backslashes if odd */
1915         bs = s;
1916         for (;s < end;) {
1917             if (*s != '\\')
1918                 break;
1919             *p++ = (unsigned char)*s++;
1920         }
1921         if (((s - bs) & 1) == 0 ||
1922             s >= end ||
1923             *s != 'u') {
1924             continue;
1925         }
1926         p--;
1927         s++;
1928
1929         /* \uXXXX with 4 hex digits */
1930         for (x = 0, i = 0; i < 4; i++) {
1931             c = (unsigned char)s[i];
1932             if (!isxdigit(c)) {
1933                 if (unicodeescape_decoding_error(&s, &x, errors,
1934                                                  "truncated \\uXXXX"))
1935                     goto onError;
1936                 i++;
1937                 break;
1938             }
1939             x = (x<<4) & ~0xF;
1940             if (c >= '0' && c <= '9')
1941                 x += c - '0';
1942             else if (c >= 'a' && c <= 'f')
1943                 x += 10 + c - 'a';
1944             else
1945                 x += 10 + c - 'A';
1946         }
1947         s += i;
1948         *p++ = x;
1949     }
1950     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1951         goto onError;
1952     return (PyObject *)v;
1953
1954  onError:
1955     Py_XDECREF(v);
1956     return NULL;
1957 }
1958
1959 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1960                                            int size)
1961 {
1962     PyObject *repr;
1963     char *p;
1964     char *q;
1965
1966     static const char *hexdigit = "0123456789abcdef";
1967
1968     repr = PyString_FromStringAndSize(NULL, 6 * size);
1969     if (repr == NULL)
1970         return NULL;
1971     if (size == 0)
1972         return repr;
1973
1974     p = q = PyString_AS_STRING(repr);
1975     while (size-- > 0) {
1976         Py_UNICODE ch = *s++;
1977         /* Map 16-bit characters to '\uxxxx' */
1978         if (ch >= 256) {
1979             *p++ = '\\';
1980             *p++ = 'u';
1981             *p++ = hexdigit[(ch >> 12) & 0xf];
1982             *p++ = hexdigit[(ch >> 8) & 0xf];
1983             *p++ = hexdigit[(ch >> 4) & 0xf];
1984             *p++ = hexdigit[ch & 15];
1985         }
1986         /* Copy everything else as-is */
1987         else
1988             *p++ = (char) ch;
1989     }
1990     *p = '\0';
1991     if (_PyString_Resize(&repr, p - q))
1992         goto onError;
1993
1994     return repr;
1995
1996  onError:
1997     Py_DECREF(repr);
1998     return NULL;
1999 }
2000
2001 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2002 {
2003     if (!PyUnicode_Check(unicode)) {
2004         PyErr_BadArgument();
2005         return NULL;
2006     }
2007     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2008                                             PyUnicode_GET_SIZE(unicode));
2009 }
2010
2011 /* --- Latin-1 Codec ------------------------------------------------------ */
2012
2013 PyObject *PyUnicode_DecodeLatin1(const char *s,
2014                                  int size,
2015                                  const char *errors)
2016 {
2017     PyUnicodeObject *v;
2018     Py_UNICODE *p;
2019
2020     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2021     if (size == 1 && *(unsigned char*)s < 256) {
2022         Py_UNICODE r = *(unsigned char*)s;
2023         return PyUnicode_FromUnicode(&r, 1);
2024     }
2025
2026     v = _PyUnicode_New(size);
2027     if (v == NULL)
2028         goto onError;
2029     if (size == 0)
2030         return (PyObject *)v;
2031     p = PyUnicode_AS_UNICODE(v);
2032     while (size-- > 0)
2033         *p++ = (unsigned char)*s++;
2034     return (PyObject *)v;
2035
2036  onError:
2037     Py_XDECREF(v);
2038     return NULL;
2039 }
2040
2041 static
2042 int latin1_encoding_error(const Py_UNICODE **source,
2043                           char **dest,
2044                           const char *errors,
2045                           const char *details)
2046 {
2047     if ((errors == NULL) ||
2048         (strcmp(errors,"strict") == 0)) {
2049         PyErr_Format(PyExc_UnicodeError,
2050                      "Latin-1 encoding error: %.400s",
2051                      details);
2052         return -1;
2053     }
2054     else if (strcmp(errors,"ignore") == 0) {
2055         return 0;
2056     }
2057     else if (strcmp(errors,"replace") == 0) {
2058         **dest = '?';
2059         (*dest)++;
2060         return 0;
2061     }
2062     else {
2063         PyErr_Format(PyExc_ValueError,
2064                      "Latin-1 encoding error; "
2065                      "unknown error handling code: %.400s",
2066                      errors);
2067         return -1;
2068     }
2069 }
2070
2071 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2072                                  int size,
2073                                  const char *errors)
2074 {
2075     PyObject *repr;
2076     char *s, *start;
2077
2078     repr = PyString_FromStringAndSize(NULL, size);
2079     if (repr == NULL)
2080         return NULL;
2081     if (size == 0)
2082         return repr;
2083
2084     s = PyString_AS_STRING(repr);
2085     start = s;
2086     while (size-- > 0) {
2087         Py_UNICODE ch = *p++;
2088         if (ch >= 256) {
2089             if (latin1_encoding_error(&p, &s, errors,
2090                                       "ordinal not in range(256)"))
2091                 goto onError;
2092         }
2093         else
2094             *s++ = (char)ch;
2095     }
2096     /* Resize if error handling skipped some characters */
2097     if (s - start < PyString_GET_SIZE(repr))
2098         if (_PyString_Resize(&repr, s - start))
2099             goto onError;
2100     return repr;
2101
2102  onError:
2103     Py_DECREF(repr);
2104     return NULL;
2105 }
2106
2107 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2108 {
2109     if (!PyUnicode_Check(unicode)) {
2110         PyErr_BadArgument();
2111         return NULL;
2112     }
2113     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2114                                   PyUnicode_GET_SIZE(unicode),
2115                                   NULL);
2116 }
2117
2118 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2119
2120 static
2121 int ascii_decoding_error(const char **source,
2122                          Py_UNICODE **dest,
2123                          const char *errors,
2124                          const char *details)
2125 {
2126     if ((errors == NULL) ||
2127         (strcmp(errors,"strict") == 0)) {
2128         PyErr_Format(PyExc_UnicodeError,
2129                      "ASCII decoding error: %.400s",
2130                      details);
2131         return -1;
2132     }
2133     else if (strcmp(errors,"ignore") == 0) {
2134         return 0;
2135     }
2136     else if (strcmp(errors,"replace") == 0) {
2137         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2138         (*dest)++;
2139         return 0;
2140     }
2141     else {
2142         PyErr_Format(PyExc_ValueError,
2143                      "ASCII decoding error; "
2144                      "unknown error handling code: %.400s",
2145                      errors);
2146         return -1;
2147     }
2148 }
2149
2150 PyObject *PyUnicode_DecodeASCII(const char *s,
2151                                 int size,
2152                                 const char *errors)
2153 {
2154     PyUnicodeObject *v;
2155     Py_UNICODE *p;
2156
2157     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2158     if (size == 1 && *(unsigned char*)s < 128) {
2159         Py_UNICODE r = *(unsigned char*)s;
2160         return PyUnicode_FromUnicode(&r, 1);
2161     }
2162
2163     v = _PyUnicode_New(size);
2164     if (v == NULL)
2165         goto onError;
2166     if (size == 0)
2167         return (PyObject *)v;
2168     p = PyUnicode_AS_UNICODE(v);
2169     while (size-- > 0) {
2170         register unsigned char c;
2171
2172         c = (unsigned char)*s++;
2173         if (c < 128)
2174             *p++ = c;
2175         else if (ascii_decoding_error(&s, &p, errors,
2176                                       "ordinal not in range(128)"))
2177                 goto onError;
2178     }
2179     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2180         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2181             goto onError;
2182     return (PyObject *)v;
2183
2184  onError:
2185     Py_XDECREF(v);
2186     return NULL;
2187 }
2188
2189 static
2190 int ascii_encoding_error(const Py_UNICODE **source,
2191                          char **dest,
2192                          const char *errors,
2193                          const char *details)
2194 {
2195     if ((errors == NULL) ||
2196         (strcmp(errors,"strict") == 0)) {
2197         PyErr_Format(PyExc_UnicodeError,
2198                      "ASCII encoding error: %.400s",
2199                      details);
2200         return -1;
2201     }
2202     else if (strcmp(errors,"ignore") == 0) {
2203         return 0;
2204     }
2205     else if (strcmp(errors,"replace") == 0) {
2206         **dest = '?';
2207         (*dest)++;
2208         return 0;
2209     }
2210     else {
2211         PyErr_Format(PyExc_ValueError,
2212                      "ASCII encoding error; "
2213                      "unknown error handling code: %.400s",
2214                      errors);
2215         return -1;
2216     }
2217 }
2218
2219 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2220                                 int size,
2221                                 const char *errors)
2222 {
2223     PyObject *repr;
2224     char *s, *start;
2225
2226     repr = PyString_FromStringAndSize(NULL, size);
2227     if (repr == NULL)
2228         return NULL;
2229     if (size == 0)
2230         return repr;
2231
2232     s = PyString_AS_STRING(repr);
2233     start = s;
2234     while (size-- > 0) {
2235         Py_UNICODE ch = *p++;
2236         if (ch >= 128) {
2237             if (ascii_encoding_error(&p, &s, errors,
2238                                       "ordinal not in range(128)"))
2239                 goto onError;
2240         }
2241         else
2242             *s++ = (char)ch;
2243     }
2244     /* Resize if error handling skipped some characters */
2245     if (s - start < PyString_GET_SIZE(repr))
2246         if (_PyString_Resize(&repr, s - start))
2247             goto onError;
2248     return repr;
2249
2250  onError:
2251     Py_DECREF(repr);
2252     return NULL;
2253 }
2254
2255 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2256 {
2257     if (!PyUnicode_Check(unicode)) {
2258         PyErr_BadArgument();
2259         return NULL;
2260     }
2261     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2262                                  PyUnicode_GET_SIZE(unicode),
2263                                  NULL);
2264 }
2265
2266 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2267
2268 /* --- MBCS codecs for Windows -------------------------------------------- */
2269
2270 PyObject *PyUnicode_DecodeMBCS(const char *s,
2271                                 int size,
2272                                 const char *errors)
2273 {
2274     PyUnicodeObject *v;
2275     Py_UNICODE *p;
2276
2277     /* First get the size of the result */
2278     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2279     if (size > 0 && usize==0)
2280         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2281
2282     v = _PyUnicode_New(usize);
2283     if (v == NULL)
2284         return NULL;
2285     if (usize == 0)
2286         return (PyObject *)v;
2287     p = PyUnicode_AS_UNICODE(v);
2288     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2289         Py_DECREF(v);
2290         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2291     }
2292
2293     return (PyObject *)v;
2294 }
2295
2296 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2297                                 int size,
2298                                 const char *errors)
2299 {
2300     PyObject *repr;
2301     char *s;
2302     DWORD mbcssize;
2303
2304     /* If there are no characters, bail now! */
2305     if (size==0)
2306             return PyString_FromString("");
2307
2308     /* First get the size of the result */
2309     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2310     if (mbcssize==0)
2311         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2312
2313     repr = PyString_FromStringAndSize(NULL, mbcssize);
2314     if (repr == NULL)
2315         return NULL;
2316     if (mbcssize == 0)
2317         return repr;
2318
2319     /* Do the conversion */
2320     s = PyString_AS_STRING(repr);
2321     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2322         Py_DECREF(repr);
2323         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2324     }
2325     return repr;
2326 }
2327
2328 #endif /* MS_WIN32 */
2329
2330 /* --- Character Mapping Codec -------------------------------------------- */
2331
2332 static
2333 int charmap_decoding_error(const char **source,
2334                          Py_UNICODE **dest,
2335                          const char *errors,
2336                          const char *details)
2337 {
2338     if ((errors == NULL) ||
2339         (strcmp(errors,"strict") == 0)) {
2340         PyErr_Format(PyExc_UnicodeError,
2341                      "charmap decoding error: %.400s",
2342                      details);
2343         return -1;
2344     }
2345     else if (strcmp(errors,"ignore") == 0) {
2346         return 0;
2347     }
2348     else if (strcmp(errors,"replace") == 0) {
2349         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2350         (*dest)++;
2351         return 0;
2352     }
2353     else {
2354         PyErr_Format(PyExc_ValueError,
2355                      "charmap decoding error; "
2356                      "unknown error handling code: %.400s",
2357                      errors);
2358         return -1;
2359     }
2360 }
2361
2362 PyObject *PyUnicode_DecodeCharmap(const char *s,
2363                                   int size,
2364                                   PyObject *mapping,
2365                                   const char *errors)
2366 {
2367     PyUnicodeObject *v;
2368     Py_UNICODE *p;
2369     int extrachars = 0;
2370
2371     /* Default to Latin-1 */
2372     if (mapping == NULL)
2373         return PyUnicode_DecodeLatin1(s, size, errors);
2374
2375     v = _PyUnicode_New(size);
2376     if (v == NULL)
2377         goto onError;
2378     if (size == 0)
2379         return (PyObject *)v;
2380     p = PyUnicode_AS_UNICODE(v);
2381     while (size-- > 0) {
2382         unsigned char ch = *s++;
2383         PyObject *w, *x;
2384
2385         /* Get mapping (char ordinal -> integer, Unicode char or None) */
2386         w = PyInt_FromLong((long)ch);
2387         if (w == NULL)
2388             goto onError;
2389         x = PyObject_GetItem(mapping, w);
2390         Py_DECREF(w);
2391         if (x == NULL) {
2392             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2393                 /* No mapping found means: mapping is undefined. */
2394                 PyErr_Clear();
2395                 x = Py_None;
2396                 Py_INCREF(x);
2397             } else
2398                 goto onError;
2399         }
2400
2401         /* Apply mapping */
2402         if (PyInt_Check(x)) {
2403             long value = PyInt_AS_LONG(x);
2404             if (value < 0 || value > 65535) {
2405                 PyErr_SetString(PyExc_TypeError,
2406                                 "character mapping must be in range(65536)");
2407                 Py_DECREF(x);
2408                 goto onError;
2409             }
2410             *p++ = (Py_UNICODE)value;
2411         }
2412         else if (x == Py_None) {
2413             /* undefined mapping */
2414             if (charmap_decoding_error(&s, &p, errors,
2415                                        "character maps to <undefined>")) {
2416                 Py_DECREF(x);
2417                 goto onError;
2418             }
2419         }
2420         else if (PyUnicode_Check(x)) {
2421             int targetsize = PyUnicode_GET_SIZE(x);
2422
2423             if (targetsize == 1)
2424                 /* 1-1 mapping */
2425                 *p++ = *PyUnicode_AS_UNICODE(x);
2426
2427             else if (targetsize > 1) {
2428                 /* 1-n mapping */
2429                 if (targetsize > extrachars) {
2430                     /* resize first */
2431                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2432                     int needed = (targetsize - extrachars) + \
2433                                  (targetsize << 2);
2434                     extrachars += needed;
2435                     if (_PyUnicode_Resize(&v,
2436                                          PyUnicode_GET_SIZE(v) + needed)) {
2437                         Py_DECREF(x);
2438                         goto onError;
2439                     }
2440                     p = PyUnicode_AS_UNICODE(v) + oldpos;
2441                 }
2442                 Py_UNICODE_COPY(p,
2443                                 PyUnicode_AS_UNICODE(x),
2444                                 targetsize);
2445                 p += targetsize;
2446                 extrachars -= targetsize;
2447             }
2448             /* 1-0 mapping: skip the character */
2449         }
2450         else {
2451             /* wrong return value */
2452             PyErr_SetString(PyExc_TypeError,
2453                   "character mapping must return integer, None or unicode");
2454             Py_DECREF(x);
2455             goto onError;
2456         }
2457         Py_DECREF(x);
2458     }
2459     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2460         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2461             goto onError;
2462     return (PyObject *)v;
2463
2464  onError:
2465     Py_XDECREF(v);
2466     return NULL;
2467 }
2468
2469 static
2470 int charmap_encoding_error(const Py_UNICODE **source,
2471                            char **dest,
2472                            const char *errors,
2473                            const char *details)
2474 {
2475     if ((errors == NULL) ||
2476         (strcmp(errors,"strict") == 0)) {
2477         PyErr_Format(PyExc_UnicodeError,
2478                      "charmap encoding error: %.400s",
2479                      details);
2480         return -1;
2481     }
2482     else if (strcmp(errors,"ignore") == 0) {
2483         return 0;
2484     }
2485     else if (strcmp(errors,"replace") == 0) {
2486         **dest = '?';
2487         (*dest)++;
2488         return 0;
2489     }
2490     else {
2491         PyErr_Format(PyExc_ValueError,
2492                      "charmap encoding error; "
2493                      "unknown error handling code: %.400s",
2494                      errors);
2495         return -1;
2496     }
2497 }
2498
2499 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2500                                   int size,
2501                                   PyObject *mapping,
2502                                   const char *errors)
2503 {
2504     PyObject *v;
2505     char *s;
2506     int extrachars = 0;
2507
2508     /* Default to Latin-1 */
2509     if (mapping == NULL)
2510         return PyUnicode_EncodeLatin1(p, size, errors);
2511
2512     v = PyString_FromStringAndSize(NULL, size);
2513     if (v == NULL)
2514         return NULL;
2515     if (size == 0)
2516         return v;
2517     s = PyString_AS_STRING(v);
2518     while (size-- > 0) {
2519         Py_UNICODE ch = *p++;
2520         PyObject *w, *x;
2521
2522         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2523         w = PyInt_FromLong((long)ch);
2524         if (w == NULL)
2525             goto onError;
2526         x = PyObject_GetItem(mapping, w);
2527         Py_DECREF(w);
2528         if (x == NULL) {
2529             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2530                 /* No mapping found means: mapping is undefined. */
2531                 PyErr_Clear();
2532                 x = Py_None;
2533                 Py_INCREF(x);
2534             } else
2535                 goto onError;
2536         }
2537
2538         /* Apply mapping */
2539         if (PyInt_Check(x)) {
2540             long value = PyInt_AS_LONG(x);
2541             if (value < 0 || value > 255) {
2542                 PyErr_SetString(PyExc_TypeError,
2543                                 "character mapping must be in range(256)");
2544                 Py_DECREF(x);
2545                 goto onError;
2546             }
2547             *s++ = (char)value;
2548         }
2549         else if (x == Py_None) {
2550             /* undefined mapping */
2551             if (charmap_encoding_error(&p, &s, errors,
2552                                        "character maps to <undefined>")) {
2553                 Py_DECREF(x);
2554                 goto onError;
2555             }
2556         }
2557         else if (PyString_Check(x)) {
2558             int targetsize = PyString_GET_SIZE(x);
2559
2560             if (targetsize == 1)
2561                 /* 1-1 mapping */
2562                 *s++ = *PyString_AS_STRING(x);
2563
2564             else if (targetsize > 1) {
2565                 /* 1-n mapping */
2566                 if (targetsize > extrachars) {
2567                     /* resize first */
2568                     int oldpos = (int)(s - PyString_AS_STRING(v));
2569                     int needed = (targetsize - extrachars) + \
2570                                  (targetsize << 2);
2571                     extrachars += needed;
2572                     if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2573                         Py_DECREF(x);
2574                         goto onError;
2575                     }
2576                     s = PyString_AS_STRING(v) + oldpos;
2577                 }
2578                 memcpy(s, PyString_AS_STRING(x), targetsize);
2579                 s += targetsize;
2580                 extrachars -= targetsize;
2581             }
2582             /* 1-0 mapping: skip the character */
2583         }
2584         else {
2585             /* wrong return value */
2586             PyErr_SetString(PyExc_TypeError,
2587                   "character mapping must return integer, None or unicode");
2588             Py_DECREF(x);
2589             goto onError;
2590         }
2591         Py_DECREF(x);
2592     }
2593     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2594         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2595             goto onError;
2596     return v;
2597
2598  onError:
2599     Py_DECREF(v);
2600     return NULL;
2601 }
2602
2603 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2604                                     PyObject *mapping)
2605 {
2606     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2607         PyErr_BadArgument();
2608         return NULL;
2609     }
2610     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2611                                    PyUnicode_GET_SIZE(unicode),
2612                                    mapping,
2613                                    NULL);
2614 }
2615
2616 static
2617 int translate_error(const Py_UNICODE **source,
2618                     Py_UNICODE **dest,
2619                     const char *errors,
2620                     const char *details)
2621 {
2622     if ((errors == NULL) ||
2623         (strcmp(errors,"strict") == 0)) {
2624         PyErr_Format(PyExc_UnicodeError,
2625                      "translate error: %.400s",
2626                      details);
2627         return -1;
2628     }
2629     else if (strcmp(errors,"ignore") == 0) {
2630         return 0;
2631     }
2632     else if (strcmp(errors,"replace") == 0) {
2633         **dest = '?';
2634         (*dest)++;
2635         return 0;
2636     }
2637     else {
2638         PyErr_Format(PyExc_ValueError,
2639                      "translate error; "
2640                      "unknown error handling code: %.400s",
2641                      errors);
2642         return -1;
2643     }
2644 }
2645
2646 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2647                                      int size,
2648                                      PyObject *mapping,
2649                                      const char *errors)
2650 {
2651     PyUnicodeObject *v;
2652     Py_UNICODE *p;
2653
2654     if (mapping == NULL) {
2655         PyErr_BadArgument();
2656         return NULL;
2657     }
2658
2659     /* Output will never be longer than input */
2660     v = _PyUnicode_New(size);
2661     if (v == NULL)
2662         goto onError;
2663     if (size == 0)
2664         goto done;
2665     p = PyUnicode_AS_UNICODE(v);
2666     while (size-- > 0) {
2667         Py_UNICODE ch = *s++;
2668         PyObject *w, *x;
2669
2670         /* Get mapping */
2671         w = PyInt_FromLong(ch);
2672         if (w == NULL)
2673             goto onError;
2674         x = PyObject_GetItem(mapping, w);
2675         Py_DECREF(w);
2676         if (x == NULL) {
2677             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2678                 /* No mapping found: default to 1-1 mapping */
2679                 PyErr_Clear();
2680                 *p++ = ch;
2681                 continue;
2682             }
2683             goto onError;
2684         }
2685
2686         /* Apply mapping */
2687         if (PyInt_Check(x))
2688             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2689         else if (x == Py_None) {
2690             /* undefined mapping */
2691             if (translate_error(&s, &p, errors,
2692                                 "character maps to <undefined>")) {
2693                 Py_DECREF(x);
2694                 goto onError;
2695             }
2696         }
2697         else if (PyUnicode_Check(x)) {
2698             if (PyUnicode_GET_SIZE(x) != 1) {
2699                 /* 1-n mapping */
2700                 PyErr_SetString(PyExc_NotImplementedError,
2701                                 "1-n mappings are currently not implemented");
2702                 Py_DECREF(x);
2703                 goto onError;
2704             }
2705             *p++ = *PyUnicode_AS_UNICODE(x);
2706         }
2707         else {
2708             /* wrong return value */
2709             PyErr_SetString(PyExc_TypeError,
2710                   "translate mapping must return integer, None or unicode");
2711             Py_DECREF(x);
2712             goto onError;
2713         }
2714         Py_DECREF(x);
2715     }
2716     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2717         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2718             goto onError;
2719
2720  done:
2721     return (PyObject *)v;
2722
2723  onError:
2724     Py_XDECREF(v);
2725     return NULL;
2726 }
2727
2728 PyObject *PyUnicode_Translate(PyObject *str,
2729                               PyObject *mapping,
2730                               const char *errors)
2731 {
2732     PyObject *result;
2733
2734     str = PyUnicode_FromObject(str);
2735     if (str == NULL)
2736         goto onError;
2737     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2738                                         PyUnicode_GET_SIZE(str),
2739                                         mapping,
2740                                         errors);
2741     Py_DECREF(str);
2742     return result;
2743
2744  onError:
2745     Py_XDECREF(str);
2746     return NULL;
2747 }
2748
2749 /* --- Decimal Encoder ---------------------------------------------------- */
2750
2751 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2752                             int length,
2753                             char *output,
2754                             const char *errors)
2755 {
2756     Py_UNICODE *p, *end;
2757
2758     if (output == NULL) {
2759         PyErr_BadArgument();
2760         return -1;
2761     }
2762
2763     p = s;
2764     end = s + length;
2765     while (p < end) {
2766         register Py_UNICODE ch = *p++;
2767         int decimal;
2768
2769         if (Py_UNICODE_ISSPACE(ch)) {
2770             *output++ = ' ';
2771             continue;
2772         }
2773         decimal = Py_UNICODE_TODECIMAL(ch);
2774         if (decimal >= 0) {
2775             *output++ = '0' + decimal;
2776             continue;
2777         }
2778         if (0 < ch && ch < 256) {
2779             *output++ = (char)ch;
2780             continue;
2781         }
2782         /* All other characters are considered invalid */
2783         if (errors == NULL || strcmp(errors, "strict") == 0) {
2784             PyErr_SetString(PyExc_ValueError,
2785                             "invalid decimal Unicode string");
2786             goto onError;
2787         }
2788         else if (strcmp(errors, "ignore") == 0)
2789             continue;
2790         else if (strcmp(errors, "replace") == 0) {
2791             *output++ = '?';
2792             continue;
2793         }
2794     }
2795     /* 0-terminate the output string */
2796     *output++ = '\0';
2797     return 0;
2798
2799  onError:
2800     return -1;
2801 }
2802
2803 /* --- Helpers ------------------------------------------------------------ */
2804
2805 static
2806 int count(PyUnicodeObject *self,
2807           int start,
2808           int end,
2809           PyUnicodeObject *substring)
2810 {
2811     int count = 0;
2812
2813     if (start < 0)
2814         start += self->length;
2815     if (start < 0)
2816         start = 0;
2817     if (end > self->length)
2818         end = self->length;
2819     if (end < 0)
2820         end += self->length;
2821     if (end < 0)
2822         end = 0;
2823
2824     if (substring->length == 0)
2825         return (end - start + 1);
2826
2827     end -= substring->length;
2828
2829     while (start <= end)
2830         if (Py_UNICODE_MATCH(self, start, substring)) {
2831             count++;
2832             start += substring->length;
2833         } else
2834             start++;
2835
2836     return count;
2837 }
2838
2839 int PyUnicode_Count(PyObject *str,
2840                     PyObject *substr,
2841                     int start,
2842                     int end)
2843 {
2844     int result;
2845
2846     str = PyUnicode_FromObject(str);
2847     if (str == NULL)
2848         return -1;
2849     substr = PyUnicode_FromObject(substr);
2850     if (substr == NULL) {
2851         Py_DECREF(str);
2852         return -1;
2853     }
2854
2855     result = count((PyUnicodeObject *)str,
2856                    start, end,
2857                    (PyUnicodeObject *)substr);
2858
2859     Py_DECREF(str);
2860     Py_DECREF(substr);
2861     return result;
2862 }
2863
2864 static
2865 int findstring(PyUnicodeObject *self,
2866                PyUnicodeObject *substring,
2867                int start,
2868                int end,
2869                int direction)
2870 {
2871     if (start < 0)
2872         start += self->length;
2873     if (start < 0)
2874         start = 0;
2875
2876     if (substring->length == 0)
2877         return start;
2878
2879     if (end > self->length)
2880         end = self->length;
2881     if (end < 0)
2882         end += self->length;
2883     if (end < 0)
2884         end = 0;
2885
2886     end -= substring->length;
2887
2888     if (direction < 0) {
2889         for (; end >= start; end--)
2890             if (Py_UNICODE_MATCH(self, end, substring))
2891                 return end;
2892     } else {
2893         for (; start <= end; start++)
2894             if (Py_UNICODE_MATCH(self, start, substring))
2895                 return start;
2896     }
2897
2898     return -1;
2899 }
2900
2901 int PyUnicode_Find(PyObject *str,
2902                    PyObject *substr,
2903                    int start,
2904                    int end,
2905                    int direction)
2906 {
2907     int result;
2908
2909     str = PyUnicode_FromObject(str);
2910     if (str == NULL)
2911         return -1;
2912     substr = PyUnicode_FromObject(substr);
2913     if (substr == NULL) {
2914         Py_DECREF(substr);
2915         return -1;
2916     }
2917
2918     result = findstring((PyUnicodeObject *)str,
2919                         (PyUnicodeObject *)substr,
2920                         start, end, direction);
2921     Py_DECREF(str);
2922     Py_DECREF(substr);
2923     return result;
2924 }
2925
2926 static
2927 int tailmatch(PyUnicodeObject *self,
2928               PyUnicodeObject *substring,
2929               int start,
2930               int end,
2931               int direction)
2932 {
2933     if (start < 0)
2934         start += self->length;
2935     if (start < 0)
2936         start = 0;
2937
2938     if (substring->length == 0)
2939         return 1;
2940
2941     if (end > self->length)
2942         end = self->length;
2943     if (end < 0)
2944         end += self->length;
2945     if (end < 0)
2946         end = 0;
2947
2948     end -= substring->length;
2949     if (end < start)
2950         return 0;
2951
2952     if (direction > 0) {
2953         if (Py_UNICODE_MATCH(self, end, substring))
2954             return 1;
2955     } else {
2956         if (Py_UNICODE_MATCH(self, start, substring))
2957             return 1;
2958     }
2959
2960     return 0;
2961 }
2962
2963 int PyUnicode_Tailmatch(PyObject *str,
2964                         PyObject *substr,
2965                         int start,
2966                         int end,
2967                         int direction)
2968 {
2969     int result;
2970
2971     str = PyUnicode_FromObject(str);
2972     if (str == NULL)
2973         return -1;
2974     substr = PyUnicode_FromObject(substr);
2975     if (substr == NULL) {
2976         Py_DECREF(substr);
2977         return -1;
2978     }
2979
2980     result = tailmatch((PyUnicodeObject *)str,
2981                        (PyUnicodeObject *)substr,
2982                        start, end, direction);
2983     Py_DECREF(str);
2984     Py_DECREF(substr);
2985     return result;
2986 }
2987
2988 static
2989 const Py_UNICODE *findchar(const Py_UNICODE *s,
2990                      int size,
2991                      Py_UNICODE ch)
2992 {
2993     /* like wcschr, but doesn't stop at NULL characters */
2994
2995     while (size-- > 0) {
2996         if (*s == ch)
2997             return s;
2998         s++;
2999     }
3000
3001     return NULL;
3002 }
3003
3004 /* Apply fixfct filter to the Unicode object self and return a
3005    reference to the modified object */
3006
3007 static
3008 PyObject *fixup(PyUnicodeObject *self,
3009                 int (*fixfct)(PyUnicodeObject *s))
3010 {
3011
3012     PyUnicodeObject *u;
3013
3014     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3015     if (u == NULL)
3016         return NULL;
3017
3018     Py_UNICODE_COPY(u->str, self->str, self->length);
3019
3020     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3021         /* fixfct should return TRUE if it modified the buffer. If
3022            FALSE, return a reference to the original buffer instead
3023            (to save space, not time) */
3024         Py_INCREF(self);
3025         Py_DECREF(u);
3026         return (PyObject*) self;
3027     }
3028     return (PyObject*) u;
3029 }
3030
3031 static
3032 int fixupper(PyUnicodeObject *self)
3033 {
3034     int len = self->length;
3035     Py_UNICODE *s = self->str;
3036     int status = 0;
3037
3038     while (len-- > 0) {
3039         register Py_UNICODE ch;
3040
3041         ch = Py_UNICODE_TOUPPER(*s);
3042         if (ch != *s) {
3043             status = 1;
3044             *s = ch;
3045         }
3046         s++;
3047     }
3048
3049     return status;
3050 }
3051
3052 static
3053 int fixlower(PyUnicodeObject *self)
3054 {
3055     int len = self->length;
3056     Py_UNICODE *s = self->str;
3057     int status = 0;
3058
3059     while (len-- > 0) {
3060         register Py_UNICODE ch;
3061
3062         ch = Py_UNICODE_TOLOWER(*s);
3063         if (ch != *s) {
3064             status = 1;
3065             *s = ch;
3066         }
3067         s++;
3068     }
3069
3070     return status;
3071 }
3072
3073 static
3074 int fixswapcase(PyUnicodeObject *self)
3075 {
3076     int len = self->length;
3077     Py_UNICODE *s = self->str;
3078     int status = 0;
3079
3080     while (len-- > 0) {
3081         if (Py_UNICODE_ISUPPER(*s)) {
3082             *s = Py_UNICODE_TOLOWER(*s);
3083             status = 1;
3084         } else if (Py_UNICODE_ISLOWER(*s)) {
3085             *s = Py_UNICODE_TOUPPER(*s);
3086             status = 1;
3087         }
3088         s++;
3089     }
3090
3091     return status;
3092 }
3093
3094 static
3095 int fixcapitalize(PyUnicodeObject *self)
3096 {
3097     int len = self->length;
3098     Py_UNICODE *s = self->str;
3099     int status = 0;
3100
3101     if (len == 0)
3102         return 0;
3103     if (Py_UNICODE_ISLOWER(*s)) {
3104         *s = Py_UNICODE_TOUPPER(*s);
3105         status = 1;
3106     }
3107     s++;
3108     while (--len > 0) {
3109         if (Py_UNICODE_ISUPPER(*s)) {
3110             *s = Py_UNICODE_TOLOWER(*s);
3111             status = 1;
3112         }
3113         s++;
3114     }
3115     return status;
3116 }
3117
3118 static
3119 int fixtitle(PyUnicodeObject *self)
3120 {
3121     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3122     register Py_UNICODE *e;
3123     int previous_is_cased;
3124
3125     /* Shortcut for single character strings */
3126     if (PyUnicode_GET_SIZE(self) == 1) {
3127         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3128         if (*p != ch) {
3129             *p = ch;
3130             return 1;
3131         }
3132         else
3133             return 0;
3134     }
3135
3136     e = p + PyUnicode_GET_SIZE(self);
3137     previous_is_cased = 0;
3138     for (; p < e; p++) {
3139         register const Py_UNICODE ch = *p;
3140
3141         if (previous_is_cased)
3142             *p = Py_UNICODE_TOLOWER(ch);
3143         else
3144             *p = Py_UNICODE_TOTITLE(ch);
3145
3146         if (Py_UNICODE_ISLOWER(ch) ||
3147             Py_UNICODE_ISUPPER(ch) ||
3148             Py_UNICODE_ISTITLE(ch))
3149             previous_is_cased = 1;
3150         else
3151             previous_is_cased = 0;
3152     }
3153     return 1;
3154 }
3155
3156 PyObject *PyUnicode_Join(PyObject *separator,
3157                          PyObject *seq)
3158 {
3159     Py_UNICODE *sep;
3160     int seplen;
3161     PyUnicodeObject *res = NULL;
3162     int reslen = 0;
3163     Py_UNICODE *p;
3164     int sz = 100;
3165     int i;
3166     PyObject *it;
3167
3168     it = PyObject_GetIter(seq);
3169     if (it == NULL)
3170         return NULL;
3171
3172     if (separator == NULL) {
3173         Py_UNICODE blank = ' ';
3174         sep = &blank;
3175         seplen = 1;
3176     }
3177     else {
3178         separator = PyUnicode_FromObject(separator);
3179         if (separator == NULL)
3180             goto onError;
3181         sep = PyUnicode_AS_UNICODE(separator);
3182         seplen = PyUnicode_GET_SIZE(separator);
3183     }
3184
3185     res = _PyUnicode_New(sz);
3186     if (res == NULL)
3187         goto onError;
3188     p = PyUnicode_AS_UNICODE(res);
3189     reslen = 0;
3190
3191     for (i = 0; ; ++i) {
3192         int itemlen;
3193         PyObject *item = PyIter_Next(it);
3194         if (item == NULL) {
3195             if (PyErr_Occurred())
3196                 goto onError;
3197             break;
3198         }
3199         if (!PyUnicode_Check(item)) {
3200             PyObject *v;
3201             if (!PyString_Check(item)) {
3202                 PyErr_Format(PyExc_TypeError,
3203                              "sequence item %i: expected string or Unicode,"
3204                              " %.80s found",
3205                              i, item->ob_type->tp_name);
3206                 Py_DECREF(item);
3207                 goto onError;
3208             }
3209             v = PyUnicode_FromObject(item);
3210             Py_DECREF(item);
3211             item = v;
3212             if (item == NULL)
3213                 goto onError;
3214         }
3215         itemlen = PyUnicode_GET_SIZE(item);
3216         while (reslen + itemlen + seplen >= sz) {
3217             if (_PyUnicode_Resize(&res, sz*2)) {
3218                 Py_DECREF(item);
3219                 goto onError;
3220             }
3221             sz *= 2;
3222             p = PyUnicode_AS_UNICODE(res) + reslen;
3223         }
3224         if (i > 0) {
3225             Py_UNICODE_COPY(p, sep, seplen);
3226             p += seplen;
3227             reslen += seplen;
3228         }
3229         Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3230         p += itemlen;
3231         reslen += itemlen;
3232         Py_DECREF(item);
3233     }
3234     if (_PyUnicode_Resize(&res, reslen))
3235         goto onError;
3236
3237     Py_XDECREF(separator);
3238     Py_DECREF(it);
3239     return (PyObject *)res;
3240
3241  onError:
3242     Py_XDECREF(separator);
3243     Py_XDECREF(res);
3244     Py_DECREF(it);
3245     return NULL;
3246 }
3247
3248 static
3249 PyUnicodeObject *pad(PyUnicodeObject *self,
3250                      int left,
3251                      int right,
3252                      Py_UNICODE fill)
3253 {
3254     PyUnicodeObject *u;
3255
3256     if (left < 0)
3257         left = 0;
3258     if (right < 0)
3259         right = 0;
3260
3261     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3262         Py_INCREF(self);
3263         return self;
3264     }
3265
3266     u = _PyUnicode_New(left + self->length + right);
3267     if (u) {
3268         if (left)
3269             Py_UNICODE_FILL(u->str, fill, left);
3270         Py_UNICODE_COPY(u->str + left, self->str, self->length);
3271         if (right)
3272             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3273     }
3274
3275     return u;
3276 }
3277
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.

   The macro relies on names provided by the function using it: a
   "PyObject *str" scratch variable, the target "list" and an "onError"
   label that is jumped to when creating or appending the slice fails.

   It is wrapped in do { } while (0) so that it behaves like a single
   statement in every context, and its arguments are parenthesized so
   that compound expressions can be passed safely. */

#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3288
3289 static
3290 PyObject *split_whitespace(PyUnicodeObject *self,
3291                            PyObject *list,
3292                            int maxcount)
3293 {
3294     register int i;
3295     register int j;
3296     int len = self->length;
3297     PyObject *str;
3298
3299     for (i = j = 0; i < len; ) {
3300         /* find a token */
3301         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3302             i++;
3303         j = i;
3304         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3305             i++;
3306         if (j < i) {
3307             if (maxcount-- <= 0)
3308                 break;
3309             SPLIT_APPEND(self->str, j, i);
3310             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3311                 i++;
3312             j = i;
3313         }
3314     }
3315     if (j < len) {
3316         SPLIT_APPEND(self->str, j, len);
3317     }
3318     return list;
3319
3320  onError:
3321     Py_DECREF(list);
3322     return NULL;
3323 }
3324
3325 PyObject *PyUnicode_Splitlines(PyObject *string,
3326                                int keepends)
3327 {
3328     register int i;
3329     register int j;
3330     int len;
3331     PyObject *list;
3332     PyObject *str;
3333     Py_UNICODE *data;
3334
3335     string = PyUnicode_FromObject(string);
3336     if (string == NULL)
3337         return NULL;
3338     data = PyUnicode_AS_UNICODE(string);
3339     len = PyUnicode_GET_SIZE(string);
3340
3341     list = PyList_New(0);
3342     if (!list)
3343         goto onError;
3344
3345     for (i = j = 0; i < len; ) {
3346         int eol;
3347
3348         /* Find a line and append it */
3349         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3350             i++;
3351
3352         /* Skip the line break reading CRLF as one line break */
3353         eol = i;
3354         if (i < len) {
3355             if (data[i] == '\r' && i + 1 < len &&
3356                 data[i+1] == '\n')
3357                 i += 2;
3358             else
3359                 i++;
3360             if (keepends)
3361                 eol = i;
3362         }
3363         SPLIT_APPEND(data, j, eol);
3364         j = i;
3365     }
3366     if (j < len) {
3367         SPLIT_APPEND(data, j, len);
3368     }
3369
3370     Py_DECREF(string);
3371     return list;
3372
3373  onError:
3374     Py_DECREF(list);
3375     Py_DECREF(string);
3376     return NULL;
3377 }
3378
3379 static
3380 PyObject *split_char(PyUnicodeObject *self,
3381                      PyObject *list,
3382                      Py_UNICODE ch,
3383                      int maxcount)
3384 {
3385     register int i;
3386     register int j;
3387     int len = self->length;
3388     PyObject *str;
3389
3390     for (i = j = 0; i < len; ) {
3391         if (self->str[i] == ch) {
3392             if (maxcount-- <= 0)
3393                 break;
3394             SPLIT_APPEND(self->str, j, i);
3395             i = j = i + 1;
3396         } else
3397             i++;
3398     }
3399     if (j <= len) {
3400         SPLIT_APPEND(self->str, j, len);
3401     }
3402     return list;
3403
3404  onError:
3405     Py_DECREF(list);
3406     return NULL;
3407 }
3408
3409 static
3410 PyObject *split_substring(PyUnicodeObject *self,
3411                           PyObject *list,
3412                           PyUnicodeObject *substring,
3413                           int maxcount)
3414 {
3415     register int i;
3416     register int j;
3417     int len = self->length;
3418     int sublen = substring->length;
3419     PyObject *str;
3420
3421     for (i = j = 0; i <= len - sublen; ) {
3422         if (Py_UNICODE_MATCH(self, i, substring)) {
3423             if (maxcount-- <= 0)
3424                 break;
3425             SPLIT_APPEND(self->str, j, i);
3426             i = j = i + sublen;
3427         } else
3428             i++;
3429     }
3430     if (j <= len) {
3431         SPLIT_APPEND(self->str, j, len);
3432     }
3433     return list;
3434
3435  onError:
3436     Py_DECREF(list);
3437     return NULL;
3438 }
3439
3440 #undef SPLIT_APPEND
3441
3442 static
3443 PyObject *split(PyUnicodeObject *self,
3444                 PyUnicodeObject *substring,
3445                 int maxcount)
3446 {
3447     PyObject *list;
3448
3449     if (maxcount < 0)
3450         maxcount = INT_MAX;
3451
3452     list = PyList_New(0);
3453     if (!list)
3454         return NULL;
3455
3456     if (substring == NULL)
3457         return split_whitespace(self,list,maxcount);
3458
3459     else if (substring->length == 1)
3460         return split_char(self,list,substring->str[0],maxcount);
3461
3462     else if (substring->length == 0) {
3463         Py_DECREF(list);
3464         PyErr_SetString(PyExc_ValueError, "empty separator");
3465         return NULL;
3466     }
3467     else
3468         return split_substring(self,list,substring,maxcount);
3469 }
3470
3471 static
3472 PyObject *strip(PyUnicodeObject *self,
3473                 int left,
3474                 int right)
3475 {
3476     Py_UNICODE *p = self->str;
3477     int start = 0;
3478     int end = self->length;
3479
3480     if (left)
3481         while (start < end && Py_UNICODE_ISSPACE(p[start]))
3482             start++;
3483
3484     if (right)
3485         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3486             end--;
3487
3488     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3489         /* couldn't strip anything off, return original string */
3490         Py_INCREF(self);
3491         return (PyObject*) self;
3492     }
3493
3494     return (PyObject*) PyUnicode_FromUnicode(
3495         self->str + start,
3496         end - start
3497         );
3498 }
3499
3500 static
3501 PyObject *replace(PyUnicodeObject *self,
3502                   PyUnicodeObject *str1,
3503                   PyUnicodeObject *str2,
3504                   int maxcount)
3505 {
3506     PyUnicodeObject *u;
3507
3508     if (maxcount < 0)
3509         maxcount = INT_MAX;
3510
3511     if (str1->length == 1 && str2->length == 1) {
3512         int i;
3513
3514         /* replace characters */
3515         if (!findchar(self->str, self->length, str1->str[0]) &&
3516             PyUnicode_CheckExact(self)) {
3517             /* nothing to replace, return original string */
3518             Py_INCREF(self);
3519             u = self;
3520         } else {
3521             Py_UNICODE u1 = str1->str[0];
3522             Py_UNICODE u2 = str2->str[0];
3523
3524             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3525                 NULL,
3526                 self->length
3527                 );
3528             if (u != NULL) {
3529                 Py_UNICODE_COPY(u->str, self->str,
3530                                 self->length);
3531                 for (i = 0; i < u->length; i++)
3532                     if (u->str[i] == u1) {
3533                         if (--maxcount < 0)
3534                             break;
3535                         u->str[i] = u2;
3536                     }
3537         }
3538         }
3539
3540     } else {
3541         int n, i;
3542         Py_UNICODE *p;
3543
3544         /* replace strings */
3545         n = count(self, 0, self->length, str1);
3546         if (n > maxcount)
3547             n = maxcount;
3548         if (n == 0 && PyUnicode_CheckExact(self)) {
3549             /* nothing to replace, return original string */
3550             Py_INCREF(self);
3551             u = self;
3552         } else {
3553             u = _PyUnicode_New(
3554                 self->length + n * (str2->length - str1->length));
3555             if (u) {
3556                 i = 0;
3557                 p = u->str;
3558                 while (i <= self->length - str1->length)
3559                     if (Py_UNICODE_MATCH(self, i, str1)) {
3560                         /* replace string segment */
3561                         Py_UNICODE_COPY(p, str2->str, str2->length);
3562                         p += str2->length;
3563                         i += str1->length;
3564                         if (--n <= 0) {
3565                             /* copy remaining part */
3566                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3567                             break;
3568                         }
3569                     } else
3570                         *p++ = self->str[i++];
3571             }
3572         }
3573     }
3574
3575     return (PyObject *) u;
3576 }
3577
3578 /* --- Unicode Object Methods --------------------------------------------- */
3579
3580 static char title__doc__[] =
3581 "S.title() -> unicode\n\
3582 \n\
3583 Return a titlecased version of S, i.e. words start with title case\n\
3584 characters, all remaining cased characters have lower case.";
3585
3586 static PyObject*
3587 unicode_title(PyUnicodeObject *self)
3588 {
3589     return fixup(self, fixtitle);
3590 }
3591
3592 static char capitalize__doc__[] =
3593 "S.capitalize() -> unicode\n\
3594 \n\
3595 Return a capitalized version of S, i.e. make the first character\n\
3596 have upper case.";
3597
3598 static PyObject*
3599 unicode_capitalize(PyUnicodeObject *self)
3600 {
3601     return fixup(self, fixcapitalize);
3602 }
3603
#if 0
/* Disabled: S.capwords() is compiled out. */

static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Split self at whitespace, capitalize every word and join the words
   again with single blanks.  Returns NULL on failure. */

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list items in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3641
3642 static char center__doc__[] =
3643 "S.center(width) -> unicode\n\
3644 \n\
3645 Return S centered in a Unicode string of length width. Padding is done\n\
3646 using spaces.";
3647
3648 static PyObject *
3649 unicode_center(PyUnicodeObject *self, PyObject *args)
3650 {
3651     int marg, left;
3652     int width;
3653
3654     if (!PyArg_ParseTuple(args, "i:center", &width))
3655         return NULL;
3656
3657     if (self->length >= width && PyUnicode_CheckExact(self)) {
3658         Py_INCREF(self);
3659         return (PyObject*) self;
3660     }
3661
3662     marg = width - self->length;
3663     left = marg / 2 + (marg & width & 1);
3664
3665     return (PyObject*) pad(self, left, marg - left, ' ');
3666 }
3667
3668 #if 0
3669
3670 /* This code should go into some future Unicode collation support
3671    module. The basic comparison should compare ordinals on a naive
3672    basis (this is what Java does and thus JPython too). */
3673
3674 /* speedy UTF-16 code point order comparison */
3675 /* gleaned from: */
3676 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3677
3678 static short utf16Fixup[32] =
3679 {
3680     0, 0, 0, 0, 0, 0, 0, 0,
3681     0, 0, 0, 0, 0, 0, 0, 0,
3682     0, 0, 0, 0, 0, 0, 0, 0,
3683     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3684 };
3685
3686 static int
3687 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3688 {
3689     int len1, len2;
3690
3691     Py_UNICODE *s1 = str1->str;
3692     Py_UNICODE *s2 = str2->str;
3693
3694     len1 = str1->length;
3695     len2 = str2->length;
3696
3697     while (len1 > 0 && len2 > 0) {
3698         Py_UNICODE c1, c2;
3699
3700         c1 = *s1++;
3701         c2 = *s2++;
3702
3703         if (c1 > (1<<11) * 26)
3704             c1 += utf16Fixup[c1>>11];
3705         if (c2 > (1<<11) * 26)
3706             c2 += utf16Fixup[c2>>11];
3707         /* now c1 and c2 are in UTF-32-compatible order */
3708
3709         if (c1 != c2)
3710             return (c1 < c2) ? -1 : 1;
3711
3712         len1--; len2--;
3713     }
3714
3715     return (len1 < len2) ? -1 : (len1 != len2);
3716 }
3717
3718 #else
3719
3720 static int
3721 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3722 {
3723     register int len1, len2;
3724
3725     Py_UNICODE *s1 = str1->str;
3726     Py_UNICODE *s2 = str2->str;
3727
3728     len1 = str1->length;
3729     len2 = str2->length;
3730
3731     while (len1 > 0 && len2 > 0) {
3732         Py_UNICODE c1, c2;
3733
3734         c1 = *s1++;
3735         c2 = *s2++;
3736
3737         if (c1 != c2)
3738             return (c1 < c2) ? -1 : 1;
3739
3740         len1--; len2--;
3741     }
3742
3743     return (len1 < len2) ? -1 : (len1 != len2);
3744 }
3745
3746 #endif
3747
3748 int PyUnicode_Compare(PyObject *left,
3749                       PyObject *right)
3750 {
3751     PyUnicodeObject *u = NULL, *v = NULL;
3752     int result;
3753
3754     /* Coerce the two arguments */
3755     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3756     if (u == NULL)
3757         goto onError;
3758     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3759     if (v == NULL)
3760         goto onError;
3761
3762     /* Shortcut for empty or interned objects */
3763     if (v == u) {
3764         Py_DECREF(u);
3765         Py_DECREF(v);
3766         return 0;
3767     }
3768
3769     result = unicode_compare(u, v);
3770
3771     Py_DECREF(u);
3772     Py_DECREF(v);
3773     return result;
3774
3775 onError:
3776     Py_XDECREF(u);
3777     Py_XDECREF(v);
3778     return -1;
3779 }
3780
3781 int PyUnicode_Contains(PyObject *container,
3782                        PyObject *element)
3783 {
3784     PyUnicodeObject *u = NULL, *v = NULL;
3785     int result;
3786     register const Py_UNICODE *p, *e;
3787     register Py_UNICODE ch;
3788
3789     /* Coerce the two arguments */
3790     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3791     if (v == NULL) {
3792         PyErr_SetString(PyExc_TypeError,
3793             "'in <string>' requires character as left operand");
3794         goto onError;
3795     }
3796     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3797     if (u == NULL) {
3798         Py_DECREF(v);
3799         goto onError;
3800     }
3801
3802     /* Check v in u */
3803     if (PyUnicode_GET_SIZE(v) != 1) {
3804         PyErr_SetString(PyExc_TypeError,
3805             "'in <string>' requires character as left operand");
3806         goto onError;
3807     }
3808     ch = *PyUnicode_AS_UNICODE(v);
3809     p = PyUnicode_AS_UNICODE(u);
3810     e = p + PyUnicode_GET_SIZE(u);
3811     result = 0;
3812     while (p < e) {
3813         if (*p++ == ch) {
3814             result = 1;
3815             break;
3816         }
3817     }
3818
3819     Py_DECREF(u);
3820     Py_DECREF(v);
3821     return result;
3822
3823 onError:
3824     Py_XDECREF(u);
3825     Py_XDECREF(v);
3826     return -1;
3827 }
3828
3829 /* Concat to string or Unicode object giving a new Unicode object. */
3830
3831 PyObject *PyUnicode_Concat(PyObject *left,
3832                            PyObject *right)
3833 {
3834     PyUnicodeObject *u = NULL, *v = NULL, *w;
3835
3836     /* Coerce the two arguments */
3837     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3838     if (u == NULL)
3839         goto onError;
3840     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3841     if (v == NULL)
3842         goto onError;
3843
3844     /* Shortcuts */
3845     if (v == unicode_empty) {
3846         Py_DECREF(v);
3847         return (PyObject *)u;
3848     }
3849     if (u == unicode_empty) {
3850         Py_DECREF(u);
3851         return (PyObject *)v;
3852     }
3853
3854     /* Concat the two Unicode strings */
3855     w = _PyUnicode_New(u->length + v->length);
3856     if (w == NULL)
3857         goto onError;
3858     Py_UNICODE_COPY(w->str, u->str, u->length);
3859     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3860
3861     Py_DECREF(u);
3862     Py_DECREF(v);
3863     return (PyObject *)w;
3864
3865 onError:
3866     Py_XDECREF(u);
3867     Py_XDECREF(v);
3868     return NULL;
3869 }
3870
3871 static char count__doc__[] =
3872 "S.count(sub[, start[, end]]) -> int\n\
3873 \n\
3874 Return the number of occurrences of substring sub in Unicode string\n\
3875 S[start:end].  Optional arguments start and end are\n\
3876 interpreted as in slice notation.";
3877
3878 static PyObject *
3879 unicode_count(PyUnicodeObject *self, PyObject *args)
3880 {
3881     PyUnicodeObject *substring;
3882     int start = 0;
3883     int end = INT_MAX;
3884     PyObject *result;
3885
3886     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3887                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3888         return NULL;
3889
3890     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3891                                                 (PyObject *)substring);
3892     if (substring == NULL)
3893         return NULL;
3894
3895     if (start < 0)
3896         start += self->length;
3897     if (start < 0)
3898         start = 0;
3899     if (end > self->length)
3900         end = self->length;
3901     if (end < 0)
3902         end += self->length;
3903     if (end < 0)
3904         end = 0;
3905
3906     result = PyInt_FromLong((long) count(self, start, end, substring));
3907
3908     Py_DECREF(substring);
3909     return result;
3910 }
3911
3912 static char encode__doc__[] =
3913 "S.encode([encoding[,errors]]) -> string\n\
3914 \n\
3915 Return an encoded string version of S. Default encoding is the current\n\
3916 default string encoding. errors may be given to set a different error\n\
3917 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3918 a ValueError. Other possible values are 'ignore' and 'replace'.";
3919
3920 static PyObject *
3921 unicode_encode(PyUnicodeObject *self, PyObject *args)
3922 {
3923     char *encoding = NULL;
3924     char *errors = NULL;
3925     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3926         return NULL;
3927     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3928 }
3929
3930 static char expandtabs__doc__[] =
3931 "S.expandtabs([tabsize]) -> unicode\n\
3932 \n\
3933 Return a copy of S where all tab characters are expanded using spaces.\n\
3934 If tabsize is not given, a tab size of 8 characters is assumed.";
3935
3936 static PyObject*
3937 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3938 {
3939     Py_UNICODE *e;
3940     Py_UNICODE *p;
3941     Py_UNICODE *q;
3942     int i, j;
3943     PyUnicodeObject *u;
3944     int tabsize = 8;
3945
3946     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3947         return NULL;
3948
3949     /* First pass: determine size of output string */
3950     i = j = 0;
3951     e = self->str + self->length;
3952     for (p = self->str; p < e; p++)
3953         if (*p == '\t') {
3954             if (tabsize > 0)
3955                 j += tabsize - (j % tabsize);
3956         }
3957         else {
3958             j++;
3959             if (*p == '\n' || *p == '\r') {
3960                 i += j;
3961                 j = 0;
3962             }
3963         }
3964
3965     /* Second pass: create output string and fill it */
3966     u = _PyUnicode_New(i + j);
3967     if (!u)
3968         return NULL;
3969
3970     j = 0;
3971     q = u->str;
3972
3973     for (p = self->str; p < e; p++)
3974         if (*p == '\t') {
3975             if (tabsize > 0) {
3976                 i = tabsize - (j % tabsize);
3977                 j += i;
3978                 while (i--)
3979                     *q++ = ' ';
3980             }
3981         }
3982         else {
3983             j++;
3984             *q++ = *p;
3985             if (*p == '\n' || *p == '\r')
3986                 j = 0;
3987         }
3988
3989     return (PyObject*) u;
3990 }
3991
3992 static char find__doc__[] =
3993 "S.find(sub [,start [,end]]) -> int\n\
3994 \n\
3995 Return the lowest index in S where substring sub is found,\n\
3996 such that sub is contained within s[start,end].  Optional\n\
3997 arguments start and end are interpreted as in slice notation.\n\
3998 \n\
3999 Return -1 on failure.";
4000
4001 static PyObject *
4002 unicode_find(PyUnicodeObject *self, PyObject *args)
4003 {
4004     PyUnicodeObject *substring;
4005     int start = 0;
4006     int end = INT_MAX;
4007     PyObject *result;
4008
4009     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4010                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4011         return NULL;
4012     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4013                                                 (PyObject *)substring);
4014     if (substring == NULL)
4015         return NULL;
4016
4017     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4018
4019     Py_DECREF(substring);
4020     return result;
4021 }
4022
4023 static PyObject *
4024 unicode_getitem(PyUnicodeObject *self, int index)
4025 {
4026     if (index < 0 || index >= self->length) {
4027         PyErr_SetString(PyExc_IndexError, "string index out of range");
4028         return NULL;
4029     }
4030
4031     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4032 }
4033
4034 static long
4035 unicode_hash(PyUnicodeObject *self)
4036 {
4037     /* Since Unicode objects compare equal to their ASCII string
4038        counterparts, they should use the individual character values
4039        as basis for their hash value.  This is needed to assure that
4040        strings and Unicode objects behave in the same way as
4041        dictionary keys. */
4042
4043     register int len;
4044     register Py_UNICODE *p;
4045     register long x;
4046
4047     if (self->hash != -1)
4048         return self->hash;
4049     len = PyUnicode_GET_SIZE(self);
4050     p = PyUnicode_AS_UNICODE(self);
4051     x = *p << 7;
4052     while (--len >= 0)
4053         x = (1000003*x) ^ *p++;
4054     x ^= PyUnicode_GET_SIZE(self);
4055     if (x == -1)
4056         x = -2;
4057     self->hash = x;
4058     return x;
4059 }
4060
4061 static char index__doc__[] =
4062 "S.index(sub [,start [,end]]) -> int\n\
4063 \n\
4064 Like S.find() but raise ValueError when the substring is not found.";
4065
4066 static PyObject *
4067 unicode_index(PyUnicodeObject *self, PyObject *args)
4068 {
4069     int result;
4070     PyUnicodeObject *substring;
4071     int start = 0;
4072     int end = INT_MAX;
4073
4074     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4075                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4076         return NULL;
4077
4078     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4079                                                 (PyObject *)substring);
4080     if (substring == NULL)
4081         return NULL;
4082
4083     result = findstring(self, substring, start, end, 1);
4084
4085     Py_DECREF(substring);
4086     if (result < 0) {
4087         PyErr_SetString(PyExc_ValueError, "substring not found");
4088         return NULL;
4089     }
4090     return PyInt_FromLong(result);
4091 }
4092
4093 static char islower__doc__[] =
4094 "S.islower() -> int\n\
4095 \n\
4096 Return 1 if  all cased characters in S are lowercase and there is\n\
4097 at least one cased character in S, 0 otherwise.";
4098
4099 static PyObject*
4100 unicode_islower(PyUnicodeObject *self)
4101 {
4102     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4103     register const Py_UNICODE *e;
4104     int cased;
4105
4106     /* Shortcut for single character strings */
4107     if (PyUnicode_GET_SIZE(self) == 1)
4108         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4109
4110     /* Special case for empty strings */
4111     if (PyString_GET_SIZE(self) == 0)
4112         return PyInt_FromLong(0);
4113
4114     e = p + PyUnicode_GET_SIZE(self);
4115     cased = 0;
4116     for (; p < e; p++) {
4117         register const Py_UNICODE ch = *p;
4118
4119         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4120             return PyInt_FromLong(0);
4121         else if (!cased && Py_UNICODE_ISLOWER(ch))
4122             cased = 1;
4123     }
4124     return PyInt_FromLong(cased);
4125 }
4126
4127 static char isupper__doc__[] =
4128 "S.isupper() -> int\n\
4129 \n\
4130 Return 1 if  all cased characters in S are uppercase and there is\n\
4131 at least one cased character in S, 0 otherwise.";
4132
4133 static PyObject*
4134 unicode_isupper(PyUnicodeObject *self)
4135 {
4136     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4137     register const Py_UNICODE *e;
4138     int cased;
4139
4140     /* Shortcut for single character strings */
4141     if (PyUnicode_GET_SIZE(self) == 1)
4142         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4143
4144     /* Special case for empty strings */
4145     if (PyString_GET_SIZE(self) == 0)
4146         return PyInt_FromLong(0);
4147
4148     e = p + PyUnicode_GET_SIZE(self);
4149     cased = 0;
4150     for (; p < e; p++) {
4151         register const Py_UNICODE ch = *p;
4152
4153         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4154             return PyInt_FromLong(0);
4155         else if (!cased && Py_UNICODE_ISUPPER(ch))
4156             cased = 1;
4157     }
4158     return PyInt_FromLong(cased);
4159 }
4160
4161 static char istitle__doc__[] =
4162 "S.istitle() -> int\n\
4163 \n\
4164 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4165 may only follow uncased characters and lowercase characters only cased\n\
4166 ones. Return 0 otherwise.";
4167
4168 static PyObject*
4169 unicode_istitle(PyUnicodeObject *self)
4170 {
4171     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4172     register const Py_UNICODE *e;
4173     int cased, previous_is_cased;
4174
4175     /* Shortcut for single character strings */
4176     if (PyUnicode_GET_SIZE(self) == 1)
4177         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4178                               (Py_UNICODE_ISUPPER(*p) != 0));
4179
4180     /* Special case for empty strings */
4181     if (PyString_GET_SIZE(self) == 0)
4182         return PyInt_FromLong(0);
4183
4184     e = p + PyUnicode_GET_SIZE(self);
4185     cased = 0;
4186     previous_is_cased = 0;
4187     for (; p < e; p++) {
4188         register const Py_UNICODE ch = *p;
4189
4190         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4191             if (previous_is_cased)
4192                 return PyInt_FromLong(0);
4193             previous_is_cased = 1;
4194             cased = 1;
4195         }
4196         else if (Py_UNICODE_ISLOWER(ch)) {
4197             if (!previous_is_cased)
4198                 return PyInt_FromLong(0);
4199             previous_is_cased = 1;
4200             cased = 1;
4201         }
4202         else
4203             previous_is_cased = 0;
4204     }
4205     return PyInt_FromLong(cased);
4206 }
4207
4208 static char isspace__doc__[] =
4209 "S.isspace() -> int\n\
4210 \n\
4211 Return 1 if there are only whitespace characters in S,\n\
4212 0 otherwise.";
4213
4214 static PyObject*
4215 unicode_isspace(PyUnicodeObject *self)
4216 {
4217     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4218     register const Py_UNICODE *e;
4219
4220     /* Shortcut for single character strings */
4221     if (PyUnicode_GET_SIZE(self) == 1 &&
4222         Py_UNICODE_ISSPACE(*p))
4223         return PyInt_FromLong(1);
4224
4225     /* Special case for empty strings */
4226     if (PyString_GET_SIZE(self) == 0)
4227         return PyInt_FromLong(0);
4228
4229     e = p + PyUnicode_GET_SIZE(self);
4230     for (; p < e; p++) {
4231         if (!Py_UNICODE_ISSPACE(*p))
4232             return PyInt_FromLong(0);
4233     }
4234     return PyInt_FromLong(1);
4235 }
4236
4237 static char isalpha__doc__[] =
4238 "S.isalpha() -> int\n\
4239 \n\
4240 Return 1 if  all characters in S are alphabetic\n\
4241 and there is at least one character in S, 0 otherwise.";
4242
4243 static PyObject*
4244 unicode_isalpha(PyUnicodeObject *self)
4245 {
4246     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4247     register const Py_UNICODE *e;
4248
4249     /* Shortcut for single character strings */
4250     if (PyUnicode_GET_SIZE(self) == 1 &&
4251         Py_UNICODE_ISALPHA(*p))
4252         return PyInt_FromLong(1);
4253
4254     /* Special case for empty strings */
4255     if (PyString_GET_SIZE(self) == 0)
4256         return PyInt_FromLong(0);
4257
4258     e = p + PyUnicode_GET_SIZE(self);
4259     for (; p < e; p++) {
4260         if (!Py_UNICODE_ISALPHA(*p))
4261             return PyInt_FromLong(0);
4262     }
4263     return PyInt_FromLong(1);
4264 }
4265
4266 static char isalnum__doc__[] =
4267 "S.isalnum() -> int\n\
4268 \n\
4269 Return 1 if  all characters in S are alphanumeric\n\
4270 and there is at least one character in S, 0 otherwise.";
4271
4272 static PyObject*
4273 unicode_isalnum(PyUnicodeObject *self)
4274 {
4275     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4276     register const Py_UNICODE *e;
4277
4278     /* Shortcut for single character strings */
4279     if (PyUnicode_GET_SIZE(self) == 1 &&
4280         Py_UNICODE_ISALNUM(*p))
4281         return PyInt_FromLong(1);
4282
4283     /* Special case for empty strings */
4284     if (PyString_GET_SIZE(self) == 0)
4285         return PyInt_FromLong(0);
4286
4287     e = p + PyUnicode_GET_SIZE(self);
4288     for (; p < e; p++) {
4289         if (!Py_UNICODE_ISALNUM(*p))
4290             return PyInt_FromLong(0);
4291     }
4292     return PyInt_FromLong(1);
4293 }
4294
4295 static char isdecimal__doc__[] =
4296 "S.isdecimal() -> int\n\
4297 \n\
4298 Return 1 if there are only decimal characters in S,\n\
4299 0 otherwise.";
4300
4301 static PyObject*
4302 unicode_isdecimal(PyUnicodeObject *self)
4303 {
4304     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4305     register const Py_UNICODE *e;
4306
4307     /* Shortcut for single character strings */
4308     if (PyUnicode_GET_SIZE(self) == 1 &&
4309         Py_UNICODE_ISDECIMAL(*p))
4310         return PyInt_FromLong(1);
4311
4312     /* Special case for empty strings */
4313     if (PyString_GET_SIZE(self) == 0)
4314         return PyInt_FromLong(0);
4315
4316     e = p + PyUnicode_GET_SIZE(self);
4317     for (; p < e; p++) {
4318         if (!Py_UNICODE_ISDECIMAL(*p))
4319             return PyInt_FromLong(0);
4320     }
4321     return PyInt_FromLong(1);
4322 }
4323
4324 static char isdigit__doc__[] =
4325 "S.isdigit() -> int\n\
4326 \n\
4327 Return 1 if there are only digit characters in S,\n\
4328 0 otherwise.";
4329
4330 static PyObject*
4331 unicode_isdigit(PyUnicodeObject *self)
4332 {
4333     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4334     register const Py_UNICODE *e;
4335
4336     /* Shortcut for single character strings */
4337     if (PyUnicode_GET_SIZE(self) == 1 &&
4338         Py_UNICODE_ISDIGIT(*p))
4339         return PyInt_FromLong(1);
4340
4341     /* Special case for empty strings */
4342     if (PyString_GET_SIZE(self) == 0)
4343         return PyInt_FromLong(0);
4344
4345     e = p + PyUnicode_GET_SIZE(self);
4346     for (; p < e; p++) {
4347         if (!Py_UNICODE_ISDIGIT(*p))
4348             return PyInt_FromLong(0);
4349     }
4350     return PyInt_FromLong(1);
4351 }
4352
4353 static char isnumeric__doc__[] =
4354 "S.isnumeric() -> int\n\
4355 \n\
4356 Return 1 if there are only numeric characters in S,\n\
4357 0 otherwise.";
4358
4359 static PyObject*
4360 unicode_isnumeric(PyUnicodeObject *self)
4361 {
4362     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4363     register const Py_UNICODE *e;
4364
4365     /* Shortcut for single character strings */
4366     if (PyUnicode_GET_SIZE(self) == 1 &&
4367         Py_UNICODE_ISNUMERIC(*p))
4368         return PyInt_FromLong(1);
4369
4370     /* Special case for empty strings */
4371     if (PyString_GET_SIZE(self) == 0)
4372         return PyInt_FromLong(0);
4373
4374     e = p + PyUnicode_GET_SIZE(self);
4375     for (; p < e; p++) {
4376         if (!Py_UNICODE_ISNUMERIC(*p))
4377             return PyInt_FromLong(0);
4378     }
4379     return PyInt_FromLong(1);
4380 }
4381
4382 static char join__doc__[] =
4383 "S.join(sequence) -> unicode\n\
4384 \n\
4385 Return a string which is the concatenation of the strings in the\n\
4386 sequence.  The separator between elements is S.";
4387
4388 static PyObject*
4389 unicode_join(PyObject *self, PyObject *data)
4390 {
4391     return PyUnicode_Join(self, data);
4392 }
4393
4394 static int
4395 unicode_length(PyUnicodeObject *self)
4396 {
4397     return self->length;
4398 }
4399
4400 static char ljust__doc__[] =
4401 "S.ljust(width) -> unicode\n\
4402 \n\
4403 Return S left justified in a Unicode string of length width. Padding is\n\
4404 done using spaces.";
4405
4406 static PyObject *
4407 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4408 {
4409     int width;
4410     if (!PyArg_ParseTuple(args, "i:ljust", &width))
4411         return NULL;
4412
4413     if (self->length >= width && PyUnicode_CheckExact(self)) {
4414         Py_INCREF(self);
4415         return (PyObject*) self;
4416     }
4417
4418     return (PyObject*) pad(self, 0, width - self->length, ' ');
4419 }
4420
4421 static char lower__doc__[] =
4422 "S.lower() -> unicode\n\
4423 \n\
4424 Return a copy of the string S converted to lowercase.";
4425
4426 static PyObject*
4427 unicode_lower(PyUnicodeObject *self)
4428 {
4429     return fixup(self, fixlower);
4430 }
4431
4432 static char lstrip__doc__[] =
4433 "S.lstrip() -> unicode\n\
4434 \n\
4435 Return a copy of the string S with leading whitespace removed.";
4436
4437 static PyObject *
4438 unicode_lstrip(PyUnicodeObject *self)
4439 {
4440     return strip(self, 1, 0);
4441 }
4442
4443 static PyObject*
4444 unicode_repeat(PyUnicodeObject *str, int len)
4445 {
4446     PyUnicodeObject *u;
4447     Py_UNICODE *p;
4448     int nchars;
4449     size_t nbytes;
4450
4451     if (len < 0)
4452         len = 0;
4453
4454     if (len == 1 && PyUnicode_CheckExact(str)) {
4455         /* no repeat, return original string */
4456         Py_INCREF(str);
4457         return (PyObject*) str;
4458     }
4459
4460     /* ensure # of chars needed doesn't overflow int and # of bytes
4461      * needed doesn't overflow size_t
4462      */
4463     nchars = len * str->length;
4464     if (len && nchars / len != str->length) {
4465         PyErr_SetString(PyExc_OverflowError,
4466                         "repeated string is too long");
4467         return NULL;
4468     }
4469     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4470     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4471         PyErr_SetString(PyExc_OverflowError,
4472                         "repeated string is too long");
4473         return NULL;
4474     }
4475     u = _PyUnicode_New(nchars);
4476     if (!u)
4477         return NULL;
4478
4479     p = u->str;
4480
4481     while (len-- > 0) {
4482         Py_UNICODE_COPY(p, str->str, str->length);
4483         p += str->length;
4484     }
4485
4486     return (PyObject*) u;
4487 }
4488
4489 PyObject *PyUnicode_Replace(PyObject *obj,
4490                             PyObject *subobj,
4491                             PyObject *replobj,
4492                             int maxcount)
4493 {
4494     PyObject *self;
4495     PyObject *str1;
4496     PyObject *str2;
4497     PyObject *result;
4498
4499     self = PyUnicode_FromObject(obj);
4500     if (self == NULL)
4501         return NULL;
4502     str1 = PyUnicode_FromObject(subobj);
4503     if (str1 == NULL) {
4504         Py_DECREF(self);
4505         return NULL;
4506     }
4507     str2 = PyUnicode_FromObject(replobj);
4508     if (str2 == NULL) {
4509         Py_DECREF(self);
4510         Py_DECREF(str1);
4511         return NULL;
4512     }
4513     result = replace((PyUnicodeObject *)self,
4514                      (PyUnicodeObject *)str1,
4515                      (PyUnicodeObject *)str2,
4516                      maxcount);
4517     Py_DECREF(self);
4518     Py_DECREF(str1);
4519     Py_DECREF(str2);
4520     return result;
4521 }
4522
4523 static char replace__doc__[] =
4524 "S.replace (old, new[, maxsplit]) -> unicode\n\
4525 \n\
4526 Return a copy of S with all occurrences of substring\n\
4527 old replaced by new.  If the optional argument maxsplit is\n\
4528 given, only the first maxsplit occurrences are replaced.";
4529
4530 static PyObject*
4531 unicode_replace(PyUnicodeObject *self, PyObject *args)
4532 {
4533     PyUnicodeObject *str1;
4534     PyUnicodeObject *str2;
4535     int maxcount = -1;
4536     PyObject *result;
4537
4538     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4539         return NULL;
4540     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4541     if (str1 == NULL)
4542         return NULL;
4543     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4544     if (str2 == NULL)
4545         return NULL;
4546
4547     result = replace(self, str1, str2, maxcount);
4548
4549     Py_DECREF(str1);
4550     Py_DECREF(str2);
4551     return result;
4552 }
4553
4554 static
4555 PyObject *unicode_repr(PyObject *unicode)
4556 {
4557     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4558                                 PyUnicode_GET_SIZE(unicode),
4559                                 1);
4560 }
4561
4562 static char rfind__doc__[] =
4563 "S.rfind(sub [,start [,end]]) -> int\n\
4564 \n\
4565 Return the highest index in S where substring sub is found,\n\
4566 such that sub is contained within s[start,end].  Optional\n\
4567 arguments start and end are interpreted as in slice notation.\n\
4568 \n\
4569 Return -1 on failure.";
4570
4571 static PyObject *
4572 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4573 {
4574     PyUnicodeObject *substring;
4575     int start = 0;
4576     int end = INT_MAX;
4577     PyObject *result;
4578
4579     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4580                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4581         return NULL;
4582     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4583                                                 (PyObject *)substring);
4584     if (substring == NULL)
4585         return NULL;
4586
4587     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4588
4589     Py_DECREF(substring);
4590     return result;
4591 }
4592
4593 static char rindex__doc__[] =
4594 "S.rindex(sub [,start [,end]]) -> int\n\
4595 \n\
4596 Like S.rfind() but raise ValueError when the substring is not found.";
4597
4598 static PyObject *
4599 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4600 {
4601     int result;
4602     PyUnicodeObject *substring;
4603     int start = 0;
4604     int end = INT_MAX;
4605
4606     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4607                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4608         return NULL;
4609     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4610                                                 (PyObject *)substring);
4611     if (substring == NULL)
4612         return NULL;
4613
4614     result = findstring(self, substring, start, end, -1);
4615
4616     Py_DECREF(substring);
4617     if (result < 0) {
4618         PyErr_SetString(PyExc_ValueError, "substring not found");
4619         return NULL;
4620     }
4621     return PyInt_FromLong(result);
4622 }
4623
4624 static char rjust__doc__[] =
4625 "S.rjust(width) -> unicode\n\
4626 \n\
4627 Return S right justified in a Unicode string of length width. Padding is\n\
4628 done using spaces.";
4629
4630 static PyObject *
4631 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4632 {
4633     int width;
4634     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4635         return NULL;
4636
4637     if (self->length >= width && PyUnicode_CheckExact(self)) {
4638         Py_INCREF(self);
4639         return (PyObject*) self;
4640     }
4641
4642     return (PyObject*) pad(self, width - self->length, 0, ' ');
4643 }
4644
4645 static char rstrip__doc__[] =
4646 "S.rstrip() -> unicode\n\
4647 \n\
4648 Return a copy of the string S with trailing whitespace removed.";
4649
4650 static PyObject *
4651 unicode_rstrip(PyUnicodeObject *self)
4652 {
4653     return strip(self, 0, 1);
4654 }
4655
4656 static PyObject*
4657 unicode_slice(PyUnicodeObject *self, int start, int end)
4658 {
4659     /* standard clamping */
4660     if (start < 0)
4661         start = 0;
4662     if (end < 0)
4663         end = 0;
4664     if (end > self->length)
4665         end = self->length;
4666     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4667         /* full slice, return original string */
4668         Py_INCREF(self);
4669         return (PyObject*) self;
4670     }
4671     if (start > end)
4672         start = end;
4673     /* copy slice */
4674     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4675                                              end - start);
4676 }
4677
4678 PyObject *PyUnicode_Split(PyObject *s,
4679                           PyObject *sep,
4680                           int maxsplit)
4681 {
4682     PyObject *result;
4683
4684     s = PyUnicode_FromObject(s);
4685     if (s == NULL)
4686         return NULL;
4687     if (sep != NULL) {
4688         sep = PyUnicode_FromObject(sep);
4689         if (sep == NULL) {
4690             Py_DECREF(s);
4691             return NULL;
4692         }
4693     }
4694
4695     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4696
4697     Py_DECREF(s);
4698     Py_XDECREF(sep);
4699     return result;
4700 }
4701
4702 static char split__doc__[] =
4703 "S.split([sep [,maxsplit]]) -> list of strings\n\
4704 \n\
4705 Return a list of the words in S, using sep as the\n\
4706 delimiter string.  If maxsplit is given, at most maxsplit\n\
4707 splits are done. If sep is not specified, any whitespace string\n\
4708 is a separator.";
4709
4710 static PyObject*
4711 unicode_split(PyUnicodeObject *self, PyObject *args)
4712 {
4713     PyObject *substring = Py_None;
4714     int maxcount = -1;
4715
4716     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4717         return NULL;
4718
4719     if (substring == Py_None)
4720         return split(self, NULL, maxcount);
4721     else if (PyUnicode_Check(substring))
4722         return split(self, (PyUnicodeObject *)substring, maxcount);
4723     else
4724         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4725 }
4726
4727 static char splitlines__doc__[] =
4728 "S.splitlines([keepends]]) -> list of strings\n\
4729 \n\
4730 Return a list of the lines in S, breaking at line boundaries.\n\
4731 Line breaks are not included in the resulting list unless keepends\n\
4732 is given and true.";
4733
4734 static PyObject*
4735 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4736 {
4737     int keepends = 0;
4738
4739     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4740         return NULL;
4741
4742     return PyUnicode_Splitlines((PyObject *)self, keepends);
4743 }
4744
4745 static
4746 PyObject *unicode_str(PyUnicodeObject *self)
4747 {
4748     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4749 }
4750
4751 static char strip__doc__[] =
4752 "S.strip() -> unicode\n\
4753 \n\
4754 Return a copy of S with leading and trailing whitespace removed.";
4755
4756 static PyObject *
4757 unicode_strip(PyUnicodeObject *self)
4758 {
4759     return strip(self, 1, 1);
4760 }
4761
4762 static char swapcase__doc__[] =
4763 "S.swapcase() -> unicode\n\
4764 \n\
4765 Return a copy of S with uppercase characters converted to lowercase\n\
4766 and vice versa.";
4767
4768 static PyObject*
4769 unicode_swapcase(PyUnicodeObject *self)
4770 {
4771     return fixup(self, fixswapcase);
4772 }
4773
4774 static char translate__doc__[] =
4775 "S.translate(table) -> unicode\n\
4776 \n\
4777 Return a copy of the string S, where all characters have been mapped\n\
4778 through the given translation table, which must be a mapping of\n\
4779 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4780 are left untouched. Characters mapped to None are deleted.";
4781
4782 static PyObject*
4783 unicode_translate(PyUnicodeObject *self, PyObject *table)
4784 {
4785     return PyUnicode_TranslateCharmap(self->str,
4786                                       self->length,
4787                                       table,
4788                                       "ignore");
4789 }
4790
4791 static char upper__doc__[] =
4792 "S.upper() -> unicode\n\
4793 \n\
4794 Return a copy of S converted to uppercase.";
4795
4796 static PyObject*
4797 unicode_upper(PyUnicodeObject *self)
4798 {
4799     return fixup(self, fixupper);
4800 }
4801
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of S.zfill(width) (currently compiled out).

   Strings already at least width long are returned unchanged with a
   new reference.  Otherwise pad() prepends width - length '0'
   characters; if the original text started with '+' or '-', the sign is
   moved in front of the zeros.  Returns NULL with an exception set if
   argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() can fail; do not touch u->str in that case */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4837
#if 0
/* Debugging aid (compiled out): return the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long entries = unicode_freelist_size;

    return PyInt_FromLong(entries);
}
#endif
4845
4846 static char startswith__doc__[] =
4847 "S.startswith(prefix[, start[, end]]) -> int\n\
4848 \n\
4849 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4850 optional start, test S beginning at that position.  With optional end, stop\n\
4851 comparing S at that position.";
4852
4853 static PyObject *
4854 unicode_startswith(PyUnicodeObject *self,
4855                    PyObject *args)
4856 {
4857     PyUnicodeObject *substring;
4858     int start = 0;
4859     int end = INT_MAX;
4860     PyObject *result;
4861
4862     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4863                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4864         return NULL;
4865     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4866                                                 (PyObject *)substring);
4867     if (substring == NULL)
4868         return NULL;
4869
4870     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4871
4872     Py_DECREF(substring);
4873     return result;
4874 }
4875
4876
4877 static char endswith__doc__[] =
4878 "S.endswith(suffix[, start[, end]]) -> int\n\
4879 \n\
4880 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4881 optional start, test S beginning at that position.  With optional end, stop\n\
4882 comparing S at that position.";
4883
4884 static PyObject *
4885 unicode_endswith(PyUnicodeObject *self,
4886                  PyObject *args)
4887 {
4888     PyUnicodeObject *substring;
4889     int start = 0;
4890     int end = INT_MAX;
4891     PyObject *result;
4892
4893     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4894                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4895         return NULL;
4896     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4897                                                 (PyObject *)substring);
4898     if (substring == NULL)
4899         return NULL;
4900
4901     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4902
4903     Py_DECREF(substring);
4904     return result;
4905 }
4906
4907
4908 static PyMethodDef unicode_methods[] = {
4909
4910     /* Order is according to common usage: often used methods should
4911        appear first, since lookup is done sequentially. */
4912
4913     {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4914     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4915     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4916     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4917     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4918     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4919     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4920     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4921     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4922     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4923     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4924     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4925     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4926     {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4927 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4928     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4929     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4930     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4931     {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4932     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4933     {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4934     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4935     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4936     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4937     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4938     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4939     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4940     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4941     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4942     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4943     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4944     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4945     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4946     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4947     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
4948 #if 0
4949     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4950     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
4951 #endif
4952
4953 #if 0
4954     /* This one is just used for debugging the implementation. */
4955     {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
4956 #endif
4957
4958     {NULL, NULL}
4959 };
4960
4961 static PySequenceMethods unicode_as_sequence = {
4962     (inquiry) unicode_length,           /* sq_length */
4963     (binaryfunc) PyUnicode_Concat,      /* sq_concat */
4964     (intargfunc) unicode_repeat,        /* sq_repeat */
4965     (intargfunc) unicode_getitem,       /* sq_item */
4966     (intintargfunc) unicode_slice,      /* sq_slice */
4967     0,                                  /* sq_ass_item */
4968     0,                                  /* sq_ass_slice */
4969     (objobjproc)PyUnicode_Contains,     /*sq_contains*/
4970 };
4971
4972 static int
4973 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4974                           int index,
4975                           const void **ptr)
4976 {
4977     if (index != 0) {
4978         PyErr_SetString(PyExc_SystemError,
4979                         "accessing non-existent unicode segment");
4980         return -1;
4981     }
4982     *ptr = (void *) self->str;
4983     return PyUnicode_GET_DATA_SIZE(self);
4984 }
4985
4986 static int
4987 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4988                            const void **ptr)
4989 {
4990     PyErr_SetString(PyExc_TypeError,
4991                     "cannot use unicode as modifyable buffer");
4992     return -1;
4993 }
4994
4995 static int
4996 unicode_buffer_getsegcount(PyUnicodeObject *self,
4997                            int *lenp)
4998 {
4999     if (lenp)
5000         *lenp = PyUnicode_GET_DATA_SIZE(self);
5001     return 1;
5002 }
5003
5004 static int
5005 unicode_buffer_getcharbuf(PyUnicodeObject *self,
5006                           int index,
5007                           const void **ptr)
5008 {
5009     PyObject *str;
5010
5011     if (index != 0) {
5012         PyErr_SetString(PyExc_SystemError,
5013                         "accessing non-existent unicode segment");
5014         return -1;
5015     }
5016     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5017     if (str == NULL)
5018         return -1;
5019     *ptr = (void *) PyString_AS_STRING(str);
5020     return PyString_GET_SIZE(str);
5021 }
5022
5023 /* Helpers for PyUnicode_Format() */
5024
5025 static PyObject *
5026 getnextarg(PyObject *args, int arglen, int *p_argidx)
5027 {
5028     int argidx = *p_argidx;
5029     if (argidx < arglen) {
5030         (*p_argidx)++;
5031         if (arglen < 0)
5032             return args;
5033         else
5034             return PyTuple_GetItem(args, argidx);
5035     }
5036     PyErr_SetString(PyExc_TypeError,
5037                     "not enough arguments for format string");
5038     return NULL;
5039 }
5040
/* Flag bits collected while parsing a '%' conversion specifier. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
5046
5047 static
5048 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5049 {
5050     register int i;
5051     int len;
5052     va_list va;
5053     char *charbuffer;
5054     va_start(va, format);
5055
5056     /* First, format the string as char array, then expand to Py_UNICODE
5057        array. */
5058     charbuffer = (char *)buffer;
5059     len = vsprintf(charbuffer, format, va);
5060     for (i = len - 1; i >= 0; i--)
5061         buffer[i] = (Py_UNICODE) charbuffer[i];
5062
5063     va_end(va);
5064     return len;
5065 }
5066
5067 static int
5068 formatfloat(Py_UNICODE *buf,
5069             size_t buflen,
5070             int flags,
5071             int prec,
5072             int type,
5073             PyObject *v)
5074 {
5075     /* fmt = '%#.' + `prec` + `type`
5076        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5077     char fmt[20];
5078     double x;
5079
5080     x = PyFloat_AsDouble(v);
5081     if (x == -1.0 && PyErr_Occurred())
5082         return -1;
5083     if (prec < 0)
5084         prec = 6;
5085     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5086         type = 'g';
5087     sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
5088     /* worst case length calc to ensure no buffer overrun:
5089          fmt = %#.<prec>g
5090          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5091             for any double rep.)
5092          len = 1 + prec + 1 + 2 + 5 = 9 + prec
5093        If prec=0 the effective precision is 1 (the leading digit is
5094        always given), therefore increase by one to 10+prec. */
5095     if (buflen <= (size_t)10 + (size_t)prec) {
5096         PyErr_SetString(PyExc_OverflowError,
5097             "formatted float is too long (precision too long?)");
5098         return -1;
5099     }
5100     return usprintf(buf, fmt, x);
5101 }
5102
5103 static PyObject*
5104 formatlong(PyObject *val, int flags, int prec, int type)
5105 {
5106         char *buf;
5107         int i, len;
5108         PyObject *str; /* temporary string object. */
5109         PyUnicodeObject *result;
5110
5111         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5112         if (!str)
5113                 return NULL;
5114         result = _PyUnicode_New(len);
5115         for (i = 0; i < len; i++)
5116                 result->str[i] = buf[i];
5117         result->str[len] = 0;
5118         Py_DECREF(str);
5119         return (PyObject*)result;
5120 }
5121
5122 static int
5123 formatint(Py_UNICODE *buf,
5124           size_t buflen,
5125           int flags,
5126           int prec,
5127           int type,
5128           PyObject *v)
5129 {
5130     /* fmt = '%#.' + `prec` + 'l' + `type`
5131        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5132        + 1 + 1 = 24*/
5133     char fmt[64]; /* plenty big enough! */
5134     long x;
5135     int use_native_c_format = 1;
5136
5137     x = PyInt_AsLong(v);
5138     if (x == -1 && PyErr_Occurred())
5139         return -1;
5140     if (prec < 0)
5141         prec = 1;
5142     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5143        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5144     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5145         PyErr_SetString(PyExc_OverflowError,
5146             "formatted integer is too long (precision too long?)");
5147         return -1;
5148     }
5149     /* When converting 0 under %#x or %#X, C leaves off the base marker,
5150      * but we want it (for consistency with other %#x conversions, and
5151      * for consistency with Python's hex() function).
5152      * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
5153      * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5154      * So add it only if the platform doesn't already.
5155      */
5156     if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5157         /* Only way to know what the platform does is to try it. */
5158         sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5159         if (fmt[1] != (char)type) {
5160             /* Supply our own leading 0x/0X -- needed under std C */
5161             use_native_c_format = 0;
5162             sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5163         }
5164     }
5165     if (use_native_c_format)
5166          sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
5167     return usprintf(buf, fmt, x);
5168 }
5169
5170 static int
5171 formatchar(Py_UNICODE *buf,
5172            size_t buflen,
5173            PyObject *v)
5174 {
5175     /* presume that the buffer is at least 2 characters long */
5176     if (PyUnicode_Check(v)) {
5177         if (PyUnicode_GET_SIZE(v) != 1)
5178             goto onError;
5179         buf[0] = PyUnicode_AS_UNICODE(v)[0];
5180     }
5181
5182     else if (PyString_Check(v)) {
5183         if (PyString_GET_SIZE(v) != 1)
5184             goto onError;
5185         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5186     }
5187
5188     else {
5189         /* Integer input truncated to a character */
5190         long x;
5191         x = PyInt_AsLong(v);
5192         if (x == -1 && PyErr_Occurred())
5193             goto onError;
5194         buf[0] = (char) x;
5195     }
5196     buf[1] = '\0';
5197     return 1;
5198
5199  onError:
5200     PyErr_SetString(PyExc_TypeError,
5201                     "%c requires int or char");
5202     return -1;
5203 }
5204
5205 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5206
5207    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5208    chars are formatted. XXX This is a magic number. Each formatting
5209    routine does bounds checking to ensure no overflow, but a better
5210    solution may be to malloc a buffer of appropriate size for each
5211    format. For now, the current solution is sufficient.
5212 */
5213 #define FORMATBUFLEN (size_t)120
5214
5215 PyObject *PyUnicode_Format(PyObject *format,
5216                            PyObject *args)
5217 {
5218     Py_UNICODE *fmt, *res;
5219     int fmtcnt, rescnt, reslen, arglen, argidx;
5220     int args_owned = 0;
5221     PyUnicodeObject *result = NULL;
5222     PyObject *dict = NULL;
5223     PyObject *uformat;
5224
5225     if (format == NULL || args == NULL) {
5226         PyErr_BadInternalCall();
5227         return NULL;
5228     }
5229     uformat = PyUnicode_FromObject(format);
5230     if (uformat == NULL)
5231         return NULL;
5232     fmt = PyUnicode_AS_UNICODE(uformat);
5233     fmtcnt = PyUnicode_GET_SIZE(uformat);
5234
5235     reslen = rescnt = fmtcnt + 100;
5236     result = _PyUnicode_New(reslen);
5237     if (result == NULL)
5238         goto onError;
5239     res = PyUnicode_AS_UNICODE(result);
5240
5241     if (PyTuple_Check(args)) {
5242         arglen = PyTuple_Size(args);
5243         argidx = 0;
5244     }
5245     else {
5246         arglen = -1;
5247         argidx = -2;
5248     }
5249     if (args->ob_type->tp_as_mapping)
5250         dict = args;
5251
5252     while (--fmtcnt >= 0) {
5253         if (*fmt != '%') {
5254             if (--rescnt < 0) {
5255                 rescnt = fmtcnt + 100;
5256                 reslen += rescnt;
5257                 if (_PyUnicode_Resize(&result, reslen) < 0)
5258                     return NULL;
5259                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5260                 --rescnt;
5261             }
5262             *res++ = *fmt++;
5263         }
5264         else {
5265             /* Got a format specifier */
5266             int flags = 0;
5267             int width = -1;
5268             int prec = -1;
5269             Py_UNICODE c = '\0';
5270             Py_UNICODE fill;
5271             PyObject *v = NULL;
5272             PyObject *temp = NULL;
5273             Py_UNICODE *pbuf;
5274             Py_UNICODE sign;
5275             int len;
5276             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5277
5278             fmt++;
5279             if (*fmt == '(') {
5280                 Py_UNICODE *keystart;
5281                 int keylen;
5282                 PyObject *key;
5283                 int pcount = 1;
5284
5285                 if (dict == NULL) {
5286                     PyErr_SetString(PyExc_TypeError,
5287                                     "format requires a mapping");
5288                     goto onError;
5289                 }
5290                 ++fmt;
5291                 --fmtcnt;
5292                 keystart = fmt;
5293                 /* Skip over balanced parentheses */
5294                 while (pcount > 0 && --fmtcnt >= 0) {
5295                     if (*fmt == ')')
5296                         --pcount;
5297                     else if (*fmt == '(')
5298                         ++pcount;
5299                     fmt++;
5300                 }
5301                 keylen = fmt - keystart - 1;
5302                 if (fmtcnt < 0 || pcount > 0) {
5303                     PyErr_SetString(PyExc_ValueError,
5304                                     "incomplete format key");
5305                     goto onError;
5306                 }
5307                 /* keys are converted to strings using UTF-8 and
5308                    then looked up since Python uses strings to hold
5309                    variables names etc. in its namespaces and we
5310                    wouldn't want to break common idioms. */
5311                 key = PyUnicode_EncodeUTF8(keystart,
5312                                            keylen,
5313                                            NULL);
5314                 if (key == NULL)
5315                     goto onError;
5316                 if (args_owned) {
5317                     Py_DECREF(args);
5318                     args_owned = 0;
5319                 }
5320                 args = PyObject_GetItem(dict, key);
5321                 Py_DECREF(key);
5322                 if (args == NULL) {
5323                     goto onError;
5324                 }
5325                 args_owned = 1;
5326                 arglen = -1;
5327                 argidx = -2;
5328             }
5329             while (--fmtcnt >= 0) {
5330                 switch (c = *fmt++) {
5331                 case '-': flags |= F_LJUST; continue;
5332                 case '+': flags |= F_SIGN; continue;
5333                 case ' ': flags |= F_BLANK; continue;
5334                 case '#': flags |= F_ALT; continue;
5335                 case '0': flags |= F_ZERO; continue;
5336                 }
5337                 break;
5338             }
5339             if (c == '*') {
5340                 v = getnextarg(args, arglen, &argidx);
5341                 if (v == NULL)
5342                     goto onError;
5343                 if (!PyInt_Check(v)) {
5344                     PyErr_SetString(PyExc_TypeError,
5345                                     "* wants int");
5346                     goto onError;
5347                 }
5348                 width = PyInt_AsLong(v);
5349                 if (width < 0) {
5350                     flags |= F_LJUST;
5351                     width = -width;
5352                 }
5353                 if (--fmtcnt >= 0)
5354                     c = *fmt++;
5355             }
5356             else if (c >= '0' && c <= '9') {
5357                 width = c - '0';
5358                 while (--fmtcnt >= 0) {
5359                     c = *fmt++;
5360                     if (c < '0' || c > '9')
5361                         break;
5362                     if ((width*10) / 10 != width) {
5363                         PyErr_SetString(PyExc_ValueError,
5364                                         "width too big");
5365                         goto onError;
5366                     }
5367                     width = width*10 + (c - '0');
5368                 }
5369             }
5370             if (c == '.') {
5371                 prec = 0;
5372                 if (--fmtcnt >= 0)
5373                     c = *fmt++;
5374                 if (c == '*') {
5375                     v = getnextarg(args, arglen, &argidx);
5376                     if (v == NULL)
5377                         goto onError;
5378                     if (!PyInt_Check(v)) {
5379                         PyErr_SetString(PyExc_TypeError,
5380                                         "* wants int");
5381                         goto onError;
5382                     }
5383                     prec = PyInt_AsLong(v);
5384                     if (prec < 0)
5385                         prec = 0;
5386                     if (--fmtcnt >= 0)
5387                         c = *fmt++;
5388                 }
5389                 else if (c >= '0' && c <= '9') {
5390                     prec = c - '0';
5391                     while (--fmtcnt >= 0) {
5392                         c = Py_CHARMASK(*fmt++);
5393                         if (c < '0' || c > '9')
5394                             break;
5395                         if ((prec*10) / 10 != prec) {
5396                             PyErr_SetString(PyExc_ValueError,
5397                                             "prec too big");
5398                             goto onError;
5399                         }
5400                         prec = prec*10 + (c - '0');
5401                     }
5402                 }
5403             } /* prec */
5404             if (fmtcnt >= 0) {
5405                 if (c == 'h' || c == 'l' || c == 'L') {
5406                     if (--fmtcnt >= 0)
5407                         c = *fmt++;
5408                 }
5409             }
5410             if (fmtcnt < 0) {
5411                 PyErr_SetString(PyExc_ValueError,
5412                                 "incomplete format");
5413                 goto onError;
5414             }
5415             if (c != '%') {
5416                 v = getnextarg(args, arglen, &argidx);
5417                 if (v == NULL)
5418                     goto onError;
5419             }
5420             sign = 0;
5421             fill = ' ';
5422             switch (c) {
5423
5424             case '%':
5425                 pbuf = formatbuf;
5426                 /* presume that buffer length is at least 1 */
5427                 pbuf[0] = '%';
5428                 len = 1;
5429                 break;
5430
5431             case 's':
5432             case 'r':
5433                 if (PyUnicode_Check(v) && c == 's') {
5434                     temp = v;
5435                     Py_INCREF(temp);
5436                 }
5437                 else {
5438                     PyObject *unicode;
5439                     if (c == 's')
5440                         temp = PyObject_Str(v);
5441                     else
5442                         temp = PyObject_Repr(v);
5443                     if (temp == NULL)
5444                         goto onError;
5445                     if (!PyString_Check(temp)) {
5446                         /* XXX Note: this should never happen, since
5447                                PyObject_Repr() and PyObject_Str() assure
5448                                this */
5449                         Py_DECREF(temp);
5450                         PyErr_SetString(PyExc_TypeError,
5451                                         "%s argument has non-string str()");
5452                         goto onError;
5453                     }
5454                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5455                                                    PyString_GET_SIZE(temp),
5456                                                NULL,
5457                                                    "strict");
5458                     Py_DECREF(temp);
5459                     temp = unicode;
5460                     if (temp == NULL)
5461                         goto onError;
5462                 }
5463                 pbuf = PyUnicode_AS_UNICODE(temp);
5464                 len = PyUnicode_GET_SIZE(temp);
5465                 if (prec >= 0 && len > prec)
5466                     len = prec;
5467                 break;
5468
5469             case 'i':
5470             case 'd':
5471             case 'u':
5472             case 'o':
5473             case 'x':
5474             case 'X':
5475                 if (c == 'i')
5476                     c = 'd';
5477                 if (PyLong_Check(v)) {
5478                     temp = formatlong(v, flags, prec, c);
5479                     if (!temp)
5480                         goto onError;
5481                     pbuf = PyUnicode_AS_UNICODE(temp);
5482                     len = PyUnicode_GET_SIZE(temp);
5483                     /* unbounded ints can always produce
5484                        a sign character! */
5485                     sign = 1;
5486                 }
5487                 else {
5488                     pbuf = formatbuf;
5489                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5490                                     flags, prec, c, v);
5491                     if (len < 0)
5492                         goto onError;
5493                     /* only d conversion is signed */
5494                     sign = c == 'd';
5495                 }
5496                 if (flags & F_ZERO)
5497                     fill = '0';
5498                 break;
5499
5500             case 'e':
5501             case 'E':
5502             case 'f':
5503             case 'g':
5504             case 'G':
5505                 pbuf = formatbuf;
5506                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5507                         flags, prec, c, v);
5508                 if (len < 0)
5509                     goto onError;
5510                 sign = 1;
5511                 if (flags & F_ZERO)
5512                     fill = '0';
5513                 break;
5514
5515             case 'c':
5516                 pbuf = formatbuf;
5517                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5518                 if (len < 0)
5519                     goto onError;
5520                 break;
5521
5522             default:
5523                 PyErr_Format(PyExc_ValueError,
5524                              "unsupported format character '%c' (0x%x) "
5525                              "at index %i",
5526                              (31<=c && c<=126) ? c : '?',
5527                              c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5528                 goto onError;
5529             }
5530             if (sign) {
5531                 if (*pbuf == '-' || *pbuf == '+') {
5532                     sign = *pbuf++;
5533                     len--;
5534                 }
5535                 else if (flags & F_SIGN)
5536                     sign = '+';
5537                 else if (flags & F_BLANK)
5538                     sign = ' ';
5539                 else
5540                     sign = 0;
5541             }
5542             if (width < len)
5543                 width = len;
5544             if (rescnt < width + (sign != 0)) {
5545                 reslen -= rescnt;
5546                 rescnt = width + fmtcnt + 100;
5547                 reslen += rescnt;
5548                 if (_PyUnicode_Resize(&result, reslen) < 0)
5549                     return NULL;
5550                 res = PyUnicode_AS_UNICODE(result)
5551                     + reslen - rescnt;
5552             }
5553             if (sign) {
5554                 if (fill != ' ')
5555                     *res++ = sign;
5556                 rescnt--;
5557                 if (width > len)
5558                     width--;
5559             }
5560             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5561                 assert(pbuf[0] == '0');
5562                 assert(pbuf[1] == c);
5563                 if (fill != ' ') {
5564                     *res++ = *pbuf++;
5565                     *res++ = *pbuf++;
5566                 }
5567                 rescnt -= 2;
5568                 width -= 2;
5569                 if (width < 0)
5570                     width = 0;
5571                 len -= 2;
5572             }
5573             if (width > len && !(flags & F_LJUST)) {
5574                 do {
5575                     --rescnt;
5576                     *res++ = fill;
5577                 } while (--width > len);
5578             }
5579             if (fill == ' ') {
5580                 if (sign)
5581                     *res++ = sign;
5582                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5583                     assert(pbuf[0] == '0');
5584                     assert(pbuf[1] == c);
5585                     *res++ = *pbuf++;
5586                     *res++ = *pbuf++;
5587                 }
5588             }
5589             Py_UNICODE_COPY(res, pbuf, len);
5590             res += len;
5591             rescnt -= len;
5592             while (--width >= len) {
5593                 --rescnt;
5594                 *res++ = ' ';
5595             }
5596             if (dict && (argidx < arglen) && c != '%') {
5597                 PyErr_SetString(PyExc_TypeError,
5598                                 "not all arguments converted");
5599                 goto onError;
5600             }
5601             Py_XDECREF(temp);
5602         } /* '%' */
5603     } /* until end */
5604     if (argidx < arglen && !dict) {
5605         PyErr_SetString(PyExc_TypeError,
5606                         "not all arguments converted");
5607         goto onError;
5608     }
5609
5610     if (args_owned) {
5611         Py_DECREF(args);
5612     }
5613     Py_DECREF(uformat);
5614     if (_PyUnicode_Resize(&result, reslen - rescnt))
5615         goto onError;
5616     return (PyObject *)result;
5617
5618  onError:
5619     Py_XDECREF(result);
5620     Py_DECREF(uformat);
5621     if (args_owned) {
5622         Py_DECREF(args);
5623     }
5624     return NULL;
5625 }
5626
5627 static PyBufferProcs unicode_as_buffer = {
5628     (getreadbufferproc) unicode_buffer_getreadbuf,
5629     (getwritebufferproc) unicode_buffer_getwritebuf,
5630     (getsegcountproc) unicode_buffer_getsegcount,
5631     (getcharbufferproc) unicode_buffer_getcharbuf,
5632 };
5633
5634 staticforward PyObject *
5635 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5636
5637 static PyObject *
5638 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5639 {
5640         PyObject *x = NULL;
5641         static char *kwlist[] = {"string", "encoding", "errors", 0};
5642         char *encoding = NULL;
5643         char *errors = NULL;
5644
5645         if (type != &PyUnicode_Type)
5646                 return unicode_subtype_new(type, args, kwds);
5647         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5648                                           kwlist, &x, &encoding, &errors))
5649             return NULL;
5650         if (x == NULL)
5651                 return (PyObject *)_PyUnicode_New(0);
5652         return PyUnicode_FromEncodedObject(x, encoding, errors);
5653 }
5654
5655 static PyObject *
5656 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5657 {
5658         PyUnicodeObject *tmp, *pnew;
5659         int n;
5660
5661         assert(PyType_IsSubtype(type, &PyUnicode_Type));
5662         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5663         if (tmp == NULL)
5664                 return NULL;
5665         assert(PyUnicode_Check(tmp));
5666         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5667         if (pnew == NULL)
5668                 return NULL;
5669         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5670         if (pnew->str == NULL) {
5671                 _Py_ForgetReference((PyObject *)pnew);
5672                 PyObject_DEL(pnew);
5673                 return NULL;
5674         }
5675         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5676         pnew->length = n;
5677         pnew->hash = tmp->hash;
5678         Py_DECREF(tmp);
5679         return (PyObject *)pnew;
5680 }
5681
/* Docstring of the unicode type (used as tp_doc of PyUnicode_Type). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5688
5689 PyTypeObject PyUnicode_Type = {
5690     PyObject_HEAD_INIT(&PyType_Type)
5691     0,                                  /* ob_size */
5692     "unicode",                          /* tp_name */
5693     sizeof(PyUnicodeObject),            /* tp_size */
5694     0,                                  /* tp_itemsize */
5695     /* Slots */
5696     (destructor)_PyUnicode_Free,        /* tp_dealloc */
5697     0,                                  /* tp_print */
5698     0,                                  /* tp_getattr */
5699     0,                                  /* tp_setattr */
5700     (cmpfunc) unicode_compare,          /* tp_compare */
5701     (reprfunc) unicode_repr,            /* tp_repr */
5702     0,                                  /* tp_as_number */
5703     &unicode_as_sequence,               /* tp_as_sequence */
5704     0,                                  /* tp_as_mapping */
5705     (hashfunc) unicode_hash,            /* tp_hash*/
5706     0,                                  /* tp_call*/
5707     (reprfunc) unicode_str,             /* tp_str */
5708     PyObject_GenericGetAttr,            /* tp_getattro */
5709     0,                                  /* tp_setattro */
5710     &unicode_as_buffer,                 /* tp_as_buffer */
5711     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5712     unicode_doc,                        /* tp_doc */
5713     0,                                  /* tp_traverse */
5714     0,                                  /* tp_clear */
5715     0,                                  /* tp_richcompare */
5716     0,                                  /* tp_weaklistoffset */
5717     0,                                  /* tp_iter */
5718     0,                                  /* tp_iternext */
5719     unicode_methods,                    /* tp_methods */
5720     0,                                  /* tp_members */
5721     0,                                  /* tp_getset */
5722     0,                                  /* tp_base */
5723     0,                                  /* tp_dict */
5724     0,                                  /* tp_descr_get */
5725     0,                                  /* tp_descr_set */
5726     0,                                  /* tp_dictoffset */
5727     0,                                  /* tp_init */
5728     0,                                  /* tp_alloc */
5729     unicode_new,                        /* tp_new */
5730 };
5731
5732 /* Initialize the Unicode implementation */
5733
5734 void _PyUnicode_Init(void)
5735 {
5736     int i;
5737
5738     /* Init the implementation */
5739     unicode_freelist = NULL;
5740     unicode_freelist_size = 0;
5741     unicode_empty = _PyUnicode_New(0);
5742     strcpy(unicode_default_encoding, "ascii");
5743     for (i = 0; i < 256; i++)
5744         unicode_latin1[i] = NULL;
5745 }
5746
5747 /* Finalize the Unicode implementation */
5748
5749 void
5750 _PyUnicode_Fini(void)
5751 {
5752     PyUnicodeObject *u;
5753     int i;
5754
5755     Py_XDECREF(unicode_empty);
5756     unicode_empty = NULL;
5757
5758     for (i = 0; i < 256; i++) {
5759         if (unicode_latin1[i]) {
5760             Py_DECREF(unicode_latin1[i]);
5761             unicode_latin1[i] = NULL;
5762         }
5763     }
5764
5765     for (u = unicode_freelist; u != NULL;) {
5766         PyUnicodeObject *v = u;
5767         u = *(PyUnicodeObject **)u;
5768         if (v->str)
5769             PyMem_DEL(v->str);
5770         Py_XDECREF(v->defenc);
5771         PyObject_DEL(v);
5772     }
5773     unicode_freelist = NULL;
5774     unicode_freelist_size = 0;
5775 }