Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WIN32
  45 #include <windows.h>
  46 #endif
  47
  48 /* Limit for the Unicode object free list */
  49
  50 #define MAX_UNICODE_FREELIST_SIZE       1024
  51
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation keeps the allocated Unicode buffer intact for
   every object on the free list whose length is below this limit.
   This reduces malloc() overhead for small Unicode objects.

   At worst this results in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/
  68
  69 #define KEEPALIVE_SIZE_LIMIT       9
  70
  71 /* Endianness switches; defaults to little endian */
  72
  73 #ifdef WORDS_BIGENDIAN
  74 # define BYTEORDER_IS_BIG_ENDIAN
  75 #else
  76 # define BYTEORDER_IS_LITTLE_ENDIAN
  77 #endif
  78
  79 /* --- Globals ------------------------------------------------------------
  80
  81    The globals are initialized by the _PyUnicode_Init() API and should
  82    not be used before calling that API.
  83
  84 */
  85
  86 /* Free list for Unicode objects */
  87 static PyUnicodeObject *unicode_freelist;
  88 static int unicode_freelist_size;
  89
  90 /* The empty Unicode object is shared to improve performance. */
  91 static PyUnicodeObject *unicode_empty;
  92
  93 /* Single character Unicode strings in the Latin-1 range are being
  94    shared as well. */
  95 static PyUnicodeObject *unicode_latin1[256];
  96
  97 /* Default encoding to use and assume when NULL is passed as encoding
  98    parameter; it is initialized by _PyUnicode_Init().
  99
 100    Always use the PyUnicode_SetDefaultEncoding() and
 101    PyUnicode_GetDefaultEncoding() APIs to access this global.
 102
 103 */
 104 static char unicode_default_encoding[100];
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax(void)
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          unicode->str[0] < 256 &&
 136          unicode_latin1[unicode->str[0]] == unicode)) {
 137         PyErr_SetString(PyExc_SystemError,
 138                         "can't resize shared unicode objects");
 139         return -1;
 140     }
 141
 142     /* We allocate one more byte to make sure the string is
 143        Ux0000 terminated -- XXX is this needed ? */
 144     oldstr = unicode->str;
 145     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 146     if (!unicode->str) {
 147         unicode->str = oldstr;
 148         PyErr_NoMemory();
 149         return -1;
 150     }
 151     unicode->str[length] = 0;
 152     unicode->length = length;
 153
 154  reset:
 155     /* Reset the object caches */
 156     if (unicode->defenc) {
 157         Py_DECREF(unicode->defenc);
 158         unicode->defenc = NULL;
 159     }
 160     unicode->hash = -1;
 161
 162     return 0;
 163 }
 164
 165 /* We allocate one more byte to make sure the string is
 166    Ux0000 terminated -- XXX is this needed ?
 167
 168    XXX This allocator could further be enhanced by assuring that the
 169        free list never reduces its size below 1.
 170
 171 */
 172
 173 static
 174 PyUnicodeObject *_PyUnicode_New(int length)
 175 {
 176     register PyUnicodeObject *unicode;
 177
 178     /* Optimization for empty strings */
 179     if (length == 0 && unicode_empty != NULL) {
 180         Py_INCREF(unicode_empty);
 181         return unicode_empty;
 182     }
 183
 184     /* Unicode freelist & memory allocation */
 185     if (unicode_freelist) {
 186         unicode = unicode_freelist;
 187         unicode_freelist = *(PyUnicodeObject **)unicode;
 188         unicode_freelist_size--;
 189         if (unicode->str) {
 190             /* Keep-Alive optimization: we only upsize the buffer,
 191                never downsize it. */
 192             if ((unicode->length < length) &&
 193                 unicode_resize(unicode, length)) {
 194                 PyMem_DEL(unicode->str);
 195                 goto onError;
 196             }
 197         }
 198         else {
 199             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 200         }
 201         PyObject_INIT(unicode, &PyUnicode_Type);
 202     }
 203     else {
 204         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 205         if (unicode == NULL)
 206             return NULL;
 207         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 208     }
 209
 210     if (!unicode->str) {
 211         PyErr_NoMemory();
 212         goto onError;
 213     }
 214     unicode->str[length] = 0;
 215     unicode->length = length;
 216     unicode->hash = -1;
 217     unicode->defenc = NULL;
 218     return unicode;
 219
 220  onError:
 221     _Py_ForgetReference((PyObject *)unicode);
 222     PyObject_DEL(unicode);
 223     return NULL;
 224 }
 225
 226 static
 227 void unicode_dealloc(register PyUnicodeObject *unicode)
 228 {
 229     if (PyUnicode_CheckExact(unicode) &&
 230         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 231         /* Keep-Alive optimization */
 232         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 233             PyMem_DEL(unicode->str);
 234             unicode->str = NULL;
 235             unicode->length = 0;
 236         }
 237         if (unicode->defenc) {
 238             Py_DECREF(unicode->defenc);
 239             unicode->defenc = NULL;
 240         }
 241         /* Add to free list */
 242         *(PyUnicodeObject **)unicode = unicode_freelist;
 243         unicode_freelist = unicode;
 244         unicode_freelist_size++;
 245     }
 246     else {
 247         PyMem_DEL(unicode->str);
 248         Py_XDECREF(unicode->defenc);
 249         unicode->ob_type->tp_free((PyObject *)unicode);
 250     }
 251 }
 252
 253 int PyUnicode_Resize(PyObject **unicode,
 254                      int length)
 255 {
 256     register PyUnicodeObject *v;
 257
 258     /* Argument checks */
 259     if (unicode == NULL) {
 260         PyErr_BadInternalCall();
 261         return -1;
 262     }
 263     v = (PyUnicodeObject *)*unicode;
 264     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 265         PyErr_BadInternalCall();
 266         return -1;
 267     }
 268
 269     /* Resizing unicode_empty and single character objects is not
 270        possible since these are being shared. We simply return a fresh
 271        copy with the same Unicode content. */
 272     if (v->length != length &&
 273         (v == unicode_empty || v->length == 1)) {
 274         PyUnicodeObject *w = _PyUnicode_New(length);
 275         if (w == NULL)
 276             return -1;
 277         Py_UNICODE_COPY(w->str, v->str,
 278                         length < v->length ? length : v->length);
 279         *unicode = (PyObject *)w;
 280         return 0;
 281     }
 282
 283     /* Note that we don't have to modify *unicode for unshared Unicode
 284        objects, since we can modify them in-place. */
 285     return unicode_resize(v, length);
 286 }
 287
 288 /* Internal API for use in unicodeobject.c only ! */
 289 #define _PyUnicode_Resize(unicodevar, length) \
 290         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 291
 292 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 293                                 int size)
 294 {
 295     PyUnicodeObject *unicode;
 296
 297     /* If the Unicode data is known at construction time, we can apply
 298        some optimizations which share commonly used objects. */
 299     if (u != NULL) {
 300
 301         /* Optimization for empty strings */
 302         if (size == 0 && unicode_empty != NULL) {
 303             Py_INCREF(unicode_empty);
 304             return (PyObject *)unicode_empty;
 305         }
 306
 307         /* Single character Unicode objects in the Latin-1 range are
 308            shared when using this constructor */
 309         if (size == 1 && *u < 256) {
 310             unicode = unicode_latin1[*u];
 311             if (!unicode) {
 312                 unicode = _PyUnicode_New(1);
 313                 if (!unicode)
 314                     return NULL;
 315                 unicode->str[0] = *u;
 316                 unicode_latin1[*u] = unicode;
 317             }
 318             Py_INCREF(unicode);
 319             return (PyObject *)unicode;
 320         }
 321     }
 322
 323     unicode = _PyUnicode_New(size);
 324     if (!unicode)
 325         return NULL;
 326
 327     /* Copy the Unicode data into the new object */
 328     if (u != NULL)
 329         Py_UNICODE_COPY(unicode->str, u, size);
 330
 331     return (PyObject *)unicode;
 332 }
 333
 334 #ifdef HAVE_WCHAR_H
 335
 336 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 337                                  int size)
 338 {
 339     PyUnicodeObject *unicode;
 340
 341     if (w == NULL) {
 342         PyErr_BadInternalCall();
 343         return NULL;
 344     }
 345
 346     unicode = _PyUnicode_New(size);
 347     if (!unicode)
 348         return NULL;
 349
 350     /* Copy the wchar_t data into the new object */
 351 #ifdef HAVE_USABLE_WCHAR_T
 352     memcpy(unicode->str, w, size * sizeof(wchar_t));
 353 #else
 354     {
 355         register Py_UNICODE *u;
 356         register int i;
 357         u = PyUnicode_AS_UNICODE(unicode);
 358         for (i = size; i >= 0; i--)
 359             *u++ = *w++;
 360     }
 361 #endif
 362
 363     return (PyObject *)unicode;
 364 }
 365
 366 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 367                          register wchar_t *w,
 368                          int size)
 369 {
 370     if (unicode == NULL) {
 371         PyErr_BadInternalCall();
 372         return -1;
 373     }
 374     if (size > PyUnicode_GET_SIZE(unicode))
 375         size = PyUnicode_GET_SIZE(unicode);
 376 #ifdef HAVE_USABLE_WCHAR_T
 377     memcpy(w, unicode->str, size * sizeof(wchar_t));
 378 #else
 379     {
 380         register Py_UNICODE *u;
 381         register int i;
 382         u = PyUnicode_AS_UNICODE(unicode);
 383         for (i = size; i >= 0; i--)
 384             *w++ = *u++;
 385     }
 386 #endif
 387
 388     return size;
 389 }
 390
 391 #endif
 392
 393 PyObject *PyUnicode_FromObject(register PyObject *obj)
 394 {
 395     /* XXX Perhaps we should make this API an alias of
 396            PyObject_Unicode() instead ?! */
 397     if (PyUnicode_CheckExact(obj)) {
 398         Py_INCREF(obj);
 399         return obj;
 400     }
 401     if (PyUnicode_Check(obj)) {
 402         /* For a Unicode subtype that's not a Unicode object,
 403            return a true Unicode object with the same data. */
 404         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 405                                      PyUnicode_GET_SIZE(obj));
 406     }
 407     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 408 }
 409
 410 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 411                                       const char *encoding,
 412                                       const char *errors)
 413 {
 414     const char *s = NULL;
 415     int len;
 416     int owned = 0;
 417     PyObject *v;
 418
 419     if (obj == NULL) {
 420         PyErr_BadInternalCall();
 421         return NULL;
 422     }
 423
 424 #if 0
 425     /* For b/w compatibility we also accept Unicode objects provided
 426        that no encodings is given and then redirect to
 427        PyObject_Unicode() which then applies the additional logic for
 428        Unicode subclasses.
 429
 430        NOTE: This API should really only be used for object which
 431              represent *encoded* Unicode !
 432
 433     */
 434         if (PyUnicode_Check(obj)) {
 435             if (encoding) {
 436                 PyErr_SetString(PyExc_TypeError,
 437                                 "decoding Unicode is not supported");
 438             return NULL;
 439             }
 440         return PyObject_Unicode(obj);
 441             }
 442 #else
 443     if (PyUnicode_Check(obj)) {
 444         PyErr_SetString(PyExc_TypeError,
 445                         "decoding Unicode is not supported");
 446         return NULL;
 447         }
 448 #endif
 449
 450     /* Coerce object */
 451     if (PyString_Check(obj)) {
 452             s = PyString_AS_STRING(obj);
 453             len = PyString_GET_SIZE(obj);
 454             }
 455     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 456         /* Overwrite the error message with something more useful in
 457            case of a TypeError. */
 458         if (PyErr_ExceptionMatches(PyExc_TypeError))
 459         PyErr_Format(PyExc_TypeError,
 460                          "coercing to Unicode: need string or buffer, "
 461                          "%.80s found",
 462                      obj->ob_type->tp_name);
 463         goto onError;
 464     }
 465
 466     /* Convert to Unicode */
 467     if (len == 0) {
 468         Py_INCREF(unicode_empty);
 469         v = (PyObject *)unicode_empty;
 470     }
 471     else
 472         v = PyUnicode_Decode(s, len, encoding, errors);
 473
 474     if (owned) {
 475         Py_DECREF(obj);
 476     }
 477     return v;
 478
 479  onError:
 480     if (owned) {
 481         Py_DECREF(obj);
 482     }
 483     return NULL;
 484 }
 485
 486 PyObject *PyUnicode_Decode(const char *s,
 487                            int size,
 488                            const char *encoding,
 489                            const char *errors)
 490 {
 491     PyObject *buffer = NULL, *unicode;
 492
 493     if (encoding == NULL)
 494         encoding = PyUnicode_GetDefaultEncoding();
 495
 496     /* Shortcuts for common default encodings */
 497     if (strcmp(encoding, "utf-8") == 0)
 498         return PyUnicode_DecodeUTF8(s, size, errors);
 499     else if (strcmp(encoding, "latin-1") == 0)
 500         return PyUnicode_DecodeLatin1(s, size, errors);
 501     else if (strcmp(encoding, "ascii") == 0)
 502         return PyUnicode_DecodeASCII(s, size, errors);
 503
 504     /* Decode via the codec registry */
 505     buffer = PyBuffer_FromMemory((void *)s, size);
 506     if (buffer == NULL)
 507         goto onError;
 508     unicode = PyCodec_Decode(buffer, encoding, errors);
 509     if (unicode == NULL)
 510         goto onError;
 511     if (!PyUnicode_Check(unicode)) {
 512         PyErr_Format(PyExc_TypeError,
 513                      "decoder did not return an unicode object (type=%.400s)",
 514                      unicode->ob_type->tp_name);
 515         Py_DECREF(unicode);
 516         goto onError;
 517     }
 518     Py_DECREF(buffer);
 519     return unicode;
 520
 521  onError:
 522     Py_XDECREF(buffer);
 523     return NULL;
 524 }
 525
 526 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 527                            int size,
 528                            const char *encoding,
 529                            const char *errors)
 530 {
 531     PyObject *v, *unicode;
 532
 533     unicode = PyUnicode_FromUnicode(s, size);
 534     if (unicode == NULL)
 535         return NULL;
 536     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 537     Py_DECREF(unicode);
 538     return v;
 539 }
 540
 541 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 542                                     const char *encoding,
 543                                     const char *errors)
 544 {
 545     PyObject *v;
 546
 547     if (!PyUnicode_Check(unicode)) {
 548         PyErr_BadArgument();
 549         goto onError;
 550     }
 551
 552     if (encoding == NULL)
 553         encoding = PyUnicode_GetDefaultEncoding();
 554
 555     /* Shortcuts for common default encodings */
 556     if (errors == NULL) {
 557         if (strcmp(encoding, "utf-8") == 0)
 558             return PyUnicode_AsUTF8String(unicode);
 559         else if (strcmp(encoding, "latin-1") == 0)
 560             return PyUnicode_AsLatin1String(unicode);
 561         else if (strcmp(encoding, "ascii") == 0)
 562             return PyUnicode_AsASCIIString(unicode);
 563     }
 564
 565     /* Encode via the codec registry */
 566     v = PyCodec_Encode(unicode, encoding, errors);
 567     if (v == NULL)
 568         goto onError;
 569     /* XXX Should we really enforce this ? */
 570     if (!PyString_Check(v)) {
 571         PyErr_Format(PyExc_TypeError,
 572                      "encoder did not return a string object (type=%.400s)",
 573                      v->ob_type->tp_name);
 574         Py_DECREF(v);
 575         goto onError;
 576     }
 577     return v;
 578
 579  onError:
 580     return NULL;
 581 }
 582
 583 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 584                                             const char *errors)
 585 {
 586     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 587
 588     if (v)
 589         return v;
 590     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 591     if (v && errors == NULL)
 592         ((PyUnicodeObject *)unicode)->defenc = v;
 593     return v;
 594 }
 595
 596 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 597 {
 598     if (!PyUnicode_Check(unicode)) {
 599         PyErr_BadArgument();
 600         goto onError;
 601     }
 602     return PyUnicode_AS_UNICODE(unicode);
 603
 604  onError:
 605     return NULL;
 606 }
 607
 608 int PyUnicode_GetSize(PyObject *unicode)
 609 {
 610     if (!PyUnicode_Check(unicode)) {
 611         PyErr_BadArgument();
 612         goto onError;
 613     }
 614     return PyUnicode_GET_SIZE(unicode);
 615
 616  onError:
 617     return -1;
 618 }
 619
 620 const char *PyUnicode_GetDefaultEncoding(void)
 621 {
 622     return unicode_default_encoding;
 623 }
 624
 625 int PyUnicode_SetDefaultEncoding(const char *encoding)
 626 {
 627     PyObject *v;
 628
 629     /* Make sure the encoding is valid. As side effect, this also
 630        loads the encoding into the codec registry cache. */
 631     v = _PyCodec_Lookup(encoding);
 632     if (v == NULL)
 633         goto onError;
 634     Py_DECREF(v);
 635     strncpy(unicode_default_encoding,
 636             encoding,
 637             sizeof(unicode_default_encoding));
 638     return 0;
 639
 640  onError:
 641     return -1;
 642 }
 643
 644 /* --- UTF-7 Codec -------------------------------------------------------- */
 645
 646 /* see RFC2152 for details */
 647
 648 static
 649 char utf7_special[128] = {
 650     /* indicate whether a UTF-7 character is special i.e. cannot be directly
 651        encoded:
 652            0 - not special
 653            1 - special
 654            2 - whitespace (optional)
 655            3 - RFC2152 Set O (optional) */
 656     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
 657     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 658     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
 659     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
 660     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 661     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
 662     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 663     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 664
 665 };
 666
 667 #define SPECIAL(c, encodeO, encodeWS) \
 668         (((c)>127 || utf7_special[(c)] == 1) || \
 669          (encodeWS && (utf7_special[(c)] == 2)) || \
 670      (encodeO && (utf7_special[(c)] == 3)))
 671
 672 #define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
 673 #define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
 674 #define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
 675                         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
 676
 677 #define ENCODE(out, ch, bits) \
 678     while (bits >= 6) { \
 679         *out++ = B64(ch >> (bits-6)); \
 680         bits -= 6; \
 681     }
 682
 683 #define DECODE(out, ch, bits, surrogate) \
 684     while (bits >= 16) { \
 685         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
 686         bits -= 16; \
 687                 if (surrogate) { \
 688                         /* We have already generated an error for the high surrogate
 689                so let's not bother seeing if the low surrogate is correct or not */\
 690                         surrogate = 0; \
 691                 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
 692             /* This is a surrogate pair. Unfortunately we can't represent \
 693                it in a 16-bit character */ \
 694                         surrogate = 1; \
 695             errmsg = "code pairs are not supported"; \
 696                 goto utf7Error; \
 697                 } else { \
 698                                 *out++ = outCh; \
 699                 } \
 700     } \
 701
 702 static
 703 int utf7_decoding_error(Py_UNICODE **dest,
 704                         const char *errors,
 705                         const char *details)
 706 {
 707     if ((errors == NULL) ||
 708         (strcmp(errors,"strict") == 0)) {
 709         PyErr_Format(PyExc_UnicodeError,
 710                      "UTF-7 decoding error: %.400s",
 711                      details);
 712         return -1;
 713     }
 714     else if (strcmp(errors,"ignore") == 0) {
 715         return 0;
 716     }
 717     else if (strcmp(errors,"replace") == 0) {
 718         if (dest != NULL) {
 719             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 720             (*dest)++;
 721         }
 722         return 0;
 723     }
 724     else {
 725         PyErr_Format(PyExc_ValueError,
 726                      "UTF-7 decoding error; unknown error handling code: %.400s",
 727                      errors);
 728         return -1;
 729     }
 730 }
 731
 732 PyObject *PyUnicode_DecodeUTF7(const char *s,
 733                                int size,
 734                                const char *errors)
 735 {
 736     const char *e;
 737     PyUnicodeObject *unicode;
 738     Py_UNICODE *p;
 739     const char *errmsg = "";
 740     int inShift = 0;
 741     unsigned int bitsleft = 0;
 742     unsigned long charsleft = 0;
 743         int surrogate = 0;
 744
 745     unicode = _PyUnicode_New(size);
 746     if (!unicode)
 747         return NULL;
 748     if (size == 0)
 749         return (PyObject *)unicode;
 750
 751     p = unicode->str;
 752     e = s + size;
 753
 754     while (s < e) {
 755         Py_UNICODE ch = *s;
 756
 757         if (inShift) {
 758             if ((ch == '-') || !B64CHAR(ch)) {
 759                 inShift = 0;
 760                 s++;
 761
 762                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 763                 if (bitsleft >= 6) {
 764                     /* The shift sequence has a partial character in it. If
 765                        bitsleft < 6 then we could just classify it as padding
 766                        but that is not the case here */
 767
 768                     errmsg = "partial character in shift sequence";
 769                     goto utf7Error;
 770                 }
 771                 /* According to RFC2152 the remaining bits should be zero. We
 772                    choose to signal an error/insert a replacement character
 773                    here so indicate the potential of a misencoded character. */
 774
 775                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 776                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 777                     errmsg = "non-zero padding bits in shift sequence";
 778                     goto utf7Error;
 779                 }
 780
 781                 if (ch == '-') {
 782                     if ((s < e) && (*(s) == '-')) {
 783                         *p++ = '-';
 784                         inShift = 1;
 785                     }
 786                 } else if (SPECIAL(ch,0,0)) {
 787                     errmsg = "unexpected special character";
 788                         goto utf7Error;
 789                 } else  {
 790                     *p++ = ch;
 791                 }
 792             } else {
 793                 charsleft = (charsleft << 6) | UB64(ch);
 794                 bitsleft += 6;
 795                 s++;
 796                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 797             }
 798         }
 799         else if ( ch == '+' ) {
 800             s++;
 801             if (s < e && *s == '-') {
 802                 s++;
 803                 *p++ = '+';
 804             } else
 805             {
 806                 inShift = 1;
 807                 bitsleft = 0;
 808             }
 809         }
 810         else if (SPECIAL(ch,0,0)) {
 811             errmsg = "unexpected special character";
 812             s++;
 813                 goto utf7Error;
 814         }
 815         else {
 816             *p++ = ch;
 817             s++;
 818         }
 819         continue;
 820     utf7Error:
 821       if (utf7_decoding_error(&p, errors, errmsg))
 822           goto onError;
 823     }
 824
 825     if (inShift) {
 826         if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
 827             goto onError;
 828     }
 829
 830     if (_PyUnicode_Resize(&unicode, p - unicode->str))
 831         goto onError;
 832
 833     return (PyObject *)unicode;
 834
 835 onError:
 836     Py_DECREF(unicode);
 837     return NULL;
 838 }
 839
 840
 841 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
 842                    int size,
 843                    int encodeSetO,
 844                    int encodeWhiteSpace,
 845                    const char *errors)
 846 {
 847     PyObject *v;
 848     /* It might be possible to tighten this worst case */
 849     unsigned int cbAllocated = 5 * size;
 850     int inShift = 0;
 851     int i = 0;
 852     unsigned int bitsleft = 0;
 853     unsigned long charsleft = 0;
 854     char * out;
 855     char * start;
 856
 857     if (size == 0)
 858                 return PyString_FromStringAndSize(NULL, 0);
 859
 860     v = PyString_FromStringAndSize(NULL, cbAllocated);
 861     if (v == NULL)
 862         return NULL;
 863
 864     start = out = PyString_AS_STRING(v);
 865     for (;i < size; ++i) {
 866         Py_UNICODE ch = s[i];
 867
 868         if (!inShift) {
 869                         if (ch == '+') {
 870                                 *out++ = '+';
 871                 *out++ = '-';
 872             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 873                 charsleft = ch;
 874                 bitsleft = 16;
 875                 *out++ = '+';
 876                                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 877                 inShift = bitsleft > 0;
 878                         } else {
 879                                 *out++ = (char) ch;
 880                         }
 881                 } else {
 882             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 883                 *out++ = B64(charsleft << (6-bitsleft));
 884                 charsleft = 0;
 885                 bitsleft = 0;
 886                 /* Characters not in the BASE64 set implicitly unshift the sequence
 887                    so no '-' is required, except if the character is itself a '-' */
 888                 if (B64CHAR(ch) || ch == '-') {
 889                     *out++ = '-';
 890                 }
 891                 inShift = 0;
 892                 *out++ = (char) ch;
 893             } else {
 894                 bitsleft += 16;
 895                 charsleft = (charsleft << 16) | ch;
 896                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 897
 898                 /* If the next character is special then we dont' need to terminate
 899                    the shift sequence. If the next character is not a BASE64 character
 900                    or '-' then the shift sequence will be terminated implicitly and we
 901                    don't have to insert a '-'. */
 902
 903                 if (bitsleft == 0) {
 904                     if (i + 1 < size) {
 905                         Py_UNICODE ch2 = s[i+1];
 906
 907                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
 908
 909                         } else if (B64CHAR(ch2) || ch2 == '-') {
 910                             *out++ = '-';
 911                             inShift = 0;
 912                         } else {
 913                             inShift = 0;
 914                         }
 915
 916                     }
 917                     else {
 918                         *out++ = '-';
 919                         inShift = 0;
 920                     }
 921                 }
 922             }
 923         }
 924         }
 925     if (bitsleft) {
 926         *out++= B64(charsleft << (6-bitsleft) );
 927         *out++ = '-';
 928     }
 929
 930     if (_PyString_Resize(&v, out - start)) {
 931         Py_DECREF(v);
 932         return NULL;
 933     }
 934     return v;
 935 }
 936
 937 #undef SPECIAL
 938 #undef B64
 939 #undef B64CHAR
 940 #undef UB64
 941 #undef ENCODE
 942 #undef DECODE
 943
 944 /* --- UTF-8 Codec -------------------------------------------------------- */
 945
/* Lookup table indexed by the first byte of a UTF-8 sequence.
   PyUnicode_DecodeUTF8() reads it to learn how many bytes the sequence
   occupies; each row below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    /* 0x00-0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: prefix of a two-byte sequence */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: prefix of a three-byte sequence */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF7: four bytes, 0xF8-0xFB: five, 0xFC-0xFD: six,
       0xFE-0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
 967
 968 static
 969 int utf8_decoding_error(const char **source,
 970                         Py_UNICODE **dest,
 971                         const char *errors,
 972                         const char *details)
 973 {
 974     if ((errors == NULL) ||
 975         (strcmp(errors,"strict") == 0)) {
 976         PyErr_Format(PyExc_UnicodeError,
 977                      "UTF-8 decoding error: %.400s",
 978                      details);
 979         return -1;
 980     }
 981     else if (strcmp(errors,"ignore") == 0) {
 982         (*source)++;
 983         return 0;
 984     }
 985     else if (strcmp(errors,"replace") == 0) {
 986         (*source)++;
 987         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 988         (*dest)++;
 989         return 0;
 990     }
 991     else {
 992         PyErr_Format(PyExc_ValueError,
 993                      "UTF-8 decoding error; unknown error handling code: %.400s",
 994                      errors);
 995         return -1;
 996     }
 997 }
 998
/* Decode `size` bytes of UTF-8 data starting at `s` into a new Unicode
   object.

   s      - input bytes
   size   - number of input bytes
   errors - error policy, forwarded unchanged to utf8_decoding_error()
            whenever a malformed sequence is found

   Returns the new Unicode object, or NULL when allocation fails, the
   error helper reports failure, or the final resize fails.  Sequences
   of two to four bytes are decoded; five- and six-byte prefixes are
   reported as "unsupported Unicode code range". */
PyObject *PyUnicode_DecodeUTF8(const char *s,
                               int size,
                               const char *errors)
{
    int n;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: bytes below 0x80 map straight to one character. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length implied by the prefix byte (0 = illegal). */
        n = utf8_code_length[ch];

        /* The whole sequence must lie inside the input buffer. */
        if (s + n > e) {
            errmsg = "unexpected end of data";
            goto utf8Error;
        }

        switch (n) {

        case 0:
            errmsg = "unexpected code byte";
            goto utf8Error;

        case 1:
            /* Length-1 prefixes are all < 0x80 and were handled above. */
            errmsg = "internal error";
            goto utf8Error;

        case 2:
            /* Every trailing byte must have the bit pattern 10xxxxxx. */
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            /* Reject values that would have fit in a single byte. */
            if (ch < 0x80) {
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            /* Reject values that fit in two bytes, and the surrogate
               range 0xD800-0xDFFF. */
            if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
                errmsg = "illegal encoding";
                goto utf8Error;
            }
            else
                                *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80) {
                errmsg = "invalid data";
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            /* validate and convert to UTF-16 */
            if ((ch < 0x10000)        /* minimum value allowed for 4
                                       byte encoding */
                || (ch > 0x10ffff))   /* maximum value allowed for
                                       UTF-16 */
            {
                errmsg = "illegal encoding";
                goto utf8Error;
            }
#ifdef Py_UNICODE_WIDE
            /* Wide builds store the code point in a single unit. */
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;

        default:
            /* Other sizes are only needed for UCS-4 */
            errmsg = "unsupported Unicode code range";
            goto utf8Error;
        }
        /* Successful decode: step over the whole sequence. */
        s += n;
        continue;

    /* Reached only via goto.  On success the helper has advanced s by
       one byte (and possibly written a replacement character), so the
       loop resumes right after the offending byte. */
    utf8Error:
      if (utf8_decoding_error(&s, &p, errors, errmsg))
          goto onError;
    }

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
1133
/* Compiled out: the UTF-8 encoder now handles UTF-16 surrogates itself
   and no longer reports encoding errors through this helper. */
#if 0
/* Apply the error policy named by `errors` to an unencodable character.
   Returns 0 to continue (writing '?' through dest in "replace" mode),
   -1 with an exception set otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors,"strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors,"ignore") == 0)
        return 0;

    if (strcmp(errors,"replace") == 0) {
        char *out = *dest;

        *out = '?';
        *dest = out + 1;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
1167
1168 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169                                int size,
1170                                const char *errors)
1171 {
1172     PyObject *v;
1173     char *p;
1174     char *q;
1175     Py_UCS4 ch2;
1176     unsigned int cbAllocated = 3 * size;
1177     unsigned int cbWritten = 0;
1178     int i = 0;
1179
1180     v = PyString_FromStringAndSize(NULL, cbAllocated);
1181     if (v == NULL)
1182         return NULL;
1183     if (size == 0)
1184         return v;
1185
1186     p = q = PyString_AS_STRING(v);
1187     while (i < size) {
1188         Py_UCS4 ch = s[i++];
1189         if (ch < 0x80) {
1190             *p++ = (char) ch;
1191             cbWritten++;
1192         }
1193         else if (ch < 0x0800) {
1194             *p++ = 0xc0 | (ch >> 6);
1195             *p++ = 0x80 | (ch & 0x3f);
1196             cbWritten += 2;
1197         }
1198         else if (ch < 0x10000) {
1199             /* Check for high surrogate */
1200             if (0xD800 <= ch && ch <= 0xDBFF) {
1201                 if (i != size) {
1202                     ch2 = s[i];
1203                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1204
1205                         if (cbWritten >= (cbAllocated - 4)) {
1206                             /* Provide enough room for some more
1207                                surrogates */
1208                             cbAllocated += 4*10;
1209                             if (_PyString_Resize(&v, cbAllocated))
1210                                 goto onError;
1211                         }
1212
1213                         /* combine the two values */
1214                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1215
1216                         *p++ = (char)((ch >> 18) | 0xf0);
1217                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1218                         i++;
1219                         cbWritten += 4;
1220                     }
1221                 }
1222             }
1223             else {
1224                 *p++ = (char)(0xe0 | (ch >> 12));
1225                 cbWritten += 3;
1226             }
1227             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1228             *p++ = (char)(0x80 | (ch & 0x3f));
1229         } else {
1230             *p++ = 0xf0 | (ch>>18);
1231             *p++ = 0x80 | ((ch>>12) & 0x3f);
1232             *p++ = 0x80 | ((ch>>6) & 0x3f);
1233             *p++ = 0x80 | (ch & 0x3f);
1234             cbWritten += 4;
1235         }
1236     }
1237     *p = '\0';
1238     if (_PyString_Resize(&v, p - q))
1239         goto onError;
1240     return v;
1241
1242  onError:
1243     Py_DECREF(v);
1244     return NULL;
1245 }
1246
1247 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1248 {
1249     if (!PyUnicode_Check(unicode)) {
1250         PyErr_BadArgument();
1251         return NULL;
1252     }
1253     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1254                                 PyUnicode_GET_SIZE(unicode),
1255                                 NULL);
1256 }
1257
1258 /* --- UTF-16 Codec ------------------------------------------------------- */
1259
1260 static
1261 int utf16_decoding_error(Py_UNICODE **dest,
1262                          const char *errors,
1263                          const char *details)
1264 {
1265     if ((errors == NULL) ||
1266         (strcmp(errors,"strict") == 0)) {
1267         PyErr_Format(PyExc_UnicodeError,
1268                      "UTF-16 decoding error: %.400s",
1269                      details);
1270         return -1;
1271     }
1272     else if (strcmp(errors,"ignore") == 0) {
1273         return 0;
1274     }
1275     else if (strcmp(errors,"replace") == 0) {
1276         if (dest) {
1277             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1278             (*dest)++;
1279         }
1280         return 0;
1281     }
1282     else {
1283         PyErr_Format(PyExc_ValueError,
1284                      "UTF-16 decoding error; "
1285                      "unknown error handling code: %.400s",
1286                      errors);
1287         return -1;
1288     }
1289 }
1290
/* Decode `size` bytes of UTF-16 data starting at `s` into a new Unicode
   object.

   s         - input bytes
   size      - number of input bytes; an odd size is reported through
               utf16_decoding_error() and, if tolerated, the last byte
               is dropped
   errors    - error policy forwarded to utf16_decoding_error()
   byteorder - optional in/out parameter.  On input: 0 = native order
               with BOM detection, -1 = little endian, 1 = big endian.
               When the whole input has been decoded it is set to the
               byte order that was used.  May be NULL (treated as 0).

   Returns the new Unicode object, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF16(const char *s,
                      int size,
                      const char *errors,
                      int *byteorder)
{
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    const char *errmsg = "";
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = 1, ilo = 0;
#else
    int ihi = 0, ilo = 1;
#endif

    /* size should be an even number */
    if (size & 1) {
        /* No output buffer exists yet, hence the NULL destination. */
        if (utf16_decoding_error(NULL, errors, "truncated data"))
            return NULL;
        --size;  /* else ignore the oddball byte */
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    /* Unpack UTF-16 encoded data */
    p = unicode->str;
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0) {
        /* First unit read in native order: 0xFEFF means the data is in
           native order, 0xFFFE means it is byte-swapped. */
        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
#else
        if (bom == 0xFEFF) {
            q += 2;
            bo = 1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = -1;
        }
#endif
    }

    if (bo == -1) {
        /* force LE */
        ihi = 1;
        ilo = 0;
    }
    else if (bo == 1) {
        /* force BE */
        ihi = 0;
        ilo = 1;
    }

    while (q < e) {
        Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
        q += 2;

        /* Anything outside the surrogate range is copied directly. */
        if (ch < 0xD800 || ch > 0xDFFF) {
            *p++ = ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (q >= e) {
            errmsg = "unexpected end of data";
            goto utf16Error;
        }
        if (0xD800 <= ch && ch <= 0xDBFF) {
            /* High surrogate: the next unit must be a low surrogate.
               That unit is consumed whether or not it is valid. */
            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
            q += 2;
            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE
                /* Narrow builds keep the pair as two units. */
                *p++ = ch;
                *p++ = ch2;
#else
                /* Wide builds merge the pair into one code point. */
                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
                continue;
            }
            else {
                errmsg = "illegal UTF-16 surrogate";
                goto utf16Error;
            }

        }
        /* Only a low surrogate without a preceding high one gets here. */
        errmsg = "illegal encoding";
        /* Fall through to report the error */

    utf16Error:
        if (utf16_decoding_error(&p, errors, errmsg))
            goto onError;
    }

    /* Report the byte order that was actually used. */
    if (byteorder)
        *byteorder = bo;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
1423
1424 PyObject *
1425 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1426                       int size,
1427                       const char *errors,
1428                       int byteorder)
1429 {
1430     PyObject *v;
1431     unsigned char *p;
1432     int i, pairs;
1433     /* Offsets from p for storing byte pairs in the right order. */
1434 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1435     int ihi = 1, ilo = 0;
1436 #else
1437     int ihi = 0, ilo = 1;
1438 #endif
1439
1440 #define STORECHAR(CH)                   \
1441     do {                                \
1442         p[ihi] = ((CH) >> 8) & 0xff;    \
1443         p[ilo] = (CH) & 0xff;           \
1444         p += 2;                         \
1445     } while(0)
1446
1447     for (i = pairs = 0; i < size; i++)
1448         if (s[i] >= 0x10000)
1449             pairs++;
1450     v = PyString_FromStringAndSize(NULL,
1451                   2 * (size + pairs + (byteorder == 0)));
1452     if (v == NULL)
1453         return NULL;
1454
1455     p = (unsigned char *)PyString_AS_STRING(v);
1456     if (byteorder == 0)
1457         STORECHAR(0xFEFF);
1458     if (size == 0)
1459         return v;
1460
1461     if (byteorder == -1) {
1462         /* force LE */
1463         ihi = 1;
1464         ilo = 0;
1465     }
1466     else if (byteorder == 1) {
1467         /* force BE */
1468         ihi = 0;
1469         ilo = 1;
1470     }
1471
1472     while (size-- > 0) {
1473         Py_UNICODE ch = *s++;
1474         Py_UNICODE ch2 = 0;
1475         if (ch >= 0x10000) {
1476             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1477             ch  = 0xD800 | ((ch-0x10000) >> 10);
1478         }
1479         STORECHAR(ch);
1480         if (ch2)
1481             STORECHAR(ch2);
1482     }
1483     return v;
1484 #undef STORECHAR
1485 }
1486
1487 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1488 {
1489     if (!PyUnicode_Check(unicode)) {
1490         PyErr_BadArgument();
1491         return NULL;
1492     }
1493     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1494                                  PyUnicode_GET_SIZE(unicode),
1495                                  NULL,
1496                                  0);
1497 }
1498
1499 /* --- Unicode Escape Codec ----------------------------------------------- */
1500
1501 static
1502 int unicodeescape_decoding_error(const char **source,
1503                                  Py_UNICODE *x,
1504                                  const char *errors,
1505                                  const char *details)
1506 {
1507     if ((errors == NULL) ||
1508         (strcmp(errors,"strict") == 0)) {
1509         PyErr_Format(PyExc_UnicodeError,
1510                      "Unicode-Escape decoding error: %.400s",
1511                      details);
1512         return -1;
1513     }
1514     else if (strcmp(errors,"ignore") == 0) {
1515         return 0;
1516     }
1517     else if (strcmp(errors,"replace") == 0) {
1518         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1519         return 0;
1520     }
1521     else {
1522         PyErr_Format(PyExc_ValueError,
1523                      "Unicode-Escape decoding error; "
1524                      "unknown error handling code: %.400s",
1525                      errors);
1526         return -1;
1527     }
1528 }
1529
/* Cached pointer to the character-name lookup API used for \N{name}
   escapes.  It starts out NULL; PyUnicode_DecodeUnicodeEscape() fills
   it in the first time it meets a \N escape, from the "ucnhash_CAPI"
   attribute of the unicodedata module. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1531
1532 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1533                                         int size,
1534                                         const char *errors)
1535 {
1536     PyUnicodeObject *v;
1537     Py_UNICODE *p, *buf;
1538     const char *end;
1539     char* message;
1540     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1541
1542     /* Escaped strings will always be longer than the resulting
1543        Unicode string, so we start with size here and then reduce the
1544        length after conversion to the true value. */
1545     v = _PyUnicode_New(size);
1546     if (v == NULL)
1547         goto onError;
1548     if (size == 0)
1549         return (PyObject *)v;
1550
1551     p = buf = PyUnicode_AS_UNICODE(v);
1552     end = s + size;
1553
1554     while (s < end) {
1555         unsigned char c;
1556         Py_UNICODE x;
1557         int i, digits;
1558
1559         /* Non-escape characters are interpreted as Unicode ordinals */
1560         if (*s != '\\') {
1561             *p++ = (unsigned char) *s++;
1562             continue;
1563         }
1564
1565         /* \ - Escapes */
1566         s++;
1567         switch (*s++) {
1568
1569         /* \x escapes */
1570         case '\n': break;
1571         case '\\': *p++ = '\\'; break;
1572         case '\'': *p++ = '\''; break;
1573         case '\"': *p++ = '\"'; break;
1574         case 'b': *p++ = '\b'; break;
1575         case 'f': *p++ = '\014'; break; /* FF */
1576         case 't': *p++ = '\t'; break;
1577         case 'n': *p++ = '\n'; break;
1578         case 'r': *p++ = '\r'; break;
1579         case 'v': *p++ = '\013'; break; /* VT */
1580         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1581
1582         /* \OOO (octal) escapes */
1583         case '0': case '1': case '2': case '3':
1584         case '4': case '5': case '6': case '7':
1585             x = s[-1] - '0';
1586             if ('0' <= *s && *s <= '7') {
1587                 x = (x<<3) + *s++ - '0';
1588                 if ('0' <= *s && *s <= '7')
1589                     x = (x<<3) + *s++ - '0';
1590             }
1591             *p++ = x;
1592             break;
1593
1594         /* hex escapes */
1595         /* \xXX */
1596         case 'x':
1597             digits = 2;
1598             message = "truncated \\xXX escape";
1599             goto hexescape;
1600
1601         /* \uXXXX */
1602         case 'u':
1603             digits = 4;
1604             message = "truncated \\uXXXX escape";
1605             goto hexescape;
1606
1607         /* \UXXXXXXXX */
1608         case 'U':
1609             digits = 8;
1610             message = "truncated \\UXXXXXXXX escape";
1611         hexescape:
1612             chr = 0;
1613             for (i = 0; i < digits; i++) {
1614                 c = (unsigned char) s[i];
1615                 if (!isxdigit(c)) {
1616                     if (unicodeescape_decoding_error(&s, &x, errors, message))
1617                         goto onError;
1618                     chr = x;
1619                     i++;
1620                     break;
1621                 }
1622                 chr = (chr<<4) & ~0xF;
1623                 if (c >= '0' && c <= '9')
1624                     chr += c - '0';
1625                 else if (c >= 'a' && c <= 'f')
1626                     chr += 10 + c - 'a';
1627                 else
1628                     chr += 10 + c - 'A';
1629             }
1630             s += i;
1631         store:
1632             /* when we get here, chr is a 32-bit unicode character */
1633             if (chr <= 0xffff)
1634                 /* UCS-2 character */
1635                 *p++ = (Py_UNICODE) chr;
1636             else if (chr <= 0x10ffff) {
1637                 /* UCS-4 character. Either store directly, or as
1638                    surrogate pair. */
1639 #ifdef Py_UNICODE_WIDE
1640                 *p++ = chr;
1641 #else
1642                 chr -= 0x10000L;
1643                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1644                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1645 #endif
1646             } else {
1647                 if (unicodeescape_decoding_error(
1648                     &s, &x, errors,
1649                     "illegal Unicode character")
1650                     )
1651                     goto onError;
1652                 *p++ = x; /* store replacement character */
1653             }
1654             break;
1655
1656         /* \N{name} */
1657         case 'N':
1658             message = "malformed \\N character escape";
1659             if (ucnhash_CAPI == NULL) {
1660                 /* load the unicode data module */
1661                 PyObject *m, *v;
1662                 m = PyImport_ImportModule("unicodedata");
1663                 if (m == NULL)
1664                     goto ucnhashError;
1665                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1666                 Py_DECREF(m);
1667                 if (v == NULL)
1668                     goto ucnhashError;
1669                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1670                 Py_DECREF(v);
1671                 if (ucnhash_CAPI == NULL)
1672                     goto ucnhashError;
1673             }
1674             if (*s == '{') {
1675                 const char *start = s+1;
1676                 /* look for the closing brace */
1677                 while (*s != '}' && s < end)
1678                     s++;
1679                 if (s > start && s < end && *s == '}') {
1680                     /* found a name.  look it up in the unicode database */
1681                     message = "unknown Unicode character name";
1682                     s++;
1683                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1684                         goto store;
1685                 }
1686             }
1687             if (unicodeescape_decoding_error(&s, &x, errors, message))
1688                 goto onError;
1689             *p++ = x;
1690             break;
1691
1692         default:
1693             *p++ = '\\';
1694             *p++ = (unsigned char)s[-1];
1695             break;
1696         }
1697     }
1698     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1699                 goto onError;
1700     return (PyObject *)v;
1701
1702 ucnhashError:
1703     PyErr_SetString(
1704         PyExc_UnicodeError,
1705         "\\N escapes not supported (can't load unicodedata module)"
1706         );
1707     return NULL;
1708
1709 onError:
1710     Py_XDECREF(v);
1711     return NULL;
1712 }
1713
1714 /* Return a Unicode-Escape string version of the Unicode object.
1715
1716    If quotes is true, the string is enclosed in u"" or u'' quotes as
1717    appropriate.
1718
1719 */
1720
/* Forward declaration so that unicodeescape_string() below can call
   findchar() to test whether the input contains a given quote
   character.  The definition is not in this part of the file;
   presumably it returns NULL when ch is absent -- confirm against the
   definition. */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
                                  int size,
                                  Py_UNICODE ch);
1724
1725 static
1726 PyObject *unicodeescape_string(const Py_UNICODE *s,
1727                                int size,
1728                                int quotes)
1729 {
1730     PyObject *repr;
1731     char *p;
1732
1733     static const char *hexdigit = "0123456789abcdef";
1734
1735     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1736     if (repr == NULL)
1737         return NULL;
1738
1739     p = PyString_AS_STRING(repr);
1740
1741     if (quotes) {
1742         *p++ = 'u';
1743         *p++ = (findchar(s, size, '\'') &&
1744                 !findchar(s, size, '"')) ? '"' : '\'';
1745     }
1746     while (size-- > 0) {
1747         Py_UNICODE ch = *s++;
1748
1749         /* Escape quotes */
1750         if (quotes &&
1751             (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1752             *p++ = '\\';
1753             *p++ = (char) ch;
1754             continue;
1755         }
1756
1757 #ifdef Py_UNICODE_WIDE
1758         /* Map 21-bit characters to '\U00xxxxxx' */
1759         else if (ch >= 0x10000) {
1760             int offset = p - PyString_AS_STRING(repr);
1761
1762             /* Resize the string if necessary */
1763             if (offset + 12 > PyString_GET_SIZE(repr)) {
1764                 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1765                     goto onError;
1766                 p = PyString_AS_STRING(repr) + offset;
1767             }
1768
1769             *p++ = '\\';
1770             *p++ = 'U';
1771             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1772             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1773             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1774             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1775             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1776             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1777             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1778             *p++ = hexdigit[ch & 0x0000000F];
1779             continue;
1780         }
1781 #endif
1782         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1783         else if (ch >= 0xD800 && ch < 0xDC00) {
1784             Py_UNICODE ch2;
1785             Py_UCS4 ucs;
1786
1787             ch2 = *s++;
1788             size--;
1789             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1790                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1791                 *p++ = '\\';
1792                 *p++ = 'U';
1793                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1794                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1795                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1796                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1797                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1798                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1799                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1800                 *p++ = hexdigit[ucs & 0x0000000F];
1801                 continue;
1802             }
1803             /* Fall through: isolated surrogates are copied as-is */
1804             s--;
1805             size++;
1806         }
1807
1808         /* Map 16-bit characters to '\uxxxx' */
1809         if (ch >= 256) {
1810             *p++ = '\\';
1811             *p++ = 'u';
1812             *p++ = hexdigit[(ch >> 12) & 0x000F];
1813             *p++ = hexdigit[(ch >> 8) & 0x000F];
1814             *p++ = hexdigit[(ch >> 4) & 0x000F];
1815             *p++ = hexdigit[ch & 0x000F];
1816         }
1817
1818         /* Map special whitespace to '\t', \n', '\r' */
1819         else if (ch == '\t') {
1820             *p++ = '\\';
1821             *p++ = 't';
1822         }
1823         else if (ch == '\n') {
1824             *p++ = '\\';
1825             *p++ = 'n';
1826         }
1827         else if (ch == '\r') {
1828             *p++ = '\\';
1829             *p++ = 'r';
1830         }
1831
1832         /* Map non-printable US ASCII to '\xhh' */
1833         else if (ch < ' ' || ch >= 0x7F) {
1834             *p++ = '\\';
1835             *p++ = 'x';
1836             *p++ = hexdigit[(ch >> 4) & 0x000F];
1837             *p++ = hexdigit[ch & 0x000F];
1838         }
1839
1840         /* Copy everything else as-is */
1841         else
1842             *p++ = (char) ch;
1843     }
1844     if (quotes)
1845         *p++ = PyString_AS_STRING(repr)[1];
1846
1847     *p = '\0';
1848     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1849         goto onError;
1850
1851     return repr;
1852
1853  onError:
1854     Py_DECREF(repr);
1855     return NULL;
1856 }
1857
1858 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1859                                         int size)
1860 {
1861     return unicodeescape_string(s, size, 0);
1862 }
1863
1864 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1865 {
1866     if (!PyUnicode_Check(unicode)) {
1867         PyErr_BadArgument();
1868         return NULL;
1869     }
1870     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1871                                          PyUnicode_GET_SIZE(unicode));
1872 }
1873
1874 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1875
1876 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1877                                            int size,
1878                                            const char *errors)
1879 {
1880     PyUnicodeObject *v;
1881     Py_UNICODE *p, *buf;
1882     const char *end;
1883     const char *bs;
1884
1885     /* Escaped strings will always be longer than the resulting
1886        Unicode string, so we start with size here and then reduce the
1887        length after conversion to the true value. */
1888     v = _PyUnicode_New(size);
1889     if (v == NULL)
1890         goto onError;
1891     if (size == 0)
1892         return (PyObject *)v;
1893     p = buf = PyUnicode_AS_UNICODE(v);
1894     end = s + size;
1895     while (s < end) {
1896         unsigned char c;
1897         Py_UNICODE x;
1898         int i;
1899
1900         /* Non-escape characters are interpreted as Unicode ordinals */
1901         if (*s != '\\') {
1902             *p++ = (unsigned char)*s++;
1903             continue;
1904         }
1905
1906         /* \u-escapes are only interpreted iff the number of leading
1907            backslashes if odd */
1908         bs = s;
1909         for (;s < end;) {
1910             if (*s != '\\')
1911                 break;
1912             *p++ = (unsigned char)*s++;
1913         }
1914         if (((s - bs) & 1) == 0 ||
1915             s >= end ||
1916             *s != 'u') {
1917             continue;
1918         }
1919         p--;
1920         s++;
1921
1922         /* \uXXXX with 4 hex digits */
1923         for (x = 0, i = 0; i < 4; i++) {
1924             c = (unsigned char)s[i];
1925             if (!isxdigit(c)) {
1926                 if (unicodeescape_decoding_error(&s, &x, errors,
1927                                                  "truncated \\uXXXX"))
1928                     goto onError;
1929                 i++;
1930                 break;
1931             }
1932             x = (x<<4) & ~0xF;
1933             if (c >= '0' && c <= '9')
1934                 x += c - '0';
1935             else if (c >= 'a' && c <= 'f')
1936                 x += 10 + c - 'a';
1937             else
1938                 x += 10 + c - 'A';
1939         }
1940         s += i;
1941         *p++ = x;
1942     }
1943     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1944         goto onError;
1945     return (PyObject *)v;
1946
1947  onError:
1948     Py_XDECREF(v);
1949     return NULL;
1950 }
1951
1952 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1953                                            int size)
1954 {
1955     PyObject *repr;
1956     char *p;
1957     char *q;
1958
1959     static const char *hexdigit = "0123456789abcdef";
1960
1961     repr = PyString_FromStringAndSize(NULL, 6 * size);
1962     if (repr == NULL)
1963         return NULL;
1964     if (size == 0)
1965         return repr;
1966
1967     p = q = PyString_AS_STRING(repr);
1968     while (size-- > 0) {
1969         Py_UNICODE ch = *s++;
1970         /* Map 16-bit characters to '\uxxxx' */
1971         if (ch >= 256) {
1972             *p++ = '\\';
1973             *p++ = 'u';
1974             *p++ = hexdigit[(ch >> 12) & 0xf];
1975             *p++ = hexdigit[(ch >> 8) & 0xf];
1976             *p++ = hexdigit[(ch >> 4) & 0xf];
1977             *p++ = hexdigit[ch & 15];
1978         }
1979         /* Copy everything else as-is */
1980         else
1981             *p++ = (char) ch;
1982     }
1983     *p = '\0';
1984     if (_PyString_Resize(&repr, p - q))
1985         goto onError;
1986
1987     return repr;
1988
1989  onError:
1990     Py_DECREF(repr);
1991     return NULL;
1992 }
1993
1994 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1995 {
1996     if (!PyUnicode_Check(unicode)) {
1997         PyErr_BadArgument();
1998         return NULL;
1999     }
2000     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2001                                             PyUnicode_GET_SIZE(unicode));
2002 }
2003
2004 /* --- Latin-1 Codec ------------------------------------------------------ */
2005
2006 PyObject *PyUnicode_DecodeLatin1(const char *s,
2007                                  int size,
2008                                  const char *errors)
2009 {
2010     PyUnicodeObject *v;
2011     Py_UNICODE *p;
2012
2013     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2014     if (size == 1 && *(unsigned char*)s < 256) {
2015         Py_UNICODE r = *(unsigned char*)s;
2016         return PyUnicode_FromUnicode(&r, 1);
2017     }
2018
2019     v = _PyUnicode_New(size);
2020     if (v == NULL)
2021         goto onError;
2022     if (size == 0)
2023         return (PyObject *)v;
2024     p = PyUnicode_AS_UNICODE(v);
2025     while (size-- > 0)
2026         *p++ = (unsigned char)*s++;
2027     return (PyObject *)v;
2028
2029  onError:
2030     Py_XDECREF(v);
2031     return NULL;
2032 }
2033
2034 static
2035 int latin1_encoding_error(const Py_UNICODE **source,
2036                           char **dest,
2037                           const char *errors,
2038                           const char *details)
2039 {
2040     if ((errors == NULL) ||
2041         (strcmp(errors,"strict") == 0)) {
2042         PyErr_Format(PyExc_UnicodeError,
2043                      "Latin-1 encoding error: %.400s",
2044                      details);
2045         return -1;
2046     }
2047     else if (strcmp(errors,"ignore") == 0) {
2048         return 0;
2049     }
2050     else if (strcmp(errors,"replace") == 0) {
2051         **dest = '?';
2052         (*dest)++;
2053         return 0;
2054     }
2055     else {
2056         PyErr_Format(PyExc_ValueError,
2057                      "Latin-1 encoding error; "
2058                      "unknown error handling code: %.400s",
2059                      errors);
2060         return -1;
2061     }
2062 }
2063
2064 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2065                                  int size,
2066                                  const char *errors)
2067 {
2068     PyObject *repr;
2069     char *s, *start;
2070
2071     repr = PyString_FromStringAndSize(NULL, size);
2072     if (repr == NULL)
2073         return NULL;
2074     if (size == 0)
2075         return repr;
2076
2077     s = PyString_AS_STRING(repr);
2078     start = s;
2079     while (size-- > 0) {
2080         Py_UNICODE ch = *p++;
2081         if (ch >= 256) {
2082             if (latin1_encoding_error(&p, &s, errors,
2083                                       "ordinal not in range(256)"))
2084                 goto onError;
2085         }
2086         else
2087             *s++ = (char)ch;
2088     }
2089     /* Resize if error handling skipped some characters */
2090     if (s - start < PyString_GET_SIZE(repr))
2091         if (_PyString_Resize(&repr, s - start))
2092             goto onError;
2093     return repr;
2094
2095  onError:
2096     Py_DECREF(repr);
2097     return NULL;
2098 }
2099
2100 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2101 {
2102     if (!PyUnicode_Check(unicode)) {
2103         PyErr_BadArgument();
2104         return NULL;
2105     }
2106     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2107                                   PyUnicode_GET_SIZE(unicode),
2108                                   NULL);
2109 }
2110
2111 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2112
2113 static
2114 int ascii_decoding_error(const char **source,
2115                          Py_UNICODE **dest,
2116                          const char *errors,
2117                          const char *details)
2118 {
2119     if ((errors == NULL) ||
2120         (strcmp(errors,"strict") == 0)) {
2121         PyErr_Format(PyExc_UnicodeError,
2122                      "ASCII decoding error: %.400s",
2123                      details);
2124         return -1;
2125     }
2126     else if (strcmp(errors,"ignore") == 0) {
2127         return 0;
2128     }
2129     else if (strcmp(errors,"replace") == 0) {
2130         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2131         (*dest)++;
2132         return 0;
2133     }
2134     else {
2135         PyErr_Format(PyExc_ValueError,
2136                      "ASCII decoding error; "
2137                      "unknown error handling code: %.400s",
2138                      errors);
2139         return -1;
2140     }
2141 }
2142
2143 PyObject *PyUnicode_DecodeASCII(const char *s,
2144                                 int size,
2145                                 const char *errors)
2146 {
2147     PyUnicodeObject *v;
2148     Py_UNICODE *p;
2149
2150     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2151     if (size == 1 && *(unsigned char*)s < 128) {
2152         Py_UNICODE r = *(unsigned char*)s;
2153         return PyUnicode_FromUnicode(&r, 1);
2154     }
2155
2156     v = _PyUnicode_New(size);
2157     if (v == NULL)
2158         goto onError;
2159     if (size == 0)
2160         return (PyObject *)v;
2161     p = PyUnicode_AS_UNICODE(v);
2162     while (size-- > 0) {
2163         register unsigned char c;
2164
2165         c = (unsigned char)*s++;
2166         if (c < 128)
2167             *p++ = c;
2168         else if (ascii_decoding_error(&s, &p, errors,
2169                                       "ordinal not in range(128)"))
2170                 goto onError;
2171     }
2172     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2173         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2174             goto onError;
2175     return (PyObject *)v;
2176
2177  onError:
2178     Py_XDECREF(v);
2179     return NULL;
2180 }
2181
2182 static
2183 int ascii_encoding_error(const Py_UNICODE **source,
2184                          char **dest,
2185                          const char *errors,
2186                          const char *details)
2187 {
2188     if ((errors == NULL) ||
2189         (strcmp(errors,"strict") == 0)) {
2190         PyErr_Format(PyExc_UnicodeError,
2191                      "ASCII encoding error: %.400s",
2192                      details);
2193         return -1;
2194     }
2195     else if (strcmp(errors,"ignore") == 0) {
2196         return 0;
2197     }
2198     else if (strcmp(errors,"replace") == 0) {
2199         **dest = '?';
2200         (*dest)++;
2201         return 0;
2202     }
2203     else {
2204         PyErr_Format(PyExc_ValueError,
2205                      "ASCII encoding error; "
2206                      "unknown error handling code: %.400s",
2207                      errors);
2208         return -1;
2209     }
2210 }
2211
2212 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2213                                 int size,
2214                                 const char *errors)
2215 {
2216     PyObject *repr;
2217     char *s, *start;
2218
2219     repr = PyString_FromStringAndSize(NULL, size);
2220     if (repr == NULL)
2221         return NULL;
2222     if (size == 0)
2223         return repr;
2224
2225     s = PyString_AS_STRING(repr);
2226     start = s;
2227     while (size-- > 0) {
2228         Py_UNICODE ch = *p++;
2229         if (ch >= 128) {
2230             if (ascii_encoding_error(&p, &s, errors,
2231                                       "ordinal not in range(128)"))
2232                 goto onError;
2233         }
2234         else
2235             *s++ = (char)ch;
2236     }
2237     /* Resize if error handling skipped some characters */
2238     if (s - start < PyString_GET_SIZE(repr))
2239         if (_PyString_Resize(&repr, s - start))
2240             goto onError;
2241     return repr;
2242
2243  onError:
2244     Py_DECREF(repr);
2245     return NULL;
2246 }
2247
2248 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2249 {
2250     if (!PyUnicode_Check(unicode)) {
2251         PyErr_BadArgument();
2252         return NULL;
2253     }
2254     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2255                                  PyUnicode_GET_SIZE(unicode),
2256                                  NULL);
2257 }
2258
2259 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2260
2261 /* --- MBCS codecs for Windows -------------------------------------------- */
2262
2263 PyObject *PyUnicode_DecodeMBCS(const char *s,
2264                                 int size,
2265                                 const char *errors)
2266 {
2267     PyUnicodeObject *v;
2268     Py_UNICODE *p;
2269
2270     /* First get the size of the result */
2271     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2272     if (size > 0 && usize==0)
2273         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2274
2275     v = _PyUnicode_New(usize);
2276     if (v == NULL)
2277         return NULL;
2278     if (usize == 0)
2279         return (PyObject *)v;
2280     p = PyUnicode_AS_UNICODE(v);
2281     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2282         Py_DECREF(v);
2283         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2284     }
2285
2286     return (PyObject *)v;
2287 }
2288
2289 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2290                                 int size,
2291                                 const char *errors)
2292 {
2293     PyObject *repr;
2294     char *s;
2295     DWORD mbcssize;
2296
2297     /* If there are no characters, bail now! */
2298     if (size==0)
2299             return PyString_FromString("");
2300
2301     /* First get the size of the result */
2302     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2303     if (mbcssize==0)
2304         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2305
2306     repr = PyString_FromStringAndSize(NULL, mbcssize);
2307     if (repr == NULL)
2308         return NULL;
2309     if (mbcssize == 0)
2310         return repr;
2311
2312     /* Do the conversion */
2313     s = PyString_AS_STRING(repr);
2314     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2315         Py_DECREF(repr);
2316         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2317     }
2318     return repr;
2319 }
2320
2321 #endif /* MS_WIN32 */
2322
2323 /* --- Character Mapping Codec -------------------------------------------- */
2324
2325 static
2326 int charmap_decoding_error(const char **source,
2327                          Py_UNICODE **dest,
2328                          const char *errors,
2329                          const char *details)
2330 {
2331     if ((errors == NULL) ||
2332         (strcmp(errors,"strict") == 0)) {
2333         PyErr_Format(PyExc_UnicodeError,
2334                      "charmap decoding error: %.400s",
2335                      details);
2336         return -1;
2337     }
2338     else if (strcmp(errors,"ignore") == 0) {
2339         return 0;
2340     }
2341     else if (strcmp(errors,"replace") == 0) {
2342         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2343         (*dest)++;
2344         return 0;
2345     }
2346     else {
2347         PyErr_Format(PyExc_ValueError,
2348                      "charmap decoding error; "
2349                      "unknown error handling code: %.400s",
2350                      errors);
2351         return -1;
2352     }
2353 }
2354
2355 PyObject *PyUnicode_DecodeCharmap(const char *s,
2356                                   int size,
2357                                   PyObject *mapping,
2358                                   const char *errors)
2359 {
2360     PyUnicodeObject *v;
2361     Py_UNICODE *p;
2362     int extrachars = 0;
2363
2364     /* Default to Latin-1 */
2365     if (mapping == NULL)
2366         return PyUnicode_DecodeLatin1(s, size, errors);
2367
2368     v = _PyUnicode_New(size);
2369     if (v == NULL)
2370         goto onError;
2371     if (size == 0)
2372         return (PyObject *)v;
2373     p = PyUnicode_AS_UNICODE(v);
2374     while (size-- > 0) {
2375         unsigned char ch = *s++;
2376         PyObject *w, *x;
2377
2378         /* Get mapping (char ordinal -> integer, Unicode char or None) */
2379         w = PyInt_FromLong((long)ch);
2380         if (w == NULL)
2381             goto onError;
2382         x = PyObject_GetItem(mapping, w);
2383         Py_DECREF(w);
2384         if (x == NULL) {
2385             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2386                 /* No mapping found means: mapping is undefined. */
2387                 PyErr_Clear();
2388                 x = Py_None;
2389                 Py_INCREF(x);
2390             } else
2391                 goto onError;
2392         }
2393
2394         /* Apply mapping */
2395         if (PyInt_Check(x)) {
2396             long value = PyInt_AS_LONG(x);
2397             if (value < 0 || value > 65535) {
2398                 PyErr_SetString(PyExc_TypeError,
2399                                 "character mapping must be in range(65536)");
2400                 Py_DECREF(x);
2401                 goto onError;
2402             }
2403             *p++ = (Py_UNICODE)value;
2404         }
2405         else if (x == Py_None) {
2406             /* undefined mapping */
2407             if (charmap_decoding_error(&s, &p, errors,
2408                                        "character maps to <undefined>")) {
2409                 Py_DECREF(x);
2410                 goto onError;
2411             }
2412         }
2413         else if (PyUnicode_Check(x)) {
2414             int targetsize = PyUnicode_GET_SIZE(x);
2415
2416             if (targetsize == 1)
2417                 /* 1-1 mapping */
2418                 *p++ = *PyUnicode_AS_UNICODE(x);
2419
2420             else if (targetsize > 1) {
2421                 /* 1-n mapping */
2422                 if (targetsize > extrachars) {
2423                     /* resize first */
2424                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2425                     int needed = (targetsize - extrachars) + \
2426                                  (targetsize << 2);
2427                     extrachars += needed;
2428                     if (_PyUnicode_Resize(&v,
2429                                          PyUnicode_GET_SIZE(v) + needed)) {
2430                         Py_DECREF(x);
2431                         goto onError;
2432                     }
2433                     p = PyUnicode_AS_UNICODE(v) + oldpos;
2434                 }
2435                 Py_UNICODE_COPY(p,
2436                                 PyUnicode_AS_UNICODE(x),
2437                                 targetsize);
2438                 p += targetsize;
2439                 extrachars -= targetsize;
2440             }
2441             /* 1-0 mapping: skip the character */
2442         }
2443         else {
2444             /* wrong return value */
2445             PyErr_SetString(PyExc_TypeError,
2446                   "character mapping must return integer, None or unicode");
2447             Py_DECREF(x);
2448             goto onError;
2449         }
2450         Py_DECREF(x);
2451     }
2452     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2453         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2454             goto onError;
2455     return (PyObject *)v;
2456
2457  onError:
2458     Py_XDECREF(v);
2459     return NULL;
2460 }
2461
2462 static
2463 int charmap_encoding_error(const Py_UNICODE **source,
2464                            char **dest,
2465                            const char *errors,
2466                            const char *details)
2467 {
2468     if ((errors == NULL) ||
2469         (strcmp(errors,"strict") == 0)) {
2470         PyErr_Format(PyExc_UnicodeError,
2471                      "charmap encoding error: %.400s",
2472                      details);
2473         return -1;
2474     }
2475     else if (strcmp(errors,"ignore") == 0) {
2476         return 0;
2477     }
2478     else if (strcmp(errors,"replace") == 0) {
2479         **dest = '?';
2480         (*dest)++;
2481         return 0;
2482     }
2483     else {
2484         PyErr_Format(PyExc_ValueError,
2485                      "charmap encoding error; "
2486                      "unknown error handling code: %.400s",
2487                      errors);
2488         return -1;
2489     }
2490 }
2491
2492 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2493                                   int size,
2494                                   PyObject *mapping,
2495                                   const char *errors)
2496 {
2497     PyObject *v;
2498     char *s;
2499     int extrachars = 0;
2500
2501     /* Default to Latin-1 */
2502     if (mapping == NULL)
2503         return PyUnicode_EncodeLatin1(p, size, errors);
2504
2505     v = PyString_FromStringAndSize(NULL, size);
2506     if (v == NULL)
2507         return NULL;
2508     if (size == 0)
2509         return v;
2510     s = PyString_AS_STRING(v);
2511     while (size-- > 0) {
2512         Py_UNICODE ch = *p++;
2513         PyObject *w, *x;
2514
2515         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2516         w = PyInt_FromLong((long)ch);
2517         if (w == NULL)
2518             goto onError;
2519         x = PyObject_GetItem(mapping, w);
2520         Py_DECREF(w);
2521         if (x == NULL) {
2522             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2523                 /* No mapping found means: mapping is undefined. */
2524                 PyErr_Clear();
2525                 x = Py_None;
2526                 Py_INCREF(x);
2527             } else
2528                 goto onError;
2529         }
2530
2531         /* Apply mapping */
2532         if (PyInt_Check(x)) {
2533             long value = PyInt_AS_LONG(x);
2534             if (value < 0 || value > 255) {
2535                 PyErr_SetString(PyExc_TypeError,
2536                                 "character mapping must be in range(256)");
2537                 Py_DECREF(x);
2538                 goto onError;
2539             }
2540             *s++ = (char)value;
2541         }
2542         else if (x == Py_None) {
2543             /* undefined mapping */
2544             if (charmap_encoding_error(&p, &s, errors,
2545                                        "character maps to <undefined>")) {
2546                 Py_DECREF(x);
2547                 goto onError;
2548             }
2549         }
2550         else if (PyString_Check(x)) {
2551             int targetsize = PyString_GET_SIZE(x);
2552
2553             if (targetsize == 1)
2554                 /* 1-1 mapping */
2555                 *s++ = *PyString_AS_STRING(x);
2556
2557             else if (targetsize > 1) {
2558                 /* 1-n mapping */
2559                 if (targetsize > extrachars) {
2560                     /* resize first */
2561                     int oldpos = (int)(s - PyString_AS_STRING(v));
2562                     int needed = (targetsize - extrachars) + \
2563                                  (targetsize << 2);
2564                     extrachars += needed;
2565                     if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2566                         Py_DECREF(x);
2567                         goto onError;
2568                     }
2569                     s = PyString_AS_STRING(v) + oldpos;
2570                 }
2571                 memcpy(s, PyString_AS_STRING(x), targetsize);
2572                 s += targetsize;
2573                 extrachars -= targetsize;
2574             }
2575             /* 1-0 mapping: skip the character */
2576         }
2577         else {
2578             /* wrong return value */
2579             PyErr_SetString(PyExc_TypeError,
2580                   "character mapping must return integer, None or unicode");
2581             Py_DECREF(x);
2582             goto onError;
2583         }
2584         Py_DECREF(x);
2585     }
2586     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2587         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2588             goto onError;
2589     return v;
2590
2591  onError:
2592     Py_DECREF(v);
2593     return NULL;
2594 }
2595
2596 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2597                                     PyObject *mapping)
2598 {
2599     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2600         PyErr_BadArgument();
2601         return NULL;
2602     }
2603     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2604                                    PyUnicode_GET_SIZE(unicode),
2605                                    mapping,
2606                                    NULL);
2607 }
2608
2609 static
2610 int translate_error(const Py_UNICODE **source,
2611                     Py_UNICODE **dest,
2612                     const char *errors,
2613                     const char *details)
2614 {
2615     if ((errors == NULL) ||
2616         (strcmp(errors,"strict") == 0)) {
2617         PyErr_Format(PyExc_UnicodeError,
2618                      "translate error: %.400s",
2619                      details);
2620         return -1;
2621     }
2622     else if (strcmp(errors,"ignore") == 0) {
2623         return 0;
2624     }
2625     else if (strcmp(errors,"replace") == 0) {
2626         **dest = '?';
2627         (*dest)++;
2628         return 0;
2629     }
2630     else {
2631         PyErr_Format(PyExc_ValueError,
2632                      "translate error; "
2633                      "unknown error handling code: %.400s",
2634                      errors);
2635         return -1;
2636     }
2637 }
2638
2639 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2640                                      int size,
2641                                      PyObject *mapping,
2642                                      const char *errors)
2643 {
2644     PyUnicodeObject *v;
2645     Py_UNICODE *p;
2646
2647     if (mapping == NULL) {
2648         PyErr_BadArgument();
2649         return NULL;
2650     }
2651
2652     /* Output will never be longer than input */
2653     v = _PyUnicode_New(size);
2654     if (v == NULL)
2655         goto onError;
2656     if (size == 0)
2657         goto done;
2658     p = PyUnicode_AS_UNICODE(v);
2659     while (size-- > 0) {
2660         Py_UNICODE ch = *s++;
2661         PyObject *w, *x;
2662
2663         /* Get mapping */
2664         w = PyInt_FromLong(ch);
2665         if (w == NULL)
2666             goto onError;
2667         x = PyObject_GetItem(mapping, w);
2668         Py_DECREF(w);
2669         if (x == NULL) {
2670             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2671                 /* No mapping found: default to 1-1 mapping */
2672                 PyErr_Clear();
2673                 *p++ = ch;
2674                 continue;
2675             }
2676             goto onError;
2677         }
2678
2679         /* Apply mapping */
2680         if (PyInt_Check(x))
2681             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2682         else if (x == Py_None) {
2683             /* undefined mapping */
2684             if (translate_error(&s, &p, errors,
2685                                 "character maps to <undefined>")) {
2686                 Py_DECREF(x);
2687                 goto onError;
2688             }
2689         }
2690         else if (PyUnicode_Check(x)) {
2691             if (PyUnicode_GET_SIZE(x) != 1) {
2692                 /* 1-n mapping */
2693                 PyErr_SetString(PyExc_NotImplementedError,
2694                                 "1-n mappings are currently not implemented");
2695                 Py_DECREF(x);
2696                 goto onError;
2697             }
2698             *p++ = *PyUnicode_AS_UNICODE(x);
2699         }
2700         else {
2701             /* wrong return value */
2702             PyErr_SetString(PyExc_TypeError,
2703                   "translate mapping must return integer, None or unicode");
2704             Py_DECREF(x);
2705             goto onError;
2706         }
2707         Py_DECREF(x);
2708     }
2709     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2710         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2711             goto onError;
2712
2713  done:
2714     return (PyObject *)v;
2715
2716  onError:
2717     Py_XDECREF(v);
2718     return NULL;
2719 }
2720
2721 PyObject *PyUnicode_Translate(PyObject *str,
2722                               PyObject *mapping,
2723                               const char *errors)
2724 {
2725     PyObject *result;
2726
2727     str = PyUnicode_FromObject(str);
2728     if (str == NULL)
2729         goto onError;
2730     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2731                                         PyUnicode_GET_SIZE(str),
2732                                         mapping,
2733                                         errors);
2734     Py_DECREF(str);
2735     return result;
2736
2737  onError:
2738     Py_XDECREF(str);
2739     return NULL;
2740 }
2741
2742 /* --- Decimal Encoder ---------------------------------------------------- */
2743
2744 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2745                             int length,
2746                             char *output,
2747                             const char *errors)
2748 {
2749     Py_UNICODE *p, *end;
2750
2751     if (output == NULL) {
2752         PyErr_BadArgument();
2753         return -1;
2754     }
2755
2756     p = s;
2757     end = s + length;
2758     while (p < end) {
2759         register Py_UNICODE ch = *p++;
2760         int decimal;
2761
2762         if (Py_UNICODE_ISSPACE(ch)) {
2763             *output++ = ' ';
2764             continue;
2765         }
2766         decimal = Py_UNICODE_TODECIMAL(ch);
2767         if (decimal >= 0) {
2768             *output++ = '0' + decimal;
2769             continue;
2770         }
2771         if (0 < ch && ch < 256) {
2772             *output++ = (char)ch;
2773             continue;
2774         }
2775         /* All other characters are considered invalid */
2776         if (errors == NULL || strcmp(errors, "strict") == 0) {
2777             PyErr_SetString(PyExc_ValueError,
2778                             "invalid decimal Unicode string");
2779             goto onError;
2780         }
2781         else if (strcmp(errors, "ignore") == 0)
2782             continue;
2783         else if (strcmp(errors, "replace") == 0) {
2784             *output++ = '?';
2785             continue;
2786         }
2787     }
2788     /* 0-terminate the output string */
2789     *output++ = '\0';
2790     return 0;
2791
2792  onError:
2793     return -1;
2794 }
2795
2796 /* --- Helpers ------------------------------------------------------------ */
2797
2798 static
2799 int count(PyUnicodeObject *self,
2800           int start,
2801           int end,
2802           PyUnicodeObject *substring)
2803 {
2804     int count = 0;
2805
2806     if (start < 0)
2807         start += self->length;
2808     if (start < 0)
2809         start = 0;
2810     if (end > self->length)
2811         end = self->length;
2812     if (end < 0)
2813         end += self->length;
2814     if (end < 0)
2815         end = 0;
2816
2817     if (substring->length == 0)
2818         return (end - start + 1);
2819
2820     end -= substring->length;
2821
2822     while (start <= end)
2823         if (Py_UNICODE_MATCH(self, start, substring)) {
2824             count++;
2825             start += substring->length;
2826         } else
2827             start++;
2828
2829     return count;
2830 }
2831
2832 int PyUnicode_Count(PyObject *str,
2833                     PyObject *substr,
2834                     int start,
2835                     int end)
2836 {
2837     int result;
2838
2839     str = PyUnicode_FromObject(str);
2840     if (str == NULL)
2841         return -1;
2842     substr = PyUnicode_FromObject(substr);
2843     if (substr == NULL) {
2844         Py_DECREF(str);
2845         return -1;
2846     }
2847
2848     result = count((PyUnicodeObject *)str,
2849                    start, end,
2850                    (PyUnicodeObject *)substr);
2851
2852     Py_DECREF(str);
2853     Py_DECREF(substr);
2854     return result;
2855 }
2856
2857 static
2858 int findstring(PyUnicodeObject *self,
2859                PyUnicodeObject *substring,
2860                int start,
2861                int end,
2862                int direction)
2863 {
2864     if (start < 0)
2865         start += self->length;
2866     if (start < 0)
2867         start = 0;
2868
2869     if (substring->length == 0)
2870         return start;
2871
2872     if (end > self->length)
2873         end = self->length;
2874     if (end < 0)
2875         end += self->length;
2876     if (end < 0)
2877         end = 0;
2878
2879     end -= substring->length;
2880
2881     if (direction < 0) {
2882         for (; end >= start; end--)
2883             if (Py_UNICODE_MATCH(self, end, substring))
2884                 return end;
2885     } else {
2886         for (; start <= end; start++)
2887             if (Py_UNICODE_MATCH(self, start, substring))
2888                 return start;
2889     }
2890
2891     return -1;
2892 }
2893
2894 int PyUnicode_Find(PyObject *str,
2895                    PyObject *substr,
2896                    int start,
2897                    int end,
2898                    int direction)
2899 {
2900     int result;
2901
2902     str = PyUnicode_FromObject(str);
2903     if (str == NULL)
2904         return -1;
2905     substr = PyUnicode_FromObject(substr);
2906     if (substr == NULL) {
2907         Py_DECREF(substr);
2908         return -1;
2909     }
2910
2911     result = findstring((PyUnicodeObject *)str,
2912                         (PyUnicodeObject *)substr,
2913                         start, end, direction);
2914     Py_DECREF(str);
2915     Py_DECREF(substr);
2916     return result;
2917 }
2918
2919 static
2920 int tailmatch(PyUnicodeObject *self,
2921               PyUnicodeObject *substring,
2922               int start,
2923               int end,
2924               int direction)
2925 {
2926     if (start < 0)
2927         start += self->length;
2928     if (start < 0)
2929         start = 0;
2930
2931     if (substring->length == 0)
2932         return 1;
2933
2934     if (end > self->length)
2935         end = self->length;
2936     if (end < 0)
2937         end += self->length;
2938     if (end < 0)
2939         end = 0;
2940
2941     end -= substring->length;
2942     if (end < start)
2943         return 0;
2944
2945     if (direction > 0) {
2946         if (Py_UNICODE_MATCH(self, end, substring))
2947             return 1;
2948     } else {
2949         if (Py_UNICODE_MATCH(self, start, substring))
2950             return 1;
2951     }
2952
2953     return 0;
2954 }
2955
2956 int PyUnicode_Tailmatch(PyObject *str,
2957                         PyObject *substr,
2958                         int start,
2959                         int end,
2960                         int direction)
2961 {
2962     int result;
2963
2964     str = PyUnicode_FromObject(str);
2965     if (str == NULL)
2966         return -1;
2967     substr = PyUnicode_FromObject(substr);
2968     if (substr == NULL) {
2969         Py_DECREF(substr);
2970         return -1;
2971     }
2972
2973     result = tailmatch((PyUnicodeObject *)str,
2974                        (PyUnicodeObject *)substr,
2975                        start, end, direction);
2976     Py_DECREF(str);
2977     Py_DECREF(substr);
2978     return result;
2979 }
2980
2981 static
2982 const Py_UNICODE *findchar(const Py_UNICODE *s,
2983                      int size,
2984                      Py_UNICODE ch)
2985 {
2986     /* like wcschr, but doesn't stop at NULL characters */
2987
2988     while (size-- > 0) {
2989         if (*s == ch)
2990             return s;
2991         s++;
2992     }
2993
2994     return NULL;
2995 }
2996
2997 /* Apply fixfct filter to the Unicode object self and return a
2998    reference to the modified object */
2999
3000 static
3001 PyObject *fixup(PyUnicodeObject *self,
3002                 int (*fixfct)(PyUnicodeObject *s))
3003 {
3004
3005     PyUnicodeObject *u;
3006
3007     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3008     if (u == NULL)
3009         return NULL;
3010
3011     Py_UNICODE_COPY(u->str, self->str, self->length);
3012
3013     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3014         /* fixfct should return TRUE if it modified the buffer. If
3015            FALSE, return a reference to the original buffer instead
3016            (to save space, not time) */
3017         Py_INCREF(self);
3018         Py_DECREF(u);
3019         return (PyObject*) self;
3020     }
3021     return (PyObject*) u;
3022 }
3023
3024 static
3025 int fixupper(PyUnicodeObject *self)
3026 {
3027     int len = self->length;
3028     Py_UNICODE *s = self->str;
3029     int status = 0;
3030
3031     while (len-- > 0) {
3032         register Py_UNICODE ch;
3033
3034         ch = Py_UNICODE_TOUPPER(*s);
3035         if (ch != *s) {
3036             status = 1;
3037             *s = ch;
3038         }
3039         s++;
3040     }
3041
3042     return status;
3043 }
3044
3045 static
3046 int fixlower(PyUnicodeObject *self)
3047 {
3048     int len = self->length;
3049     Py_UNICODE *s = self->str;
3050     int status = 0;
3051
3052     while (len-- > 0) {
3053         register Py_UNICODE ch;
3054
3055         ch = Py_UNICODE_TOLOWER(*s);
3056         if (ch != *s) {
3057             status = 1;
3058             *s = ch;
3059         }
3060         s++;
3061     }
3062
3063     return status;
3064 }
3065
3066 static
3067 int fixswapcase(PyUnicodeObject *self)
3068 {
3069     int len = self->length;
3070     Py_UNICODE *s = self->str;
3071     int status = 0;
3072
3073     while (len-- > 0) {
3074         if (Py_UNICODE_ISUPPER(*s)) {
3075             *s = Py_UNICODE_TOLOWER(*s);
3076             status = 1;
3077         } else if (Py_UNICODE_ISLOWER(*s)) {
3078             *s = Py_UNICODE_TOUPPER(*s);
3079             status = 1;
3080         }
3081         s++;
3082     }
3083
3084     return status;
3085 }
3086
3087 static
3088 int fixcapitalize(PyUnicodeObject *self)
3089 {
3090     int len = self->length;
3091     Py_UNICODE *s = self->str;
3092     int status = 0;
3093
3094     if (len == 0)
3095         return 0;
3096     if (Py_UNICODE_ISLOWER(*s)) {
3097         *s = Py_UNICODE_TOUPPER(*s);
3098         status = 1;
3099     }
3100     s++;
3101     while (--len > 0) {
3102         if (Py_UNICODE_ISUPPER(*s)) {
3103             *s = Py_UNICODE_TOLOWER(*s);
3104             status = 1;
3105         }
3106         s++;
3107     }
3108     return status;
3109 }
3110
3111 static
3112 int fixtitle(PyUnicodeObject *self)
3113 {
3114     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3115     register Py_UNICODE *e;
3116     int previous_is_cased;
3117
3118     /* Shortcut for single character strings */
3119     if (PyUnicode_GET_SIZE(self) == 1) {
3120         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3121         if (*p != ch) {
3122             *p = ch;
3123             return 1;
3124         }
3125         else
3126             return 0;
3127     }
3128
3129     e = p + PyUnicode_GET_SIZE(self);
3130     previous_is_cased = 0;
3131     for (; p < e; p++) {
3132         register const Py_UNICODE ch = *p;
3133
3134         if (previous_is_cased)
3135             *p = Py_UNICODE_TOLOWER(ch);
3136         else
3137             *p = Py_UNICODE_TOTITLE(ch);
3138
3139         if (Py_UNICODE_ISLOWER(ch) ||
3140             Py_UNICODE_ISUPPER(ch) ||
3141             Py_UNICODE_ISTITLE(ch))
3142             previous_is_cased = 1;
3143         else
3144             previous_is_cased = 0;
3145     }
3146     return 1;
3147 }
3148
3149 PyObject *PyUnicode_Join(PyObject *separator,
3150                          PyObject *seq)
3151 {
3152     Py_UNICODE *sep;
3153     int seplen;
3154     PyUnicodeObject *res = NULL;
3155     int reslen = 0;
3156     Py_UNICODE *p;
3157     int sz = 100;
3158     int i;
3159     PyObject *it;
3160
3161     it = PyObject_GetIter(seq);
3162     if (it == NULL)
3163         return NULL;
3164
3165     if (separator == NULL) {
3166         Py_UNICODE blank = ' ';
3167         sep = &blank;
3168         seplen = 1;
3169     }
3170     else {
3171         separator = PyUnicode_FromObject(separator);
3172         if (separator == NULL)
3173             goto onError;
3174         sep = PyUnicode_AS_UNICODE(separator);
3175         seplen = PyUnicode_GET_SIZE(separator);
3176     }
3177
3178     res = _PyUnicode_New(sz);
3179     if (res == NULL)
3180         goto onError;
3181     p = PyUnicode_AS_UNICODE(res);
3182     reslen = 0;
3183
3184     for (i = 0; ; ++i) {
3185         int itemlen;
3186         PyObject *item = PyIter_Next(it);
3187         if (item == NULL) {
3188             if (PyErr_Occurred())
3189                 goto onError;
3190             break;
3191         }
3192         if (!PyUnicode_Check(item)) {
3193             PyObject *v;
3194             if (!PyString_Check(item)) {
3195                 PyErr_Format(PyExc_TypeError,
3196                              "sequence item %i: expected string or Unicode,"
3197                              " %.80s found",
3198                              i, item->ob_type->tp_name);
3199                 Py_DECREF(item);
3200                 goto onError;
3201             }
3202             v = PyUnicode_FromObject(item);
3203             Py_DECREF(item);
3204             item = v;
3205             if (item == NULL)
3206                 goto onError;
3207         }
3208         itemlen = PyUnicode_GET_SIZE(item);
3209         while (reslen + itemlen + seplen >= sz) {
3210             if (_PyUnicode_Resize(&res, sz*2)) {
3211                 Py_DECREF(item);
3212                 goto onError;
3213             }
3214             sz *= 2;
3215             p = PyUnicode_AS_UNICODE(res) + reslen;
3216         }
3217         if (i > 0) {
3218             Py_UNICODE_COPY(p, sep, seplen);
3219             p += seplen;
3220             reslen += seplen;
3221         }
3222         Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3223         p += itemlen;
3224         reslen += itemlen;
3225         Py_DECREF(item);
3226     }
3227     if (_PyUnicode_Resize(&res, reslen))
3228         goto onError;
3229
3230     Py_XDECREF(separator);
3231     Py_DECREF(it);
3232     return (PyObject *)res;
3233
3234  onError:
3235     Py_XDECREF(separator);
3236     Py_XDECREF(res);
3237     Py_DECREF(it);
3238     return NULL;
3239 }
3240
3241 static
3242 PyUnicodeObject *pad(PyUnicodeObject *self,
3243                      int left,
3244                      int right,
3245                      Py_UNICODE fill)
3246 {
3247     PyUnicodeObject *u;
3248
3249     if (left < 0)
3250         left = 0;
3251     if (right < 0)
3252         right = 0;
3253
3254     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3255         Py_INCREF(self);
3256         return self;
3257     }
3258
3259     u = _PyUnicode_New(left + self->length + right);
3260     if (u) {
3261         if (left)
3262             Py_UNICODE_FILL(u->str, fill, left);
3263         Py_UNICODE_COPY(u->str + left, self->str, self->length);
3264         if (right)
3265             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3266     }
3267
3268     return u;
3269 }
3270
/* Append the slice data[left:right] to list as a new Unicode object.

   The expansion site must provide a PyObject *list, a scratch
   variable PyObject *str and an onError label; the macro jumps to
   that label when the slice cannot be created or appended.

   The body is wrapped in do { ... } while (0) so that the macro acts
   as a single statement (use it with a trailing semicolon), and the
   arguments are parenthesised so that arbitrary expressions can be
   passed safely. */

#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3281
3282 static
3283 PyObject *split_whitespace(PyUnicodeObject *self,
3284                            PyObject *list,
3285                            int maxcount)
3286 {
3287     register int i;
3288     register int j;
3289     int len = self->length;
3290     PyObject *str;
3291
3292     for (i = j = 0; i < len; ) {
3293         /* find a token */
3294         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3295             i++;
3296         j = i;
3297         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3298             i++;
3299         if (j < i) {
3300             if (maxcount-- <= 0)
3301                 break;
3302             SPLIT_APPEND(self->str, j, i);
3303             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3304                 i++;
3305             j = i;
3306         }
3307     }
3308     if (j < len) {
3309         SPLIT_APPEND(self->str, j, len);
3310     }
3311     return list;
3312
3313  onError:
3314     Py_DECREF(list);
3315     return NULL;
3316 }
3317
3318 PyObject *PyUnicode_Splitlines(PyObject *string,
3319                                int keepends)
3320 {
3321     register int i;
3322     register int j;
3323     int len;
3324     PyObject *list;
3325     PyObject *str;
3326     Py_UNICODE *data;
3327
3328     string = PyUnicode_FromObject(string);
3329     if (string == NULL)
3330         return NULL;
3331     data = PyUnicode_AS_UNICODE(string);
3332     len = PyUnicode_GET_SIZE(string);
3333
3334     list = PyList_New(0);
3335     if (!list)
3336         goto onError;
3337
3338     for (i = j = 0; i < len; ) {
3339         int eol;
3340
3341         /* Find a line and append it */
3342         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3343             i++;
3344
3345         /* Skip the line break reading CRLF as one line break */
3346         eol = i;
3347         if (i < len) {
3348             if (data[i] == '\r' && i + 1 < len &&
3349                 data[i+1] == '\n')
3350                 i += 2;
3351             else
3352                 i++;
3353             if (keepends)
3354                 eol = i;
3355         }
3356         SPLIT_APPEND(data, j, eol);
3357         j = i;
3358     }
3359     if (j < len) {
3360         SPLIT_APPEND(data, j, len);
3361     }
3362
3363     Py_DECREF(string);
3364     return list;
3365
3366  onError:
3367     Py_DECREF(list);
3368     Py_DECREF(string);
3369     return NULL;
3370 }
3371
3372 static
3373 PyObject *split_char(PyUnicodeObject *self,
3374                      PyObject *list,
3375                      Py_UNICODE ch,
3376                      int maxcount)
3377 {
3378     register int i;
3379     register int j;
3380     int len = self->length;
3381     PyObject *str;
3382
3383     for (i = j = 0; i < len; ) {
3384         if (self->str[i] == ch) {
3385             if (maxcount-- <= 0)
3386                 break;
3387             SPLIT_APPEND(self->str, j, i);
3388             i = j = i + 1;
3389         } else
3390             i++;
3391     }
3392     if (j <= len) {
3393         SPLIT_APPEND(self->str, j, len);
3394     }
3395     return list;
3396
3397  onError:
3398     Py_DECREF(list);
3399     return NULL;
3400 }
3401
3402 static
3403 PyObject *split_substring(PyUnicodeObject *self,
3404                           PyObject *list,
3405                           PyUnicodeObject *substring,
3406                           int maxcount)
3407 {
3408     register int i;
3409     register int j;
3410     int len = self->length;
3411     int sublen = substring->length;
3412     PyObject *str;
3413
3414     for (i = j = 0; i <= len - sublen; ) {
3415         if (Py_UNICODE_MATCH(self, i, substring)) {
3416             if (maxcount-- <= 0)
3417                 break;
3418             SPLIT_APPEND(self->str, j, i);
3419             i = j = i + sublen;
3420         } else
3421             i++;
3422     }
3423     if (j <= len) {
3424         SPLIT_APPEND(self->str, j, len);
3425     }
3426     return list;
3427
3428  onError:
3429     Py_DECREF(list);
3430     return NULL;
3431 }
3432
3433 #undef SPLIT_APPEND
3434
3435 static
3436 PyObject *split(PyUnicodeObject *self,
3437                 PyUnicodeObject *substring,
3438                 int maxcount)
3439 {
3440     PyObject *list;
3441
3442     if (maxcount < 0)
3443         maxcount = INT_MAX;
3444
3445     list = PyList_New(0);
3446     if (!list)
3447         return NULL;
3448
3449     if (substring == NULL)
3450         return split_whitespace(self,list,maxcount);
3451
3452     else if (substring->length == 1)
3453         return split_char(self,list,substring->str[0],maxcount);
3454
3455     else if (substring->length == 0) {
3456         Py_DECREF(list);
3457         PyErr_SetString(PyExc_ValueError, "empty separator");
3458         return NULL;
3459     }
3460     else
3461         return split_substring(self,list,substring,maxcount);
3462 }
3463
3464 static
3465 PyObject *strip(PyUnicodeObject *self,
3466                 int left,
3467                 int right)
3468 {
3469     Py_UNICODE *p = self->str;
3470     int start = 0;
3471     int end = self->length;
3472
3473     if (left)
3474         while (start < end && Py_UNICODE_ISSPACE(p[start]))
3475             start++;
3476
3477     if (right)
3478         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3479             end--;
3480
3481     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3482         /* couldn't strip anything off, return original string */
3483         Py_INCREF(self);
3484         return (PyObject*) self;
3485     }
3486
3487     return (PyObject*) PyUnicode_FromUnicode(
3488         self->str + start,
3489         end - start
3490         );
3491 }
3492
3493 static
3494 PyObject *replace(PyUnicodeObject *self,
3495                   PyUnicodeObject *str1,
3496                   PyUnicodeObject *str2,
3497                   int maxcount)
3498 {
3499     PyUnicodeObject *u;
3500
3501     if (maxcount < 0)
3502         maxcount = INT_MAX;
3503
3504     if (str1->length == 1 && str2->length == 1) {
3505         int i;
3506
3507         /* replace characters */
3508         if (!findchar(self->str, self->length, str1->str[0]) &&
3509             PyUnicode_CheckExact(self)) {
3510             /* nothing to replace, return original string */
3511             Py_INCREF(self);
3512             u = self;
3513         } else {
3514             Py_UNICODE u1 = str1->str[0];
3515             Py_UNICODE u2 = str2->str[0];
3516
3517             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3518                 NULL,
3519                 self->length
3520                 );
3521             if (u != NULL) {
3522                 Py_UNICODE_COPY(u->str, self->str,
3523                                 self->length);
3524                 for (i = 0; i < u->length; i++)
3525                     if (u->str[i] == u1) {
3526                         if (--maxcount < 0)
3527                             break;
3528                         u->str[i] = u2;
3529                     }
3530         }
3531         }
3532
3533     } else {
3534         int n, i;
3535         Py_UNICODE *p;
3536
3537         /* replace strings */
3538         n = count(self, 0, self->length, str1);
3539         if (n > maxcount)
3540             n = maxcount;
3541         if (n == 0 && PyUnicode_CheckExact(self)) {
3542             /* nothing to replace, return original string */
3543             Py_INCREF(self);
3544             u = self;
3545         } else {
3546             u = _PyUnicode_New(
3547                 self->length + n * (str2->length - str1->length));
3548             if (u) {
3549                 i = 0;
3550                 p = u->str;
3551                 while (i <= self->length - str1->length)
3552                     if (Py_UNICODE_MATCH(self, i, str1)) {
3553                         /* replace string segment */
3554                         Py_UNICODE_COPY(p, str2->str, str2->length);
3555                         p += str2->length;
3556                         i += str1->length;
3557                         if (--n <= 0) {
3558                             /* copy remaining part */
3559                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3560                             break;
3561                         }
3562                     } else
3563                         *p++ = self->str[i++];
3564             }
3565         }
3566     }
3567
3568     return (PyObject *) u;
3569 }
3570
3571 /* --- Unicode Object Methods --------------------------------------------- */
3572
3573 static char title__doc__[] =
3574 "S.title() -> unicode\n\
3575 \n\
3576 Return a titlecased version of S, i.e. words start with title case\n\
3577 characters, all remaining cased characters have lower case.";
3578
3579 static PyObject*
3580 unicode_title(PyUnicodeObject *self)
3581 {
3582     return fixup(self, fixtitle);
3583 }
3584
3585 static char capitalize__doc__[] =
3586 "S.capitalize() -> unicode\n\
3587 \n\
3588 Return a capitalized version of S, i.e. make the first character\n\
3589 have upper case.";
3590
3591 static PyObject*
3592 unicode_capitalize(PyUnicodeObject *self)
3593 {
3594     return fixup(self, fixcapitalize);
3595 }
3596
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split self at whitespace,
   capitalize every word and join the words with single blanks. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word,
                                 fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3634
3635 static char center__doc__[] =
3636 "S.center(width) -> unicode\n\
3637 \n\
3638 Return S centered in a Unicode string of length width. Padding is done\n\
3639 using spaces.";
3640
3641 static PyObject *
3642 unicode_center(PyUnicodeObject *self, PyObject *args)
3643 {
3644     int marg, left;
3645     int width;
3646
3647     if (!PyArg_ParseTuple(args, "i:center", &width))
3648         return NULL;
3649
3650     if (self->length >= width && PyUnicode_CheckExact(self)) {
3651         Py_INCREF(self);
3652         return (PyObject*) self;
3653     }
3654
3655     marg = width - self->length;
3656     left = marg / 2 + (marg & width & 1);
3657
3658     return (PyObject*) pad(self, left, marg - left, ' ');
3659 }
3660
3661 #if 0
3662
3663 /* This code should go into some future Unicode collation support
3664    module. The basic comparison should compare ordinals on a naive
3665    basis (this is what Java does and thus JPython too). */
3666
3667 /* speedy UTF-16 code point order comparison */
3668 /* gleaned from: */
3669 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3670
3671 static short utf16Fixup[32] =
3672 {
3673     0, 0, 0, 0, 0, 0, 0, 0,
3674     0, 0, 0, 0, 0, 0, 0, 0,
3675     0, 0, 0, 0, 0, 0, 0, 0,
3676     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3677 };
3678
3679 static int
3680 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3681 {
3682     int len1, len2;
3683
3684     Py_UNICODE *s1 = str1->str;
3685     Py_UNICODE *s2 = str2->str;
3686
3687     len1 = str1->length;
3688     len2 = str2->length;
3689
3690     while (len1 > 0 && len2 > 0) {
3691         Py_UNICODE c1, c2;
3692
3693         c1 = *s1++;
3694         c2 = *s2++;
3695
3696         if (c1 > (1<<11) * 26)
3697             c1 += utf16Fixup[c1>>11];
3698         if (c2 > (1<<11) * 26)
3699             c2 += utf16Fixup[c2>>11];
3700         /* now c1 and c2 are in UTF-32-compatible order */
3701
3702         if (c1 != c2)
3703             return (c1 < c2) ? -1 : 1;
3704
3705         len1--; len2--;
3706     }
3707
3708     return (len1 < len2) ? -1 : (len1 != len2);
3709 }
3710
3711 #else
3712
3713 static int
3714 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3715 {
3716     register int len1, len2;
3717
3718     Py_UNICODE *s1 = str1->str;
3719     Py_UNICODE *s2 = str2->str;
3720
3721     len1 = str1->length;
3722     len2 = str2->length;
3723
3724     while (len1 > 0 && len2 > 0) {
3725         Py_UNICODE c1, c2;
3726
3727         c1 = *s1++;
3728         c2 = *s2++;
3729
3730         if (c1 != c2)
3731             return (c1 < c2) ? -1 : 1;
3732
3733         len1--; len2--;
3734     }
3735
3736     return (len1 < len2) ? -1 : (len1 != len2);
3737 }
3738
3739 #endif
3740
3741 int PyUnicode_Compare(PyObject *left,
3742                       PyObject *right)
3743 {
3744     PyUnicodeObject *u = NULL, *v = NULL;
3745     int result;
3746
3747     /* Coerce the two arguments */
3748     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3749     if (u == NULL)
3750         goto onError;
3751     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3752     if (v == NULL)
3753         goto onError;
3754
3755     /* Shortcut for empty or interned objects */
3756     if (v == u) {
3757         Py_DECREF(u);
3758         Py_DECREF(v);
3759         return 0;
3760     }
3761
3762     result = unicode_compare(u, v);
3763
3764     Py_DECREF(u);
3765     Py_DECREF(v);
3766     return result;
3767
3768 onError:
3769     Py_XDECREF(u);
3770     Py_XDECREF(v);
3771     return -1;
3772 }
3773
3774 int PyUnicode_Contains(PyObject *container,
3775                        PyObject *element)
3776 {
3777     PyUnicodeObject *u = NULL, *v = NULL;
3778     int result;
3779     register const Py_UNICODE *p, *e;
3780     register Py_UNICODE ch;
3781
3782     /* Coerce the two arguments */
3783     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3784     if (v == NULL) {
3785         PyErr_SetString(PyExc_TypeError,
3786             "'in <string>' requires character as left operand");
3787         goto onError;
3788     }
3789     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3790     if (u == NULL) {
3791         Py_DECREF(v);
3792         goto onError;
3793     }
3794
3795     /* Check v in u */
3796     if (PyUnicode_GET_SIZE(v) != 1) {
3797         PyErr_SetString(PyExc_TypeError,
3798             "'in <string>' requires character as left operand");
3799         goto onError;
3800     }
3801     ch = *PyUnicode_AS_UNICODE(v);
3802     p = PyUnicode_AS_UNICODE(u);
3803     e = p + PyUnicode_GET_SIZE(u);
3804     result = 0;
3805     while (p < e) {
3806         if (*p++ == ch) {
3807             result = 1;
3808             break;
3809         }
3810     }
3811
3812     Py_DECREF(u);
3813     Py_DECREF(v);
3814     return result;
3815
3816 onError:
3817     Py_XDECREF(u);
3818     Py_XDECREF(v);
3819     return -1;
3820 }
3821
3822 /* Concat to string or Unicode object giving a new Unicode object. */
3823
3824 PyObject *PyUnicode_Concat(PyObject *left,
3825                            PyObject *right)
3826 {
3827     PyUnicodeObject *u = NULL, *v = NULL, *w;
3828
3829     /* Coerce the two arguments */
3830     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3831     if (u == NULL)
3832         goto onError;
3833     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3834     if (v == NULL)
3835         goto onError;
3836
3837     /* Shortcuts */
3838     if (v == unicode_empty) {
3839         Py_DECREF(v);
3840         return (PyObject *)u;
3841     }
3842     if (u == unicode_empty) {
3843         Py_DECREF(u);
3844         return (PyObject *)v;
3845     }
3846
3847     /* Concat the two Unicode strings */
3848     w = _PyUnicode_New(u->length + v->length);
3849     if (w == NULL)
3850         goto onError;
3851     Py_UNICODE_COPY(w->str, u->str, u->length);
3852     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3853
3854     Py_DECREF(u);
3855     Py_DECREF(v);
3856     return (PyObject *)w;
3857
3858 onError:
3859     Py_XDECREF(u);
3860     Py_XDECREF(v);
3861     return NULL;
3862 }
3863
3864 static char count__doc__[] =
3865 "S.count(sub[, start[, end]]) -> int\n\
3866 \n\
3867 Return the number of occurrences of substring sub in Unicode string\n\
3868 S[start:end].  Optional arguments start and end are\n\
3869 interpreted as in slice notation.";
3870
3871 static PyObject *
3872 unicode_count(PyUnicodeObject *self, PyObject *args)
3873 {
3874     PyUnicodeObject *substring;
3875     int start = 0;
3876     int end = INT_MAX;
3877     PyObject *result;
3878
3879     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3880                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3881         return NULL;
3882
3883     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3884                                                 (PyObject *)substring);
3885     if (substring == NULL)
3886         return NULL;
3887
3888     if (start < 0)
3889         start += self->length;
3890     if (start < 0)
3891         start = 0;
3892     if (end > self->length)
3893         end = self->length;
3894     if (end < 0)
3895         end += self->length;
3896     if (end < 0)
3897         end = 0;
3898
3899     result = PyInt_FromLong((long) count(self, start, end, substring));
3900
3901     Py_DECREF(substring);
3902     return result;
3903 }
3904
3905 static char encode__doc__[] =
3906 "S.encode([encoding[,errors]]) -> string\n\
3907 \n\
3908 Return an encoded string version of S. Default encoding is the current\n\
3909 default string encoding. errors may be given to set a different error\n\
3910 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3911 a ValueError. Other possible values are 'ignore' and 'replace'.";
3912
3913 static PyObject *
3914 unicode_encode(PyUnicodeObject *self, PyObject *args)
3915 {
3916     char *encoding = NULL;
3917     char *errors = NULL;
3918     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3919         return NULL;
3920     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3921 }
3922
3923 static char expandtabs__doc__[] =
3924 "S.expandtabs([tabsize]) -> unicode\n\
3925 \n\
3926 Return a copy of S where all tab characters are expanded using spaces.\n\
3927 If tabsize is not given, a tab size of 8 characters is assumed.";
3928
3929 static PyObject*
3930 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3931 {
3932     Py_UNICODE *e;
3933     Py_UNICODE *p;
3934     Py_UNICODE *q;
3935     int i, j;
3936     PyUnicodeObject *u;
3937     int tabsize = 8;
3938
3939     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3940         return NULL;
3941
3942     /* First pass: determine size of output string */
3943     i = j = 0;
3944     e = self->str + self->length;
3945     for (p = self->str; p < e; p++)
3946         if (*p == '\t') {
3947             if (tabsize > 0)
3948                 j += tabsize - (j % tabsize);
3949         }
3950         else {
3951             j++;
3952             if (*p == '\n' || *p == '\r') {
3953                 i += j;
3954                 j = 0;
3955             }
3956         }
3957
3958     /* Second pass: create output string and fill it */
3959     u = _PyUnicode_New(i + j);
3960     if (!u)
3961         return NULL;
3962
3963     j = 0;
3964     q = u->str;
3965
3966     for (p = self->str; p < e; p++)
3967         if (*p == '\t') {
3968             if (tabsize > 0) {
3969                 i = tabsize - (j % tabsize);
3970                 j += i;
3971                 while (i--)
3972                     *q++ = ' ';
3973             }
3974         }
3975         else {
3976             j++;
3977             *q++ = *p;
3978             if (*p == '\n' || *p == '\r')
3979                 j = 0;
3980         }
3981
3982     return (PyObject*) u;
3983 }
3984
3985 static char find__doc__[] =
3986 "S.find(sub [,start [,end]]) -> int\n\
3987 \n\
3988 Return the lowest index in S where substring sub is found,\n\
3989 such that sub is contained within s[start,end].  Optional\n\
3990 arguments start and end are interpreted as in slice notation.\n\
3991 \n\
3992 Return -1 on failure.";
3993
3994 static PyObject *
3995 unicode_find(PyUnicodeObject *self, PyObject *args)
3996 {
3997     PyUnicodeObject *substring;
3998     int start = 0;
3999     int end = INT_MAX;
4000     PyObject *result;
4001
4002     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4003                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4004         return NULL;
4005     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4006                                                 (PyObject *)substring);
4007     if (substring == NULL)
4008         return NULL;
4009
4010     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4011
4012     Py_DECREF(substring);
4013     return result;
4014 }
4015
4016 static PyObject *
4017 unicode_getitem(PyUnicodeObject *self, int index)
4018 {
4019     if (index < 0 || index >= self->length) {
4020         PyErr_SetString(PyExc_IndexError, "string index out of range");
4021         return NULL;
4022     }
4023
4024     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4025 }
4026
4027 static long
4028 unicode_hash(PyUnicodeObject *self)
4029 {
4030     /* Since Unicode objects compare equal to their ASCII string
4031        counterparts, they should use the individual character values
4032        as basis for their hash value.  This is needed to assure that
4033        strings and Unicode objects behave in the same way as
4034        dictionary keys. */
4035
4036     register int len;
4037     register Py_UNICODE *p;
4038     register long x;
4039
4040     if (self->hash != -1)
4041         return self->hash;
4042     len = PyUnicode_GET_SIZE(self);
4043     p = PyUnicode_AS_UNICODE(self);
4044     x = *p << 7;
4045     while (--len >= 0)
4046         x = (1000003*x) ^ *p++;
4047     x ^= PyUnicode_GET_SIZE(self);
4048     if (x == -1)
4049         x = -2;
4050     self->hash = x;
4051     return x;
4052 }
4053
4054 static char index__doc__[] =
4055 "S.index(sub [,start [,end]]) -> int\n\
4056 \n\
4057 Like S.find() but raise ValueError when the substring is not found.";
4058
4059 static PyObject *
4060 unicode_index(PyUnicodeObject *self, PyObject *args)
4061 {
4062     int result;
4063     PyUnicodeObject *substring;
4064     int start = 0;
4065     int end = INT_MAX;
4066
4067     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4068                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4069         return NULL;
4070
4071     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4072                                                 (PyObject *)substring);
4073     if (substring == NULL)
4074         return NULL;
4075
4076     result = findstring(self, substring, start, end, 1);
4077
4078     Py_DECREF(substring);
4079     if (result < 0) {
4080         PyErr_SetString(PyExc_ValueError, "substring not found");
4081         return NULL;
4082     }
4083     return PyInt_FromLong(result);
4084 }
4085
4086 static char islower__doc__[] =
4087 "S.islower() -> int\n\
4088 \n\
4089 Return 1 if  all cased characters in S are lowercase and there is\n\
4090 at least one cased character in S, 0 otherwise.";
4091
4092 static PyObject*
4093 unicode_islower(PyUnicodeObject *self)
4094 {
4095     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4096     register const Py_UNICODE *e;
4097     int cased;
4098
4099     /* Shortcut for single character strings */
4100     if (PyUnicode_GET_SIZE(self) == 1)
4101         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4102
4103     /* Special case for empty strings */
4104     if (PyString_GET_SIZE(self) == 0)
4105         return PyInt_FromLong(0);
4106
4107     e = p + PyUnicode_GET_SIZE(self);
4108     cased = 0;
4109     for (; p < e; p++) {
4110         register const Py_UNICODE ch = *p;
4111
4112         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4113             return PyInt_FromLong(0);
4114         else if (!cased && Py_UNICODE_ISLOWER(ch))
4115             cased = 1;
4116     }
4117     return PyInt_FromLong(cased);
4118 }
4119
4120 static char isupper__doc__[] =
4121 "S.isupper() -> int\n\
4122 \n\
4123 Return 1 if  all cased characters in S are uppercase and there is\n\
4124 at least one cased character in S, 0 otherwise.";
4125
4126 static PyObject*
4127 unicode_isupper(PyUnicodeObject *self)
4128 {
4129     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4130     register const Py_UNICODE *e;
4131     int cased;
4132
4133     /* Shortcut for single character strings */
4134     if (PyUnicode_GET_SIZE(self) == 1)
4135         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4136
4137     /* Special case for empty strings */
4138     if (PyString_GET_SIZE(self) == 0)
4139         return PyInt_FromLong(0);
4140
4141     e = p + PyUnicode_GET_SIZE(self);
4142     cased = 0;
4143     for (; p < e; p++) {
4144         register const Py_UNICODE ch = *p;
4145
4146         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4147             return PyInt_FromLong(0);
4148         else if (!cased && Py_UNICODE_ISUPPER(ch))
4149             cased = 1;
4150     }
4151     return PyInt_FromLong(cased);
4152 }
4153
4154 static char istitle__doc__[] =
4155 "S.istitle() -> int\n\
4156 \n\
4157 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4158 may only follow uncased characters and lowercase characters only cased\n\
4159 ones. Return 0 otherwise.";
4160
4161 static PyObject*
4162 unicode_istitle(PyUnicodeObject *self)
4163 {
4164     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4165     register const Py_UNICODE *e;
4166     int cased, previous_is_cased;
4167
4168     /* Shortcut for single character strings */
4169     if (PyUnicode_GET_SIZE(self) == 1)
4170         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4171                               (Py_UNICODE_ISUPPER(*p) != 0));
4172
4173     /* Special case for empty strings */
4174     if (PyString_GET_SIZE(self) == 0)
4175         return PyInt_FromLong(0);
4176
4177     e = p + PyUnicode_GET_SIZE(self);
4178     cased = 0;
4179     previous_is_cased = 0;
4180     for (; p < e; p++) {
4181         register const Py_UNICODE ch = *p;
4182
4183         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4184             if (previous_is_cased)
4185                 return PyInt_FromLong(0);
4186             previous_is_cased = 1;
4187             cased = 1;
4188         }
4189         else if (Py_UNICODE_ISLOWER(ch)) {
4190             if (!previous_is_cased)
4191                 return PyInt_FromLong(0);
4192             previous_is_cased = 1;
4193             cased = 1;
4194         }
4195         else
4196             previous_is_cased = 0;
4197     }
4198     return PyInt_FromLong(cased);
4199 }
4200
4201 static char isspace__doc__[] =
4202 "S.isspace() -> int\n\
4203 \n\
4204 Return 1 if there are only whitespace characters in S,\n\
4205 0 otherwise.";
4206
4207 static PyObject*
4208 unicode_isspace(PyUnicodeObject *self)
4209 {
4210     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4211     register const Py_UNICODE *e;
4212
4213     /* Shortcut for single character strings */
4214     if (PyUnicode_GET_SIZE(self) == 1 &&
4215         Py_UNICODE_ISSPACE(*p))
4216         return PyInt_FromLong(1);
4217
4218     /* Special case for empty strings */
4219     if (PyString_GET_SIZE(self) == 0)
4220         return PyInt_FromLong(0);
4221
4222     e = p + PyUnicode_GET_SIZE(self);
4223     for (; p < e; p++) {
4224         if (!Py_UNICODE_ISSPACE(*p))
4225             return PyInt_FromLong(0);
4226     }
4227     return PyInt_FromLong(1);
4228 }
4229
4230 static char isalpha__doc__[] =
4231 "S.isalpha() -> int\n\
4232 \n\
4233 Return 1 if  all characters in S are alphabetic\n\
4234 and there is at least one character in S, 0 otherwise.";
4235
4236 static PyObject*
4237 unicode_isalpha(PyUnicodeObject *self)
4238 {
4239     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4240     register const Py_UNICODE *e;
4241
4242     /* Shortcut for single character strings */
4243     if (PyUnicode_GET_SIZE(self) == 1 &&
4244         Py_UNICODE_ISALPHA(*p))
4245         return PyInt_FromLong(1);
4246
4247     /* Special case for empty strings */
4248     if (PyString_GET_SIZE(self) == 0)
4249         return PyInt_FromLong(0);
4250
4251     e = p + PyUnicode_GET_SIZE(self);
4252     for (; p < e; p++) {
4253         if (!Py_UNICODE_ISALPHA(*p))
4254             return PyInt_FromLong(0);
4255     }
4256     return PyInt_FromLong(1);
4257 }
4258
4259 static char isalnum__doc__[] =
4260 "S.isalnum() -> int\n\
4261 \n\
4262 Return 1 if  all characters in S are alphanumeric\n\
4263 and there is at least one character in S, 0 otherwise.";
4264
4265 static PyObject*
4266 unicode_isalnum(PyUnicodeObject *self)
4267 {
4268     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4269     register const Py_UNICODE *e;
4270
4271     /* Shortcut for single character strings */
4272     if (PyUnicode_GET_SIZE(self) == 1 &&
4273         Py_UNICODE_ISALNUM(*p))
4274         return PyInt_FromLong(1);
4275
4276     /* Special case for empty strings */
4277     if (PyString_GET_SIZE(self) == 0)
4278         return PyInt_FromLong(0);
4279
4280     e = p + PyUnicode_GET_SIZE(self);
4281     for (; p < e; p++) {
4282         if (!Py_UNICODE_ISALNUM(*p))
4283             return PyInt_FromLong(0);
4284     }
4285     return PyInt_FromLong(1);
4286 }
4287
4288 static char isdecimal__doc__[] =
4289 "S.isdecimal() -> int\n\
4290 \n\
4291 Return 1 if there are only decimal characters in S,\n\
4292 0 otherwise.";
4293
4294 static PyObject*
4295 unicode_isdecimal(PyUnicodeObject *self)
4296 {
4297     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4298     register const Py_UNICODE *e;
4299
4300     /* Shortcut for single character strings */
4301     if (PyUnicode_GET_SIZE(self) == 1 &&
4302         Py_UNICODE_ISDECIMAL(*p))
4303         return PyInt_FromLong(1);
4304
4305     /* Special case for empty strings */
4306     if (PyString_GET_SIZE(self) == 0)
4307         return PyInt_FromLong(0);
4308
4309     e = p + PyUnicode_GET_SIZE(self);
4310     for (; p < e; p++) {
4311         if (!Py_UNICODE_ISDECIMAL(*p))
4312             return PyInt_FromLong(0);
4313     }
4314     return PyInt_FromLong(1);
4315 }
4316
4317 static char isdigit__doc__[] =
4318 "S.isdigit() -> int\n\
4319 \n\
4320 Return 1 if there are only digit characters in S,\n\
4321 0 otherwise.";
4322
4323 static PyObject*
4324 unicode_isdigit(PyUnicodeObject *self)
4325 {
4326     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4327     register const Py_UNICODE *e;
4328
4329     /* Shortcut for single character strings */
4330     if (PyUnicode_GET_SIZE(self) == 1 &&
4331         Py_UNICODE_ISDIGIT(*p))
4332         return PyInt_FromLong(1);
4333
4334     /* Special case for empty strings */
4335     if (PyString_GET_SIZE(self) == 0)
4336         return PyInt_FromLong(0);
4337
4338     e = p + PyUnicode_GET_SIZE(self);
4339     for (; p < e; p++) {
4340         if (!Py_UNICODE_ISDIGIT(*p))
4341             return PyInt_FromLong(0);
4342     }
4343     return PyInt_FromLong(1);
4344 }
4345
4346 static char isnumeric__doc__[] =
4347 "S.isnumeric() -> int\n\
4348 \n\
4349 Return 1 if there are only numeric characters in S,\n\
4350 0 otherwise.";
4351
4352 static PyObject*
4353 unicode_isnumeric(PyUnicodeObject *self)
4354 {
4355     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4356     register const Py_UNICODE *e;
4357
4358     /* Shortcut for single character strings */
4359     if (PyUnicode_GET_SIZE(self) == 1 &&
4360         Py_UNICODE_ISNUMERIC(*p))
4361         return PyInt_FromLong(1);
4362
4363     /* Special case for empty strings */
4364     if (PyString_GET_SIZE(self) == 0)
4365         return PyInt_FromLong(0);
4366
4367     e = p + PyUnicode_GET_SIZE(self);
4368     for (; p < e; p++) {
4369         if (!Py_UNICODE_ISNUMERIC(*p))
4370             return PyInt_FromLong(0);
4371     }
4372     return PyInt_FromLong(1);
4373 }
4374
4375 static char join__doc__[] =
4376 "S.join(sequence) -> unicode\n\
4377 \n\
4378 Return a string which is the concatenation of the strings in the\n\
4379 sequence.  The separator between elements is S.";
4380
4381 static PyObject*
4382 unicode_join(PyObject *self, PyObject *data)
4383 {
4384     return PyUnicode_Join(self, data);
4385 }
4386
4387 static int
4388 unicode_length(PyUnicodeObject *self)
4389 {
4390     return self->length;
4391 }
4392
4393 static char ljust__doc__[] =
4394 "S.ljust(width) -> unicode\n\
4395 \n\
4396 Return S left justified in a Unicode string of length width. Padding is\n\
4397 done using spaces.";
4398
4399 static PyObject *
4400 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4401 {
4402     int width;
4403     if (!PyArg_ParseTuple(args, "i:ljust", &width))
4404         return NULL;
4405
4406     if (self->length >= width && PyUnicode_CheckExact(self)) {
4407         Py_INCREF(self);
4408         return (PyObject*) self;
4409     }
4410
4411     return (PyObject*) pad(self, 0, width - self->length, ' ');
4412 }
4413
4414 static char lower__doc__[] =
4415 "S.lower() -> unicode\n\
4416 \n\
4417 Return a copy of the string S converted to lowercase.";
4418
4419 static PyObject*
4420 unicode_lower(PyUnicodeObject *self)
4421 {
4422     return fixup(self, fixlower);
4423 }
4424
4425 static char lstrip__doc__[] =
4426 "S.lstrip() -> unicode\n\
4427 \n\
4428 Return a copy of the string S with leading whitespace removed.";
4429
4430 static PyObject *
4431 unicode_lstrip(PyUnicodeObject *self)
4432 {
4433     return strip(self, 1, 0);
4434 }
4435
4436 static PyObject*
4437 unicode_repeat(PyUnicodeObject *str, int len)
4438 {
4439     PyUnicodeObject *u;
4440     Py_UNICODE *p;
4441     int nchars;
4442     size_t nbytes;
4443
4444     if (len < 0)
4445         len = 0;
4446
4447     if (len == 1 && PyUnicode_CheckExact(str)) {
4448         /* no repeat, return original string */
4449         Py_INCREF(str);
4450         return (PyObject*) str;
4451     }
4452
4453     /* ensure # of chars needed doesn't overflow int and # of bytes
4454      * needed doesn't overflow size_t
4455      */
4456     nchars = len * str->length;
4457     if (len && nchars / len != str->length) {
4458         PyErr_SetString(PyExc_OverflowError,
4459                         "repeated string is too long");
4460         return NULL;
4461     }
4462     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4463     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4464         PyErr_SetString(PyExc_OverflowError,
4465                         "repeated string is too long");
4466         return NULL;
4467     }
4468     u = _PyUnicode_New(nchars);
4469     if (!u)
4470         return NULL;
4471
4472     p = u->str;
4473
4474     while (len-- > 0) {
4475         Py_UNICODE_COPY(p, str->str, str->length);
4476         p += str->length;
4477     }
4478
4479     return (PyObject*) u;
4480 }
4481
4482 PyObject *PyUnicode_Replace(PyObject *obj,
4483                             PyObject *subobj,
4484                             PyObject *replobj,
4485                             int maxcount)
4486 {
4487     PyObject *self;
4488     PyObject *str1;
4489     PyObject *str2;
4490     PyObject *result;
4491
4492     self = PyUnicode_FromObject(obj);
4493     if (self == NULL)
4494         return NULL;
4495     str1 = PyUnicode_FromObject(subobj);
4496     if (str1 == NULL) {
4497         Py_DECREF(self);
4498         return NULL;
4499     }
4500     str2 = PyUnicode_FromObject(replobj);
4501     if (str2 == NULL) {
4502         Py_DECREF(self);
4503         Py_DECREF(str1);
4504         return NULL;
4505     }
4506     result = replace((PyUnicodeObject *)self,
4507                      (PyUnicodeObject *)str1,
4508                      (PyUnicodeObject *)str2,
4509                      maxcount);
4510     Py_DECREF(self);
4511     Py_DECREF(str1);
4512     Py_DECREF(str2);
4513     return result;
4514 }
4515
4516 static char replace__doc__[] =
4517 "S.replace (old, new[, maxsplit]) -> unicode\n\
4518 \n\
4519 Return a copy of S with all occurrences of substring\n\
4520 old replaced by new.  If the optional argument maxsplit is\n\
4521 given, only the first maxsplit occurrences are replaced.";
4522
4523 static PyObject*
4524 unicode_replace(PyUnicodeObject *self, PyObject *args)
4525 {
4526     PyUnicodeObject *str1;
4527     PyUnicodeObject *str2;
4528     int maxcount = -1;
4529     PyObject *result;
4530
4531     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4532         return NULL;
4533     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4534     if (str1 == NULL)
4535         return NULL;
4536     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4537     if (str2 == NULL)
4538         return NULL;
4539
4540     result = replace(self, str1, str2, maxcount);
4541
4542     Py_DECREF(str1);
4543     Py_DECREF(str2);
4544     return result;
4545 }
4546
4547 static
4548 PyObject *unicode_repr(PyObject *unicode)
4549 {
4550     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4551                                 PyUnicode_GET_SIZE(unicode),
4552                                 1);
4553 }
4554
4555 static char rfind__doc__[] =
4556 "S.rfind(sub [,start [,end]]) -> int\n\
4557 \n\
4558 Return the highest index in S where substring sub is found,\n\
4559 such that sub is contained within s[start,end].  Optional\n\
4560 arguments start and end are interpreted as in slice notation.\n\
4561 \n\
4562 Return -1 on failure.";
4563
4564 static PyObject *
4565 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4566 {
4567     PyUnicodeObject *substring;
4568     int start = 0;
4569     int end = INT_MAX;
4570     PyObject *result;
4571
4572     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4573                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4574         return NULL;
4575     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4576                                                 (PyObject *)substring);
4577     if (substring == NULL)
4578         return NULL;
4579
4580     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4581
4582     Py_DECREF(substring);
4583     return result;
4584 }
4585
4586 static char rindex__doc__[] =
4587 "S.rindex(sub [,start [,end]]) -> int\n\
4588 \n\
4589 Like S.rfind() but raise ValueError when the substring is not found.";
4590
4591 static PyObject *
4592 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4593 {
4594     int result;
4595     PyUnicodeObject *substring;
4596     int start = 0;
4597     int end = INT_MAX;
4598
4599     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4600                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4601         return NULL;
4602     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4603                                                 (PyObject *)substring);
4604     if (substring == NULL)
4605         return NULL;
4606
4607     result = findstring(self, substring, start, end, -1);
4608
4609     Py_DECREF(substring);
4610     if (result < 0) {
4611         PyErr_SetString(PyExc_ValueError, "substring not found");
4612         return NULL;
4613     }
4614     return PyInt_FromLong(result);
4615 }
4616
4617 static char rjust__doc__[] =
4618 "S.rjust(width) -> unicode\n\
4619 \n\
4620 Return S right justified in a Unicode string of length width. Padding is\n\
4621 done using spaces.";
4622
4623 static PyObject *
4624 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4625 {
4626     int width;
4627     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4628         return NULL;
4629
4630     if (self->length >= width && PyUnicode_CheckExact(self)) {
4631         Py_INCREF(self);
4632         return (PyObject*) self;
4633     }
4634
4635     return (PyObject*) pad(self, width - self->length, 0, ' ');
4636 }
4637
4638 static char rstrip__doc__[] =
4639 "S.rstrip() -> unicode\n\
4640 \n\
4641 Return a copy of the string S with trailing whitespace removed.";
4642
4643 static PyObject *
4644 unicode_rstrip(PyUnicodeObject *self)
4645 {
4646     return strip(self, 0, 1);
4647 }
4648
4649 static PyObject*
4650 unicode_slice(PyUnicodeObject *self, int start, int end)
4651 {
4652     /* standard clamping */
4653     if (start < 0)
4654         start = 0;
4655     if (end < 0)
4656         end = 0;
4657     if (end > self->length)
4658         end = self->length;
4659     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4660         /* full slice, return original string */
4661         Py_INCREF(self);
4662         return (PyObject*) self;
4663     }
4664     if (start > end)
4665         start = end;
4666     /* copy slice */
4667     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4668                                              end - start);
4669 }
4670
4671 PyObject *PyUnicode_Split(PyObject *s,
4672                           PyObject *sep,
4673                           int maxsplit)
4674 {
4675     PyObject *result;
4676
4677     s = PyUnicode_FromObject(s);
4678     if (s == NULL)
4679         return NULL;
4680     if (sep != NULL) {
4681         sep = PyUnicode_FromObject(sep);
4682         if (sep == NULL) {
4683             Py_DECREF(s);
4684             return NULL;
4685         }
4686     }
4687
4688     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4689
4690     Py_DECREF(s);
4691     Py_XDECREF(sep);
4692     return result;
4693 }
4694
4695 static char split__doc__[] =
4696 "S.split([sep [,maxsplit]]) -> list of strings\n\
4697 \n\
4698 Return a list of the words in S, using sep as the\n\
4699 delimiter string.  If maxsplit is given, at most maxsplit\n\
4700 splits are done. If sep is not specified, any whitespace string\n\
4701 is a separator.";
4702
4703 static PyObject*
4704 unicode_split(PyUnicodeObject *self, PyObject *args)
4705 {
4706     PyObject *substring = Py_None;
4707     int maxcount = -1;
4708
4709     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4710         return NULL;
4711
4712     if (substring == Py_None)
4713         return split(self, NULL, maxcount);
4714     else if (PyUnicode_Check(substring))
4715         return split(self, (PyUnicodeObject *)substring, maxcount);
4716     else
4717         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4718 }
4719
4720 static char splitlines__doc__[] =
4721 "S.splitlines([keepends]]) -> list of strings\n\
4722 \n\
4723 Return a list of the lines in S, breaking at line boundaries.\n\
4724 Line breaks are not included in the resulting list unless keepends\n\
4725 is given and true.";
4726
4727 static PyObject*
4728 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4729 {
4730     int keepends = 0;
4731
4732     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4733         return NULL;
4734
4735     return PyUnicode_Splitlines((PyObject *)self, keepends);
4736 }
4737
4738 static
4739 PyObject *unicode_str(PyUnicodeObject *self)
4740 {
4741     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4742 }
4743
4744 static char strip__doc__[] =
4745 "S.strip() -> unicode\n\
4746 \n\
4747 Return a copy of S with leading and trailing whitespace removed.";
4748
4749 static PyObject *
4750 unicode_strip(PyUnicodeObject *self)
4751 {
4752     return strip(self, 1, 1);
4753 }
4754
4755 static char swapcase__doc__[] =
4756 "S.swapcase() -> unicode\n\
4757 \n\
4758 Return a copy of S with uppercase characters converted to lowercase\n\
4759 and vice versa.";
4760
4761 static PyObject*
4762 unicode_swapcase(PyUnicodeObject *self)
4763 {
4764     return fixup(self, fixswapcase);
4765 }
4766
4767 static char translate__doc__[] =
4768 "S.translate(table) -> unicode\n\
4769 \n\
4770 Return a copy of the string S, where all characters have been mapped\n\
4771 through the given translation table, which must be a mapping of\n\
4772 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4773 are left untouched. Characters mapped to None are deleted.";
4774
4775 static PyObject*
4776 unicode_translate(PyUnicodeObject *self, PyObject *table)
4777 {
4778     return PyUnicode_TranslateCharmap(self->str,
4779                                       self->length,
4780                                       table,
4781                                       "ignore");
4782 }
4783
4784 static char upper__doc__[] =
4785 "S.upper() -> unicode\n\
4786 \n\
4787 Return a copy of S converted to uppercase.";
4788
4789 static PyObject*
4790 unicode_upper(PyUnicodeObject *self)
4791 {
4792     return fixup(self, fixupper);
4793 }
4794
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of S.zfill(width) (currently compiled out).

   Strings that already have at least width characters are returned
   as-is with a new reference.  Otherwise the string is left-padded
   with '0' via pad(); if the original text started with '+' or '-',
   the sign is moved in front of the padding.  Returns NULL with an
   exception set if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad() can fail (e.g. out of memory); don't touch u->str then */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4830
#if 0
/* Debugging aid (currently compiled out): return the value of
   unicode_freelist_size as a Python int.  The self argument is not
   used. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
4838
4839 static char startswith__doc__[] =
4840 "S.startswith(prefix[, start[, end]]) -> int\n\
4841 \n\
4842 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4843 optional start, test S beginning at that position.  With optional end, stop\n\
4844 comparing S at that position.";
4845
4846 static PyObject *
4847 unicode_startswith(PyUnicodeObject *self,
4848                    PyObject *args)
4849 {
4850     PyUnicodeObject *substring;
4851     int start = 0;
4852     int end = INT_MAX;
4853     PyObject *result;
4854
4855     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4856                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4857         return NULL;
4858     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4859                                                 (PyObject *)substring);
4860     if (substring == NULL)
4861         return NULL;
4862
4863     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4864
4865     Py_DECREF(substring);
4866     return result;
4867 }
4868
4869
4870 static char endswith__doc__[] =
4871 "S.endswith(suffix[, start[, end]]) -> int\n\
4872 \n\
4873 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4874 optional start, test S beginning at that position.  With optional end, stop\n\
4875 comparing S at that position.";
4876
4877 static PyObject *
4878 unicode_endswith(PyUnicodeObject *self,
4879                  PyObject *args)
4880 {
4881     PyUnicodeObject *substring;
4882     int start = 0;
4883     int end = INT_MAX;
4884     PyObject *result;
4885
4886     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4887                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4888         return NULL;
4889     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4890                                                 (PyObject *)substring);
4891     if (substring == NULL)
4892         return NULL;
4893
4894     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4895
4896     Py_DECREF(substring);
4897     return result;
4898 }
4899
4900
/* Method table of the unicode type.  Each entry gives the Python-level
   method name, the C function implementing it, the calling convention
   flag (METH_VARARGS: argument tuple, METH_O: exactly one object,
   METH_NOARGS: no arguments) and the docstring.  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
#if 0
    /* Disabled: zfill is itself compiled out in this file. */
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* sentinel marking the end of the table */
    {NULL, NULL}
};
4953
/* Sequence protocol slots of the unicode type.  The initializers are
   positional, so their order must follow the slot names given in the
   trailing comments.  The two assignment slots are 0: item and slice
   assignment are not provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4964
4965 static int
4966 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4967                           int index,
4968                           const void **ptr)
4969 {
4970     if (index != 0) {
4971         PyErr_SetString(PyExc_SystemError,
4972                         "accessing non-existent unicode segment");
4973         return -1;
4974     }
4975     *ptr = (void *) self->str;
4976     return PyUnicode_GET_DATA_SIZE(self);
4977 }
4978
4979 static int
4980 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4981                            const void **ptr)
4982 {
4983     PyErr_SetString(PyExc_TypeError,
4984                     "cannot use unicode as modifyable buffer");
4985     return -1;
4986 }
4987
4988 static int
4989 unicode_buffer_getsegcount(PyUnicodeObject *self,
4990                            int *lenp)
4991 {
4992     if (lenp)
4993         *lenp = PyUnicode_GET_DATA_SIZE(self);
4994     return 1;
4995 }
4996
4997 static int
4998 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4999                           int index,
5000                           const void **ptr)
5001 {
5002     PyObject *str;
5003
5004     if (index != 0) {
5005         PyErr_SetString(PyExc_SystemError,
5006                         "accessing non-existent unicode segment");
5007         return -1;
5008     }
5009     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5010     if (str == NULL)
5011         return -1;
5012     *ptr = (void *) PyString_AS_STRING(str);
5013     return PyString_GET_SIZE(str);
5014 }
5015
5016 /* Helpers for PyUnicode_Format() */
5017
5018 static PyObject *
5019 getnextarg(PyObject *args, int arglen, int *p_argidx)
5020 {
5021     int argidx = *p_argidx;
5022     if (argidx < arglen) {
5023         (*p_argidx)++;
5024         if (arglen < 0)
5025             return args;
5026         else
5027             return PyTuple_GetItem(args, argidx);
5028     }
5029     PyErr_SetString(PyExc_TypeError,
5030                     "not enough arguments for format string");
5031     return NULL;
5032 }
5033
/* Conversion flag bits, set by PyUnicode_Format() from the flag
   characters that may follow '%'. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
5039
5040 static
5041 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5042 {
5043     register int i;
5044     int len;
5045     va_list va;
5046     char *charbuffer;
5047     va_start(va, format);
5048
5049     /* First, format the string as char array, then expand to Py_UNICODE
5050        array. */
5051     charbuffer = (char *)buffer;
5052     len = vsprintf(charbuffer, format, va);
5053     for (i = len - 1; i >= 0; i--)
5054         buffer[i] = (Py_UNICODE) charbuffer[i];
5055
5056     va_end(va);
5057     return len;
5058 }
5059
5060 static int
5061 formatfloat(Py_UNICODE *buf,
5062             size_t buflen,
5063             int flags,
5064             int prec,
5065             int type,
5066             PyObject *v)
5067 {
5068     /* fmt = '%#.' + `prec` + `type`
5069        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5070     char fmt[20];
5071     double x;
5072
5073     x = PyFloat_AsDouble(v);
5074     if (x == -1.0 && PyErr_Occurred())
5075         return -1;
5076     if (prec < 0)
5077         prec = 6;
5078     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5079         type = 'g';
5080     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5081                   (flags & F_ALT) ? "#" : "", prec, type);
5082     /* worst case length calc to ensure no buffer overrun:
5083          fmt = %#.<prec>g
5084          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5085             for any double rep.)
5086          len = 1 + prec + 1 + 2 + 5 = 9 + prec
5087        If prec=0 the effective precision is 1 (the leading digit is
5088        always given), therefore increase by one to 10+prec. */
5089     if (buflen <= (size_t)10 + (size_t)prec) {
5090         PyErr_SetString(PyExc_OverflowError,
5091             "formatted float is too long (precision too long?)");
5092         return -1;
5093     }
5094     return usprintf(buf, fmt, x);
5095 }
5096
5097 static PyObject*
5098 formatlong(PyObject *val, int flags, int prec, int type)
5099 {
5100         char *buf;
5101         int i, len;
5102         PyObject *str; /* temporary string object. */
5103         PyUnicodeObject *result;
5104
5105         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5106         if (!str)
5107                 return NULL;
5108         result = _PyUnicode_New(len);
5109         for (i = 0; i < len; i++)
5110                 result->str[i] = buf[i];
5111         result->str[len] = 0;
5112         Py_DECREF(str);
5113         return (PyObject*)result;
5114 }
5115
5116 static int
5117 formatint(Py_UNICODE *buf,
5118           size_t buflen,
5119           int flags,
5120           int prec,
5121           int type,
5122           PyObject *v)
5123 {
5124     /* fmt = '%#.' + `prec` + 'l' + `type`
5125        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5126        + 1 + 1 = 24*/
5127     char fmt[64]; /* plenty big enough! */
5128     long x;
5129     int use_native_c_format = 1;
5130
5131     x = PyInt_AsLong(v);
5132     if (x == -1 && PyErr_Occurred())
5133         return -1;
5134     if (prec < 0)
5135         prec = 1;
5136     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5137        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5138     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5139         PyErr_SetString(PyExc_OverflowError,
5140             "formatted integer is too long (precision too long?)");
5141         return -1;
5142     }
5143     /* When converting 0 under %#x or %#X, C leaves off the base marker,
5144      * but we want it (for consistency with other %#x conversions, and
5145      * for consistency with Python's hex() function).
5146      * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
5147      * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5148      * So add it only if the platform doesn't already.
5149      */
5150     if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5151         /* Only way to know what the platform does is to try it. */
5152         PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
5153         if (fmt[1] != (char)type) {
5154             /* Supply our own leading 0x/0X -- needed under std C */
5155             use_native_c_format = 0;
5156             PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
5157         }
5158     }
5159     if (use_native_c_format)
5160          PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5161                        (flags & F_ALT) ? "#" : "", prec, type);
5162     return usprintf(buf, fmt, x);
5163 }
5164
5165 static int
5166 formatchar(Py_UNICODE *buf,
5167            size_t buflen,
5168            PyObject *v)
5169 {
5170     /* presume that the buffer is at least 2 characters long */
5171     if (PyUnicode_Check(v)) {
5172         if (PyUnicode_GET_SIZE(v) != 1)
5173             goto onError;
5174         buf[0] = PyUnicode_AS_UNICODE(v)[0];
5175     }
5176
5177     else if (PyString_Check(v)) {
5178         if (PyString_GET_SIZE(v) != 1)
5179             goto onError;
5180         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5181     }
5182
5183     else {
5184         /* Integer input truncated to a character */
5185         long x;
5186         x = PyInt_AsLong(v);
5187         if (x == -1 && PyErr_Occurred())
5188             goto onError;
5189         buf[0] = (char) x;
5190     }
5191     buf[1] = '\0';
5192     return 1;
5193
5194  onError:
5195     PyErr_SetString(PyExc_TypeError,
5196                     "%c requires int or char");
5197     return -1;
5198 }
5199
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the buffer
   in which the floats, ints, & chars are formatted. XXX This is a
   magic number. Each formatting routine does bounds checking to
   ensure no overflow, but a better solution may be to malloc a buffer
   of appropriate size for each format. For now, the current solution
   is sufficient.
*/
#define FORMATBUFLEN (size_t)120
5209
5210 PyObject *PyUnicode_Format(PyObject *format,
5211                            PyObject *args)
5212 {
5213     Py_UNICODE *fmt, *res;
5214     int fmtcnt, rescnt, reslen, arglen, argidx;
5215     int args_owned = 0;
5216     PyUnicodeObject *result = NULL;
5217     PyObject *dict = NULL;
5218     PyObject *uformat;
5219
5220     if (format == NULL || args == NULL) {
5221         PyErr_BadInternalCall();
5222         return NULL;
5223     }
5224     uformat = PyUnicode_FromObject(format);
5225     if (uformat == NULL)
5226         return NULL;
5227     fmt = PyUnicode_AS_UNICODE(uformat);
5228     fmtcnt = PyUnicode_GET_SIZE(uformat);
5229
5230     reslen = rescnt = fmtcnt + 100;
5231     result = _PyUnicode_New(reslen);
5232     if (result == NULL)
5233         goto onError;
5234     res = PyUnicode_AS_UNICODE(result);
5235
5236     if (PyTuple_Check(args)) {
5237         arglen = PyTuple_Size(args);
5238         argidx = 0;
5239     }
5240     else {
5241         arglen = -1;
5242         argidx = -2;
5243     }
5244     if (args->ob_type->tp_as_mapping)
5245         dict = args;
5246
5247     while (--fmtcnt >= 0) {
5248         if (*fmt != '%') {
5249             if (--rescnt < 0) {
5250                 rescnt = fmtcnt + 100;
5251                 reslen += rescnt;
5252                 if (_PyUnicode_Resize(&result, reslen) < 0)
5253                     return NULL;
5254                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5255                 --rescnt;
5256             }
5257             *res++ = *fmt++;
5258         }
5259         else {
5260             /* Got a format specifier */
5261             int flags = 0;
5262             int width = -1;
5263             int prec = -1;
5264             Py_UNICODE c = '\0';
5265             Py_UNICODE fill;
5266             PyObject *v = NULL;
5267             PyObject *temp = NULL;
5268             Py_UNICODE *pbuf;
5269             Py_UNICODE sign;
5270             int len;
5271             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5272
5273             fmt++;
5274             if (*fmt == '(') {
5275                 Py_UNICODE *keystart;
5276                 int keylen;
5277                 PyObject *key;
5278                 int pcount = 1;
5279
5280                 if (dict == NULL) {
5281                     PyErr_SetString(PyExc_TypeError,
5282                                     "format requires a mapping");
5283                     goto onError;
5284                 }
5285                 ++fmt;
5286                 --fmtcnt;
5287                 keystart = fmt;
5288                 /* Skip over balanced parentheses */
5289                 while (pcount > 0 && --fmtcnt >= 0) {
5290                     if (*fmt == ')')
5291                         --pcount;
5292                     else if (*fmt == '(')
5293                         ++pcount;
5294                     fmt++;
5295                 }
5296                 keylen = fmt - keystart - 1;
5297                 if (fmtcnt < 0 || pcount > 0) {
5298                     PyErr_SetString(PyExc_ValueError,
5299                                     "incomplete format key");
5300                     goto onError;
5301                 }
5302 #if 0
5303                 /* keys are converted to strings using UTF-8 and
5304                    then looked up since Python uses strings to hold
5305                    variables names etc. in its namespaces and we
5306                    wouldn't want to break common idioms. */
5307                 key = PyUnicode_EncodeUTF8(keystart,
5308                                            keylen,
5309                                            NULL);
5310 #else
5311                 key = PyUnicode_FromUnicode(keystart, keylen);
5312 #endif
5313                 if (key == NULL)
5314                     goto onError;
5315                 if (args_owned) {
5316                     Py_DECREF(args);
5317                     args_owned = 0;
5318                 }
5319                 args = PyObject_GetItem(dict, key);
5320                 Py_DECREF(key);
5321                 if (args == NULL) {
5322                     goto onError;
5323                 }
5324                 args_owned = 1;
5325                 arglen = -1;
5326                 argidx = -2;
5327             }
5328             while (--fmtcnt >= 0) {
5329                 switch (c = *fmt++) {
5330                 case '-': flags |= F_LJUST; continue;
5331                 case '+': flags |= F_SIGN; continue;
5332                 case ' ': flags |= F_BLANK; continue;
5333                 case '#': flags |= F_ALT; continue;
5334                 case '0': flags |= F_ZERO; continue;
5335                 }
5336                 break;
5337             }
5338             if (c == '*') {
5339                 v = getnextarg(args, arglen, &argidx);
5340                 if (v == NULL)
5341                     goto onError;
5342                 if (!PyInt_Check(v)) {
5343                     PyErr_SetString(PyExc_TypeError,
5344                                     "* wants int");
5345                     goto onError;
5346                 }
5347                 width = PyInt_AsLong(v);
5348                 if (width < 0) {
5349                     flags |= F_LJUST;
5350                     width = -width;
5351                 }
5352                 if (--fmtcnt >= 0)
5353                     c = *fmt++;
5354             }
5355             else if (c >= '0' && c <= '9') {
5356                 width = c - '0';
5357                 while (--fmtcnt >= 0) {
5358                     c = *fmt++;
5359                     if (c < '0' || c > '9')
5360                         break;
5361                     if ((width*10) / 10 != width) {
5362                         PyErr_SetString(PyExc_ValueError,
5363                                         "width too big");
5364                         goto onError;
5365                     }
5366                     width = width*10 + (c - '0');
5367                 }
5368             }
5369             if (c == '.') {
5370                 prec = 0;
5371                 if (--fmtcnt >= 0)
5372                     c = *fmt++;
5373                 if (c == '*') {
5374                     v = getnextarg(args, arglen, &argidx);
5375                     if (v == NULL)
5376                         goto onError;
5377                     if (!PyInt_Check(v)) {
5378                         PyErr_SetString(PyExc_TypeError,
5379                                         "* wants int");
5380                         goto onError;
5381                     }
5382                     prec = PyInt_AsLong(v);
5383                     if (prec < 0)
5384                         prec = 0;
5385                     if (--fmtcnt >= 0)
5386                         c = *fmt++;
5387                 }
5388                 else if (c >= '0' && c <= '9') {
5389                     prec = c - '0';
5390                     while (--fmtcnt >= 0) {
5391                         c = Py_CHARMASK(*fmt++);
5392                         if (c < '0' || c > '9')
5393                             break;
5394                         if ((prec*10) / 10 != prec) {
5395                             PyErr_SetString(PyExc_ValueError,
5396                                             "prec too big");
5397                             goto onError;
5398                         }
5399                         prec = prec*10 + (c - '0');
5400                     }
5401                 }
5402             } /* prec */
5403             if (fmtcnt >= 0) {
5404                 if (c == 'h' || c == 'l' || c == 'L') {
5405                     if (--fmtcnt >= 0)
5406                         c = *fmt++;
5407                 }
5408             }
5409             if (fmtcnt < 0) {
5410                 PyErr_SetString(PyExc_ValueError,
5411                                 "incomplete format");
5412                 goto onError;
5413             }
5414             if (c != '%') {
5415                 v = getnextarg(args, arglen, &argidx);
5416                 if (v == NULL)
5417                     goto onError;
5418             }
5419             sign = 0;
5420             fill = ' ';
5421             switch (c) {
5422
5423             case '%':
5424                 pbuf = formatbuf;
5425                 /* presume that buffer length is at least 1 */
5426                 pbuf[0] = '%';
5427                 len = 1;
5428                 break;
5429
5430             case 's':
5431             case 'r':
5432                 if (PyUnicode_Check(v) && c == 's') {
5433                     temp = v;
5434                     Py_INCREF(temp);
5435                 }
5436                 else {
5437                     PyObject *unicode;
5438                     if (c == 's')
5439                         temp = PyObject_Str(v);
5440                     else
5441                         temp = PyObject_Repr(v);
5442                     if (temp == NULL)
5443                         goto onError;
5444                     if (!PyString_Check(temp)) {
5445                         /* XXX Note: this should never happen, since
5446                                PyObject_Repr() and PyObject_Str() assure
5447                                this */
5448                         Py_DECREF(temp);
5449                         PyErr_SetString(PyExc_TypeError,
5450                                         "%s argument has non-string str()");
5451                         goto onError;
5452                     }
5453                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5454                                                    PyString_GET_SIZE(temp),
5455                                                NULL,
5456                                                    "strict");
5457                     Py_DECREF(temp);
5458                     temp = unicode;
5459                     if (temp == NULL)
5460                         goto onError;
5461                 }
5462                 pbuf = PyUnicode_AS_UNICODE(temp);
5463                 len = PyUnicode_GET_SIZE(temp);
5464                 if (prec >= 0 && len > prec)
5465                     len = prec;
5466                 break;
5467
5468             case 'i':
5469             case 'd':
5470             case 'u':
5471             case 'o':
5472             case 'x':
5473             case 'X':
5474                 if (c == 'i')
5475                     c = 'd';
5476                 if (PyLong_Check(v)) {
5477                     temp = formatlong(v, flags, prec, c);
5478                     if (!temp)
5479                         goto onError;
5480                     pbuf = PyUnicode_AS_UNICODE(temp);
5481                     len = PyUnicode_GET_SIZE(temp);
5482                     /* unbounded ints can always produce
5483                        a sign character! */
5484                     sign = 1;
5485                 }
5486                 else {
5487                     pbuf = formatbuf;
5488                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5489                                     flags, prec, c, v);
5490                     if (len < 0)
5491                         goto onError;
5492                     /* only d conversion is signed */
5493                     sign = c == 'd';
5494                 }
5495                 if (flags & F_ZERO)
5496                     fill = '0';
5497                 break;
5498
5499             case 'e':
5500             case 'E':
5501             case 'f':
5502             case 'g':
5503             case 'G':
5504                 pbuf = formatbuf;
5505                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5506                         flags, prec, c, v);
5507                 if (len < 0)
5508                     goto onError;
5509                 sign = 1;
5510                 if (flags & F_ZERO)
5511                     fill = '0';
5512                 break;
5513
5514             case 'c':
5515                 pbuf = formatbuf;
5516                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5517                 if (len < 0)
5518                     goto onError;
5519                 break;
5520
5521             default:
5522                 PyErr_Format(PyExc_ValueError,
5523                              "unsupported format character '%c' (0x%x) "
5524                              "at index %i",
5525                              (31<=c && c<=126) ? c : '?',
5526                              c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5527                 goto onError;
5528             }
5529             if (sign) {
5530                 if (*pbuf == '-' || *pbuf == '+') {
5531                     sign = *pbuf++;
5532                     len--;
5533                 }
5534                 else if (flags & F_SIGN)
5535                     sign = '+';
5536                 else if (flags & F_BLANK)
5537                     sign = ' ';
5538                 else
5539                     sign = 0;
5540             }
5541             if (width < len)
5542                 width = len;
5543             if (rescnt < width + (sign != 0)) {
5544                 reslen -= rescnt;
5545                 rescnt = width + fmtcnt + 100;
5546                 reslen += rescnt;
5547                 if (_PyUnicode_Resize(&result, reslen) < 0)
5548                     return NULL;
5549                 res = PyUnicode_AS_UNICODE(result)
5550                     + reslen - rescnt;
5551             }
5552             if (sign) {
5553                 if (fill != ' ')
5554                     *res++ = sign;
5555                 rescnt--;
5556                 if (width > len)
5557                     width--;
5558             }
5559             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5560                 assert(pbuf[0] == '0');
5561                 assert(pbuf[1] == c);
5562                 if (fill != ' ') {
5563                     *res++ = *pbuf++;
5564                     *res++ = *pbuf++;
5565                 }
5566                 rescnt -= 2;
5567                 width -= 2;
5568                 if (width < 0)
5569                     width = 0;
5570                 len -= 2;
5571             }
5572             if (width > len && !(flags & F_LJUST)) {
5573                 do {
5574                     --rescnt;
5575                     *res++ = fill;
5576                 } while (--width > len);
5577             }
5578             if (fill == ' ') {
5579                 if (sign)
5580                     *res++ = sign;
5581                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5582                     assert(pbuf[0] == '0');
5583                     assert(pbuf[1] == c);
5584                     *res++ = *pbuf++;
5585                     *res++ = *pbuf++;
5586                 }
5587             }
5588             Py_UNICODE_COPY(res, pbuf, len);
5589             res += len;
5590             rescnt -= len;
5591             while (--width >= len) {
5592                 --rescnt;
5593                 *res++ = ' ';
5594             }
5595             if (dict && (argidx < arglen) && c != '%') {
5596                 PyErr_SetString(PyExc_TypeError,
5597                                 "not all arguments converted");
5598                 goto onError;
5599             }
5600             Py_XDECREF(temp);
5601         } /* '%' */
5602     } /* until end */
5603     if (argidx < arglen && !dict) {
5604         PyErr_SetString(PyExc_TypeError,
5605                         "not all arguments converted");
5606         goto onError;
5607     }
5608
5609     if (args_owned) {
5610         Py_DECREF(args);
5611     }
5612     Py_DECREF(uformat);
5613     if (_PyUnicode_Resize(&result, reslen - rescnt))
5614         goto onError;
5615     return (PyObject *)result;
5616
5617  onError:
5618     Py_XDECREF(result);
5619     Py_DECREF(uformat);
5620     if (args_owned) {
5621         Py_DECREF(args);
5622     }
5623     return NULL;
5624 }
5625
/* Buffer-protocol slot table for unicode objects; installed as
   tp_as_buffer of PyUnicode_Type.  Entries are, in order: read
   buffer, write buffer, segment count, character buffer. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5632
5633 staticforward PyObject *
5634 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5635
5636 static PyObject *
5637 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5638 {
5639         PyObject *x = NULL;
5640         static char *kwlist[] = {"string", "encoding", "errors", 0};
5641         char *encoding = NULL;
5642         char *errors = NULL;
5643
5644         if (type != &PyUnicode_Type)
5645                 return unicode_subtype_new(type, args, kwds);
5646         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5647                                           kwlist, &x, &encoding, &errors))
5648             return NULL;
5649         if (x == NULL)
5650                 return (PyObject *)_PyUnicode_New(0);
5651         if (encoding == NULL && errors == NULL)
5652             return PyObject_Unicode(x);
5653         else
5654         return PyUnicode_FromEncodedObject(x, encoding, errors);
5655 }
5656
5657 static PyObject *
5658 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5659 {
5660         PyUnicodeObject *tmp, *pnew;
5661         int n;
5662
5663         assert(PyType_IsSubtype(type, &PyUnicode_Type));
5664         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5665         if (tmp == NULL)
5666                 return NULL;
5667         assert(PyUnicode_Check(tmp));
5668         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5669         if (pnew == NULL)
5670                 return NULL;
5671         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5672         if (pnew->str == NULL) {
5673                 _Py_ForgetReference((PyObject *)pnew);
5674                 PyObject_DEL(pnew);
5675                 return NULL;
5676         }
5677         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5678         pnew->length = n;
5679         pnew->hash = tmp->hash;
5680         Py_DECREF(tmp);
5681         return (PyObject *)pnew;
5682 }
5683
/* Docstring of the unicode type (used as tp_doc of PyUnicode_Type). */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding and \n\
errors, defining the error handling, to 'strict'.";
5690
/* Type object for unicode.  It wires up the sequence and buffer slot
   tables, the method table and the constructor defined in this file,
   and sets Py_TPFLAGS_BASETYPE alongside the default flags.  Slots
   given as 0 are left unset here. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    0,                                  /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    _PyObject_Del,                      /* tp_free */
};
5734
5735 /* Initialize the Unicode implementation */
5736
5737 void _PyUnicode_Init(void)
5738 {
5739     int i;
5740
5741     /* Init the implementation */
5742     unicode_freelist = NULL;
5743     unicode_freelist_size = 0;
5744     unicode_empty = _PyUnicode_New(0);
5745     strcpy(unicode_default_encoding, "ascii");
5746     for (i = 0; i < 256; i++)
5747         unicode_latin1[i] = NULL;
5748 }
5749
5750 /* Finalize the Unicode implementation */
5751
5752 void
5753 _PyUnicode_Fini(void)
5754 {
5755     PyUnicodeObject *u;
5756     int i;
5757
5758     Py_XDECREF(unicode_empty);
5759     unicode_empty = NULL;
5760
5761     for (i = 0; i < 256; i++) {
5762         if (unicode_latin1[i]) {
5763             Py_DECREF(unicode_latin1[i]);
5764             unicode_latin1[i] = NULL;
5765         }
5766     }
5767
5768     for (u = unicode_freelist; u != NULL;) {
5769         PyUnicodeObject *v = u;
5770         u = *(PyUnicodeObject **)u;
5771         if (v->str)
5772             PyMem_DEL(v->str);
5773         Py_XDECREF(v->defenc);
5774         PyObject_DEL(v);
5775     }
5776     unicode_freelist = NULL;
5777     unicode_freelist_size = 0;
5778 }