Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WIN32
  45 #include <windows.h>
  46 #endif
  47
  48 /* Limit for the Unicode object free list */
  49
  50 #define MAX_UNICODE_FREELIST_SIZE       1024
  51
  52 /* Limit for the Unicode object free list stay alive optimization.
  53
  54    The implementation will keep allocated Unicode memory intact for
  55    all objects on the free list having a size less than this
  56    limit. This reduces malloc() overhead for small Unicode objects.
  57
  58    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  59    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  60    malloc()-overhead) bytes of unused garbage.
  61
  62    Setting the limit to 0 effectively turns the feature off.
  63
  64    Note: This is an experimental feature ! If you get core dumps when
  65    using Unicode objects, turn this feature off.
  66
  67 */
  68
  69 #define KEEPALIVE_SIZE_LIMIT       9
  70
  71 /* Endianness switches; defaults to little endian */
  72
  73 #ifdef WORDS_BIGENDIAN
  74 # define BYTEORDER_IS_BIG_ENDIAN
  75 #else
  76 # define BYTEORDER_IS_LITTLE_ENDIAN
  77 #endif
  78
  79 /* --- Globals ------------------------------------------------------------
  80
  81    The globals are initialized by the _PyUnicode_Init() API and should
  82    not be used before calling that API.
  83
  84 */
  85
  86 /* Free list for Unicode objects */
  87 static PyUnicodeObject *unicode_freelist;
  88 static int unicode_freelist_size;
  89
  90 /* The empty Unicode object is shared to improve performance. */
  91 static PyUnicodeObject *unicode_empty;
  92
  93 /* Single character Unicode strings in the Latin-1 range are being
  94    shared as well. */
  95 static PyUnicodeObject *unicode_latin1[256];
  96
  97 /* Default encoding to use and assume when NULL is passed as encoding
  98    parameter; it is initialized by _PyUnicode_Init().
  99
 100    Always use the PyUnicode_SetDefaultEncoding() and
 101    PyUnicode_GetDefaultEncoding() APIs to access this global.
 102
 103 */
 104 static char unicode_default_encoding[100];
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax()
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          unicode->str[0] < 256 &&
 136          unicode_latin1[unicode->str[0]] == unicode)) {
 137         PyErr_SetString(PyExc_SystemError,
 138                         "can't resize shared unicode objects");
 139         return -1;
 140     }
 141
 142     /* We allocate one more byte to make sure the string is
 143        Ux0000 terminated -- XXX is this needed ? */
 144     oldstr = unicode->str;
 145     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 146     if (!unicode->str) {
 147         unicode->str = oldstr;
 148         PyErr_NoMemory();
 149         return -1;
 150     }
 151     unicode->str[length] = 0;
 152     unicode->length = length;
 153
 154  reset:
 155     /* Reset the object caches */
 156     if (unicode->defenc) {
 157         Py_DECREF(unicode->defenc);
 158         unicode->defenc = NULL;
 159     }
 160     unicode->hash = -1;
 161
 162     return 0;
 163 }
 164
 165 /* We allocate one more byte to make sure the string is
 166    Ux0000 terminated -- XXX is this needed ?
 167
 168    XXX This allocator could further be enhanced by assuring that the
 169        free list never reduces its size below 1.
 170
 171 */
 172
 173 static
 174 PyUnicodeObject *_PyUnicode_New(int length)
 175 {
 176     register PyUnicodeObject *unicode;
 177
 178     /* Optimization for empty strings */
 179     if (length == 0 && unicode_empty != NULL) {
 180         Py_INCREF(unicode_empty);
 181         return unicode_empty;
 182     }
 183
 184     /* Unicode freelist & memory allocation */
 185     if (unicode_freelist) {
 186         unicode = unicode_freelist;
 187         unicode_freelist = *(PyUnicodeObject **)unicode;
 188         unicode_freelist_size--;
 189         if (unicode->str) {
 190             /* Keep-Alive optimization: we only upsize the buffer,
 191                never downsize it. */
 192             if ((unicode->length < length) &&
 193                 unicode_resize(unicode, length)) {
 194                 PyMem_DEL(unicode->str);
 195                 goto onError;
 196             }
 197         }
 198         else {
 199             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 200         }
 201         PyObject_INIT(unicode, &PyUnicode_Type);
 202     }
 203     else {
 204         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 205         if (unicode == NULL)
 206             return NULL;
 207         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 208     }
 209
 210     if (!unicode->str) {
 211         PyErr_NoMemory();
 212         goto onError;
 213     }
 214     unicode->str[length] = 0;
 215     unicode->length = length;
 216     unicode->hash = -1;
 217     unicode->defenc = NULL;
 218     return unicode;
 219
 220  onError:
 221     _Py_ForgetReference((PyObject *)unicode);
 222     PyObject_DEL(unicode);
 223     return NULL;
 224 }
 225
 226 static
 227 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 228 {
 229     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 230         /* Keep-Alive optimization */
 231         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 232             PyMem_DEL(unicode->str);
 233             unicode->str = NULL;
 234             unicode->length = 0;
 235         }
 236         if (unicode->defenc) {
 237             Py_DECREF(unicode->defenc);
 238             unicode->defenc = NULL;
 239         }
 240         /* Add to free list */
 241         *(PyUnicodeObject **)unicode = unicode_freelist;
 242         unicode_freelist = unicode;
 243         unicode_freelist_size++;
 244     }
 245     else {
 246         PyMem_DEL(unicode->str);
 247         Py_XDECREF(unicode->defenc);
 248         PyObject_DEL(unicode);
 249     }
 250 }
 251
 252 int PyUnicode_Resize(PyObject **unicode,
 253                      int length)
 254 {
 255     register PyUnicodeObject *v;
 256
 257     /* Argument checks */
 258     if (unicode == NULL) {
 259         PyErr_BadInternalCall();
 260         return -1;
 261     }
 262     v = (PyUnicodeObject *)*unicode;
 263     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 264         PyErr_BadInternalCall();
 265         return -1;
 266     }
 267
 268     /* Resizing unicode_empty and single character objects is not
 269        possible since these are being shared. We simply return a fresh
 270        copy with the same Unicode content. */
 271     if (v->length != length &&
 272         (v == unicode_empty || v->length == 1)) {
 273         PyUnicodeObject *w = _PyUnicode_New(length);
 274         if (w == NULL)
 275             return -1;
 276         Py_UNICODE_COPY(w->str, v->str,
 277                         length < v->length ? length : v->length);
 278         *unicode = (PyObject *)w;
 279         return 0;
 280     }
 281
 282     /* Note that we don't have to modify *unicode for unshared Unicode
 283        objects, since we can modify them in-place. */
 284     return unicode_resize(v, length);
 285 }
 286
 287 /* Internal API for use in unicodeobject.c only ! */
 288 #define _PyUnicode_Resize(unicodevar, length) \
 289         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 290
 291 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 292                                 int size)
 293 {
 294     PyUnicodeObject *unicode;
 295
 296     /* If the Unicode data is known at construction time, we can apply
 297        some optimizations which share commonly used objects. */
 298     if (u != NULL) {
 299
 300         /* Optimization for empty strings */
 301         if (size == 0 && unicode_empty != NULL) {
 302             Py_INCREF(unicode_empty);
 303             return (PyObject *)unicode_empty;
 304         }
 305
 306         /* Single character Unicode objects in the Latin-1 range are
 307            shared when using this constructor */
 308         if (size == 1 && *u < 256) {
 309             unicode = unicode_latin1[*u];
 310             if (!unicode) {
 311                 unicode = _PyUnicode_New(1);
 312                 if (!unicode)
 313                     return NULL;
 314                 unicode->str[0] = *u;
 315                 unicode_latin1[*u] = unicode;
 316             }
 317             Py_INCREF(unicode);
 318             return (PyObject *)unicode;
 319         }
 320     }
 321
 322     unicode = _PyUnicode_New(size);
 323     if (!unicode)
 324         return NULL;
 325
 326     /* Copy the Unicode data into the new object */
 327     if (u != NULL)
 328         Py_UNICODE_COPY(unicode->str, u, size);
 329
 330     return (PyObject *)unicode;
 331 }
 332
 333 #ifdef HAVE_WCHAR_H
 334
 335 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 336                                  int size)
 337 {
 338     PyUnicodeObject *unicode;
 339
 340     if (w == NULL) {
 341         PyErr_BadInternalCall();
 342         return NULL;
 343     }
 344
 345     unicode = _PyUnicode_New(size);
 346     if (!unicode)
 347         return NULL;
 348
 349     /* Copy the wchar_t data into the new object */
 350 #ifdef HAVE_USABLE_WCHAR_T
 351     memcpy(unicode->str, w, size * sizeof(wchar_t));
 352 #else
 353     {
 354         register Py_UNICODE *u;
 355         register int i;
 356         u = PyUnicode_AS_UNICODE(unicode);
 357         for (i = size; i >= 0; i--)
 358             *u++ = *w++;
 359     }
 360 #endif
 361
 362     return (PyObject *)unicode;
 363 }
 364
 365 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 366                          register wchar_t *w,
 367                          int size)
 368 {
 369     if (unicode == NULL) {
 370         PyErr_BadInternalCall();
 371         return -1;
 372     }
 373     if (size > PyUnicode_GET_SIZE(unicode))
 374         size = PyUnicode_GET_SIZE(unicode);
 375 #ifdef HAVE_USABLE_WCHAR_T
 376     memcpy(w, unicode->str, size * sizeof(wchar_t));
 377 #else
 378     {
 379         register Py_UNICODE *u;
 380         register int i;
 381         u = PyUnicode_AS_UNICODE(unicode);
 382         for (i = size; i >= 0; i--)
 383             *w++ = *u++;
 384     }
 385 #endif
 386
 387     return size;
 388 }
 389
 390 #endif
 391
 392 PyObject *PyUnicode_FromObject(register PyObject *obj)
 393 {
 394     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 395 }
 396
 397 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 398                                       const char *encoding,
 399                                       const char *errors)
 400 {
 401     const char *s;
 402     int len;
 403     int owned = 0;
 404     PyObject *v;
 405
 406     if (obj == NULL) {
 407         PyErr_BadInternalCall();
 408         return NULL;
 409     }
 410
 411     /* Coerce object */
 412     if (PyInstance_Check(obj)) {
 413         PyObject *func;
 414         func = PyObject_GetAttrString(obj, "__str__");
 415         if (func == NULL) {
 416             PyErr_SetString(PyExc_TypeError,
 417                   "coercing to Unicode: instance doesn't define __str__");
 418             return NULL;
 419         }
 420         obj = PyEval_CallObject(func, NULL);
 421         Py_DECREF(func);
 422         if (obj == NULL)
 423             return NULL;
 424         owned = 1;
 425     }
 426     if (PyUnicode_Check(obj)) {
 427         Py_INCREF(obj);
 428         v = obj;
 429         if (encoding) {
 430             PyErr_SetString(PyExc_TypeError,
 431                             "decoding Unicode is not supported");
 432             return NULL;
 433         }
 434         goto done;
 435     }
 436     else if (PyString_Check(obj)) {
 437         s = PyString_AS_STRING(obj);
 438         len = PyString_GET_SIZE(obj);
 439     }
 440     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 441         /* Overwrite the error message with something more useful in
 442            case of a TypeError. */
 443         if (PyErr_ExceptionMatches(PyExc_TypeError))
 444             PyErr_Format(PyExc_TypeError,
 445                          "coercing to Unicode: need string or buffer, "
 446                          "%.80s found",
 447                          obj->ob_type->tp_name);
 448         goto onError;
 449     }
 450
 451     /* Convert to Unicode */
 452     if (len == 0) {
 453         Py_INCREF(unicode_empty);
 454         v = (PyObject *)unicode_empty;
 455     }
 456     else
 457         v = PyUnicode_Decode(s, len, encoding, errors);
 458
 459  done:
 460     if (owned) {
 461         Py_DECREF(obj);
 462     }
 463     return v;
 464
 465  onError:
 466     if (owned) {
 467         Py_DECREF(obj);
 468     }
 469     return NULL;
 470 }
 471
 472 PyObject *PyUnicode_Decode(const char *s,
 473                            int size,
 474                            const char *encoding,
 475                            const char *errors)
 476 {
 477     PyObject *buffer = NULL, *unicode;
 478
 479     if (encoding == NULL)
 480         encoding = PyUnicode_GetDefaultEncoding();
 481
 482     /* Shortcuts for common default encodings */
 483     if (strcmp(encoding, "utf-8") == 0)
 484         return PyUnicode_DecodeUTF8(s, size, errors);
 485     else if (strcmp(encoding, "latin-1") == 0)
 486         return PyUnicode_DecodeLatin1(s, size, errors);
 487     else if (strcmp(encoding, "ascii") == 0)
 488         return PyUnicode_DecodeASCII(s, size, errors);
 489
 490     /* Decode via the codec registry */
 491     buffer = PyBuffer_FromMemory((void *)s, size);
 492     if (buffer == NULL)
 493         goto onError;
 494     unicode = PyCodec_Decode(buffer, encoding, errors);
 495     if (unicode == NULL)
 496         goto onError;
 497     if (!PyUnicode_Check(unicode)) {
 498         PyErr_Format(PyExc_TypeError,
 499                      "decoder did not return an unicode object (type=%.400s)",
 500                      unicode->ob_type->tp_name);
 501         Py_DECREF(unicode);
 502         goto onError;
 503     }
 504     Py_DECREF(buffer);
 505     return unicode;
 506
 507  onError:
 508     Py_XDECREF(buffer);
 509     return NULL;
 510 }
 511
 512 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 513                            int size,
 514                            const char *encoding,
 515                            const char *errors)
 516 {
 517     PyObject *v, *unicode;
 518
 519     unicode = PyUnicode_FromUnicode(s, size);
 520     if (unicode == NULL)
 521         return NULL;
 522     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 523     Py_DECREF(unicode);
 524     return v;
 525 }
 526
 527 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 528                                     const char *encoding,
 529                                     const char *errors)
 530 {
 531     PyObject *v;
 532
 533     if (!PyUnicode_Check(unicode)) {
 534         PyErr_BadArgument();
 535         goto onError;
 536     }
 537
 538     if (encoding == NULL)
 539         encoding = PyUnicode_GetDefaultEncoding();
 540
 541     /* Shortcuts for common default encodings */
 542     if (errors == NULL) {
 543         if (strcmp(encoding, "utf-8") == 0)
 544             return PyUnicode_AsUTF8String(unicode);
 545         else if (strcmp(encoding, "latin-1") == 0)
 546             return PyUnicode_AsLatin1String(unicode);
 547         else if (strcmp(encoding, "ascii") == 0)
 548             return PyUnicode_AsASCIIString(unicode);
 549     }
 550
 551     /* Encode via the codec registry */
 552     v = PyCodec_Encode(unicode, encoding, errors);
 553     if (v == NULL)
 554         goto onError;
 555     /* XXX Should we really enforce this ? */
 556     if (!PyString_Check(v)) {
 557         PyErr_Format(PyExc_TypeError,
 558                      "encoder did not return a string object (type=%.400s)",
 559                      v->ob_type->tp_name);
 560         Py_DECREF(v);
 561         goto onError;
 562     }
 563     return v;
 564
 565  onError:
 566     return NULL;
 567 }
 568
 569 /* Return a Python string holding the default encoded value of the
 570    Unicode object.
 571
 572    The resulting string is cached in the Unicode object for subsequent
 573    usage by this function. The cached version is needed to implement
 574    the character buffer interface and will live (at least) as long as
 575    the Unicode object itself.
 576
 577    The refcount of the string is *not* incremented.
 578
 579    *** Exported for internal use by the interpreter only !!! ***
 580
 581 */
 582
 583 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 584                                             const char *errors)
 585 {
 586     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 587
 588     if (v)
 589         return v;
 590     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 591     if (v && errors == NULL)
 592         ((PyUnicodeObject *)unicode)->defenc = v;
 593     return v;
 594 }
 595
 596 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 597 {
 598     if (!PyUnicode_Check(unicode)) {
 599         PyErr_BadArgument();
 600         goto onError;
 601     }
 602     return PyUnicode_AS_UNICODE(unicode);
 603
 604  onError:
 605     return NULL;
 606 }
 607
 608 int PyUnicode_GetSize(PyObject *unicode)
 609 {
 610     if (!PyUnicode_Check(unicode)) {
 611         PyErr_BadArgument();
 612         goto onError;
 613     }
 614     return PyUnicode_GET_SIZE(unicode);
 615
 616  onError:
 617     return -1;
 618 }
 619
 620 const char *PyUnicode_GetDefaultEncoding(void)
 621 {
 622     return unicode_default_encoding;
 623 }
 624
 625 int PyUnicode_SetDefaultEncoding(const char *encoding)
 626 {
 627     PyObject *v;
 628
 629     /* Make sure the encoding is valid. As side effect, this also
 630        loads the encoding into the codec registry cache. */
 631     v = _PyCodec_Lookup(encoding);
 632     if (v == NULL)
 633         goto onError;
 634     Py_DECREF(v);
 635     strncpy(unicode_default_encoding,
 636             encoding,
 637             sizeof(unicode_default_encoding));
 638     return 0;
 639
 640  onError:
 641     return -1;
 642 }
 643
 644 /* --- UTF-8 Codec -------------------------------------------------------- */
 645
 646 static
 647 char utf8_code_length[256] = {
 648     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 649        illegal prefix.  see RFC 2279 for details */
 650     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 651     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 652     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 653     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 654     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 655     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 656     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 657     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 658     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 659     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 660     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 661     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 662     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 663     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 664     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 665     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 666 };
 667
 668 static
 669 int utf8_decoding_error(const char **source,
 670                         Py_UNICODE **dest,
 671                         const char *errors,
 672                         const char *details)
 673 {
 674     if ((errors == NULL) ||
 675         (strcmp(errors,"strict") == 0)) {
 676         PyErr_Format(PyExc_UnicodeError,
 677                      "UTF-8 decoding error: %.400s",
 678                      details);
 679         return -1;
 680     }
 681     else if (strcmp(errors,"ignore") == 0) {
 682         (*source)++;
 683         return 0;
 684     }
 685     else if (strcmp(errors,"replace") == 0) {
 686         (*source)++;
 687         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 688         (*dest)++;
 689         return 0;
 690     }
 691     else {
 692         PyErr_Format(PyExc_ValueError,
 693                      "UTF-8 decoding error; unknown error handling code: %.400s",
 694                      errors);
 695         return -1;
 696     }
 697 }
 698
 699 PyObject *PyUnicode_DecodeUTF8(const char *s,
 700                                int size,
 701                                const char *errors)
 702 {
 703     int n;
 704     const char *e;
 705     PyUnicodeObject *unicode;
 706     Py_UNICODE *p;
 707     const char *errmsg = "";
 708
 709     /* Note: size will always be longer than the resulting Unicode
 710        character count */
 711     unicode = _PyUnicode_New(size);
 712     if (!unicode)
 713         return NULL;
 714     if (size == 0)
 715         return (PyObject *)unicode;
 716
 717     /* Unpack UTF-8 encoded data */
 718     p = unicode->str;
 719     e = s + size;
 720
 721     while (s < e) {
 722         Py_UCS4 ch = (unsigned char)*s;
 723
 724         if (ch < 0x80) {
 725             *p++ = (Py_UNICODE)ch;
 726             s++;
 727             continue;
 728         }
 729
 730         n = utf8_code_length[ch];
 731
 732         if (s + n > e) {
 733             errmsg = "unexpected end of data";
 734             goto utf8Error;
 735         }
 736
 737         switch (n) {
 738
 739         case 0:
 740             errmsg = "unexpected code byte";
 741             goto utf8Error;
 742
 743         case 1:
 744             errmsg = "internal error";
 745             goto utf8Error;
 746
 747         case 2:
 748             if ((s[1] & 0xc0) != 0x80) {
 749                 errmsg = "invalid data";
 750                 goto utf8Error;
 751             }
 752             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
 753             if (ch < 0x80) {
 754                 errmsg = "illegal encoding";
 755                 goto utf8Error;
 756             }
 757             else
 758                 *p++ = (Py_UNICODE)ch;
 759             break;
 760
 761         case 3:
 762             if ((s[1] & 0xc0) != 0x80 ||
 763                 (s[2] & 0xc0) != 0x80) {
 764                 errmsg = "invalid data";
 765                 goto utf8Error;
 766             }
 767             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
 768             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
 769                 errmsg = "illegal encoding";
 770                 goto utf8Error;
 771             }
 772             else
 773                                 *p++ = (Py_UNICODE)ch;
 774             break;
 775
 776         case 4:
 777             if ((s[1] & 0xc0) != 0x80 ||
 778                 (s[2] & 0xc0) != 0x80 ||
 779                 (s[3] & 0xc0) != 0x80) {
 780                 errmsg = "invalid data";
 781                 goto utf8Error;
 782             }
 783             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
 784                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
 785             /* validate and convert to UTF-16 */
 786             if ((ch < 0x10000)        /* minimum value allowed for 4
 787                                        byte encoding */
 788                 || (ch > 0x10ffff))   /* maximum value allowed for
 789                                        UTF-16 */
 790             {
 791                 errmsg = "illegal encoding";
 792                 goto utf8Error;
 793             }
 794 #ifdef Py_UNICODE_WIDE
 795             *p++ = (Py_UNICODE)ch;
 796 #else
 797             /*  compute and append the two surrogates: */
 798
 799             /*  translate from 10000..10FFFF to 0..FFFF */
 800             ch -= 0x10000;
 801
 802             /*  high surrogate = top 10 bits added to D800 */
 803             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
 804
 805             /*  low surrogate = bottom 10 bits added to DC00 */
 806             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
 807 #endif
 808             break;
 809
 810         default:
 811             /* Other sizes are only needed for UCS-4 */
 812             errmsg = "unsupported Unicode code range";
 813             goto utf8Error;
 814         }
 815         s += n;
 816         continue;
 817
 818     utf8Error:
 819       if (utf8_decoding_error(&s, &p, errors, errmsg))
 820           goto onError;
 821     }
 822
 823     /* Adjust length */
 824     if (_PyUnicode_Resize(&unicode, p - unicode->str))
 825         goto onError;
 826
 827     return (PyObject *)unicode;
 828
 829 onError:
 830     Py_DECREF(unicode);
 831     return NULL;
 832 }
 833
 834 /* Not used anymore, now that the encoder supports UTF-16
 835    surrogates. */
 836 #if 0
 837 static
 838 int utf8_encoding_error(const Py_UNICODE **source,
 839                         char **dest,
 840                         const char *errors,
 841                         const char *details)
 842 {
 843     if ((errors == NULL) ||
 844         (strcmp(errors,"strict") == 0)) {
 845         PyErr_Format(PyExc_UnicodeError,
 846                      "UTF-8 encoding error: %.400s",
 847                      details);
 848         return -1;
 849     }
 850     else if (strcmp(errors,"ignore") == 0) {
 851         return 0;
 852     }
 853     else if (strcmp(errors,"replace") == 0) {
 854         **dest = '?';
 855         (*dest)++;
 856         return 0;
 857     }
 858     else {
 859         PyErr_Format(PyExc_ValueError,
 860                      "UTF-8 encoding error; "
 861                      "unknown error handling code: %.400s",
 862                      errors);
 863         return -1;
 864     }
 865 }
 866 #endif
 867
 868 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 869                                int size,
 870                                const char *errors)
 871 {
 872     PyObject *v;
 873     char *p;
 874     char *q;
 875     Py_UCS4 ch2;
 876     unsigned int cbAllocated = 3 * size;
 877     unsigned int cbWritten = 0;
 878     int i = 0;
 879
 880     v = PyString_FromStringAndSize(NULL, cbAllocated);
 881     if (v == NULL)
 882         return NULL;
 883     if (size == 0)
 884         return v;
 885
 886     p = q = PyString_AS_STRING(v);
 887     while (i < size) {
 888         Py_UCS4 ch = s[i++];
 889         if (ch < 0x80) {
 890             *p++ = (char) ch;
 891             cbWritten++;
 892         }
 893         else if (ch < 0x0800) {
 894             *p++ = 0xc0 | (ch >> 6);
 895             *p++ = 0x80 | (ch & 0x3f);
 896             cbWritten += 2;
 897         }
 898         else if (ch < 0x10000) {
 899             /* Check for high surrogate */
 900             if (0xD800 <= ch && ch <= 0xDBFF) {
 901                 if (i != size) {
 902                     ch2 = s[i];
 903                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 904
 905                         if (cbWritten >= (cbAllocated - 4)) {
 906                             /* Provide enough room for some more
 907                                surrogates */
 908                             cbAllocated += 4*10;
 909                             if (_PyString_Resize(&v, cbAllocated))
 910                                 goto onError;
 911                         }
 912
 913                         /* combine the two values */
 914                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
 915
 916                         *p++ = (char)((ch >> 18) | 0xf0);
 917                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
 918                         i++;
 919                         cbWritten += 4;
 920                     }
 921                 }
 922             }
 923             else {
 924                 *p++ = (char)(0xe0 | (ch >> 12));
 925                 cbWritten += 3;
 926             }
 927             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
 928             *p++ = (char)(0x80 | (ch & 0x3f));
 929         } else {
 930             *p++ = 0xf0 | (ch>>18);
 931             *p++ = 0x80 | ((ch>>12) & 0x3f);
 932             *p++ = 0x80 | ((ch>>6) & 0x3f);
 933             *p++ = 0x80 | (ch & 0x3f);
 934             cbWritten += 4;
 935         }
 936     }
 937     *p = '\0';
 938     if (_PyString_Resize(&v, p - q))
 939         goto onError;
 940     return v;
 941
 942  onError:
 943     Py_DECREF(v);
 944     return NULL;
 945 }
 946
 947 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 948 {
 949     if (!PyUnicode_Check(unicode)) {
 950         PyErr_BadArgument();
 951         return NULL;
 952     }
 953     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
 954                                 PyUnicode_GET_SIZE(unicode),
 955                                 NULL);
 956 }
 957
 958 /* --- UTF-16 Codec ------------------------------------------------------- */
 959
 960 static
 961 int utf16_decoding_error(const Py_UCS2 **source,
 962                          Py_UNICODE **dest,
 963                          const char *errors,
 964                          const char *details)
 965 {
 966     if ((errors == NULL) ||
 967         (strcmp(errors,"strict") == 0)) {
 968         PyErr_Format(PyExc_UnicodeError,
 969                      "UTF-16 decoding error: %.400s",
 970                      details);
 971         return -1;
 972     }
 973     else if (strcmp(errors,"ignore") == 0) {
 974         return 0;
 975     }
 976     else if (strcmp(errors,"replace") == 0) {
 977         if (dest) {
 978             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 979             (*dest)++;
 980         }
 981         return 0;
 982     }
 983     else {
 984         PyErr_Format(PyExc_ValueError,
 985                      "UTF-16 decoding error; "
 986                      "unknown error handling code: %.400s",
 987                      errors);
 988         return -1;
 989     }
 990 }
 991
 992 PyObject *PyUnicode_DecodeUTF16(const char *s,
 993                                 int size,
 994                                 const char *errors,
 995                                 int *byteorder)
 996 {
 997     PyUnicodeObject *unicode;
 998     Py_UNICODE *p;
 999     const Py_UCS2 *q, *e;
1000     int bo = 0;
1001     const char *errmsg = "";
1002
1003     /* size should be an even number */
1004     if (size % sizeof(Py_UCS2) != 0) {
1005         if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
1006             return NULL;
1007         /* The remaining input chars are ignored if we fall through
1008            here... */
1009     }
1010
1011     /* Note: size will always be longer than the resulting Unicode
1012        character count */
1013     unicode = _PyUnicode_New(size);
1014     if (!unicode)
1015         return NULL;
1016     if (size == 0)
1017         return (PyObject *)unicode;
1018
1019     /* Unpack UTF-16 encoded data */
1020     p = unicode->str;
1021     q = (Py_UCS2 *)s;
1022     e = q + (size / sizeof(Py_UCS2));
1023
1024     if (byteorder)
1025         bo = *byteorder;
1026
1027     /* Check for BOM marks (U+FEFF) in the input and adjust current
1028        byte order setting accordingly. In native mode, the leading BOM
1029        mark is skipped, in all other modes, it is copied to the output
1030        stream as-is (giving a ZWNBSP character). */
1031     if (bo == 0) {
1032 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1033         if (*q == 0xFEFF) {
1034             q++;
1035             bo = -1;
1036         } else if (*q == 0xFFFE) {
1037             q++;
1038             bo = 1;
1039         }
1040 #else
1041         if (*q == 0xFEFF) {
1042             q++;
1043             bo = 1;
1044         } else if (*q == 0xFFFE) {
1045             q++;
1046             bo = -1;
1047         }
1048 #endif
1049     }
1050
1051     while (q < e) {
1052         register Py_UCS2 ch = *q++;
1053
1054         /* Swap input bytes if needed. (This assumes
1055            sizeof(Py_UNICODE) == 2 !) */
1056 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1057         if (bo == 1)
1058             ch = (ch >> 8) | (ch << 8);
1059 #else
1060         if (bo == -1)
1061             ch = (ch >> 8) | (ch << 8);
1062 #endif
1063         if (ch < 0xD800 || ch > 0xDFFF) {
1064             *p++ = ch;
1065             continue;
1066         }
1067
1068         /* UTF-16 code pair: */
1069         if (q >= e) {
1070             errmsg = "unexpected end of data";
1071             goto utf16Error;
1072         }
1073         if (0xD800 <= ch && ch <= 0xDBFF) {
1074             Py_UCS2 ch2 = *q++;
1075 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1076             if (bo == 1)
1077                     ch2 = (ch2 >> 8) | (ch2 << 8);
1078 #else
1079             if (bo == -1)
1080                     ch2 = (ch2 >> 8) | (ch2 << 8);
1081 #endif
1082             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1083 #ifndef Py_UNICODE_WIDE
1084                 /* This is valid data (a UTF-16 surrogate pair), but
1085                    we are not able to store this information since our
1086                    Py_UNICODE type only has 16 bits... this might
1087                    change someday, even though it's unlikely. */
1088                 errmsg = "code pairs are not supported";
1089                 goto utf16Error;
1090 #else
1091                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1092                 continue;
1093 #endif
1094
1095             }
1096             else {
1097                 errmsg = "illegal UTF-16 surrogate";
1098                 goto utf16Error;
1099             }
1100
1101         }
1102         errmsg = "illegal encoding";
1103         /* Fall through to report the error */
1104
1105     utf16Error:
1106         if (utf16_decoding_error(&q, &p, errors, errmsg))
1107             goto onError;
1108     }
1109
1110     if (byteorder)
1111         *byteorder = bo;
1112
1113     /* Adjust length */
1114     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1115         goto onError;
1116
1117     return (PyObject *)unicode;
1118
1119 onError:
1120     Py_DECREF(unicode);
1121     return NULL;
1122 }
1123
1124 #undef UTF16_ERROR
1125
1126 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1127                                 int size,
1128                                 const char *errors,
1129                                 int byteorder)
1130 {
1131     PyObject *v;
1132     Py_UCS2 *p;
1133     char *q;
1134     int i, pairs, doswap = 1;
1135
1136     for (i = pairs = 0; i < size; i++)
1137         if (s[i] >= 0x10000)
1138             pairs++;
1139     v = PyString_FromStringAndSize(NULL,
1140                   sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
1141     if (v == NULL)
1142         return NULL;
1143
1144     q = PyString_AS_STRING(v);
1145     p = (Py_UCS2 *)q;
1146     if (byteorder == 0)
1147         *p++ = 0xFEFF;
1148     if (size == 0)
1149         return v;
1150     if (byteorder == 0 ||
1151 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1152         byteorder == -1
1153 #else
1154         byteorder == 1
1155 #endif
1156         )
1157         doswap = 0;
1158     while (size-- > 0) {
1159         Py_UNICODE ch = *s++;
1160         Py_UNICODE ch2 = 0;
1161         if (ch >= 0x10000) {
1162             ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1163             ch  = 0xD800|((ch-0x10000)>>10);
1164         }
1165         if (doswap){
1166             *p++ = (ch >> 8) | (ch << 8);
1167             if (ch2)
1168                 *p++ = (ch2 >> 8) | (ch2 << 8);
1169         }else{
1170             *p++ = ch;
1171             if(ch2)
1172                 *p++ = ch2;
1173         }
1174     }
1175     return v;
1176 }
1177
1178 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1179 {
1180     if (!PyUnicode_Check(unicode)) {
1181         PyErr_BadArgument();
1182         return NULL;
1183     }
1184     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1185                                  PyUnicode_GET_SIZE(unicode),
1186                                  NULL,
1187                                  0);
1188 }
1189
1190 /* --- Unicode Escape Codec ----------------------------------------------- */
1191
1192 static
1193 int unicodeescape_decoding_error(const char **source,
1194                                  Py_UNICODE *x,
1195                                  const char *errors,
1196                                  const char *details)
1197 {
1198     if ((errors == NULL) ||
1199         (strcmp(errors,"strict") == 0)) {
1200         PyErr_Format(PyExc_UnicodeError,
1201                      "Unicode-Escape decoding error: %.400s",
1202                      details);
1203         return -1;
1204     }
1205     else if (strcmp(errors,"ignore") == 0) {
1206         return 0;
1207     }
1208     else if (strcmp(errors,"replace") == 0) {
1209         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1210         return 0;
1211     }
1212     else {
1213         PyErr_Format(PyExc_ValueError,
1214                      "Unicode-Escape decoding error; "
1215                      "unknown error handling code: %.400s",
1216                      errors);
1217         return -1;
1218     }
1219 }
1220
1221 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1222
1223 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1224                                         int size,
1225                                         const char *errors)
1226 {
1227     PyUnicodeObject *v;
1228     Py_UNICODE *p, *buf;
1229     const char *end;
1230     char* message;
1231     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1232
1233     /* Escaped strings will always be longer than the resulting
1234        Unicode string, so we start with size here and then reduce the
1235        length after conversion to the true value. */
1236     v = _PyUnicode_New(size);
1237     if (v == NULL)
1238         goto onError;
1239     if (size == 0)
1240         return (PyObject *)v;
1241
1242     p = buf = PyUnicode_AS_UNICODE(v);
1243     end = s + size;
1244
1245     while (s < end) {
1246         unsigned char c;
1247         Py_UNICODE x;
1248         int i, digits;
1249
1250         /* Non-escape characters are interpreted as Unicode ordinals */
1251         if (*s != '\\') {
1252             *p++ = (unsigned char) *s++;
1253             continue;
1254         }
1255
1256         /* \ - Escapes */
1257         s++;
1258         switch (*s++) {
1259
1260         /* \x escapes */
1261         case '\n': break;
1262         case '\\': *p++ = '\\'; break;
1263         case '\'': *p++ = '\''; break;
1264         case '\"': *p++ = '\"'; break;
1265         case 'b': *p++ = '\b'; break;
1266         case 'f': *p++ = '\014'; break; /* FF */
1267         case 't': *p++ = '\t'; break;
1268         case 'n': *p++ = '\n'; break;
1269         case 'r': *p++ = '\r'; break;
1270         case 'v': *p++ = '\013'; break; /* VT */
1271         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1272
1273         /* \OOO (octal) escapes */
1274         case '0': case '1': case '2': case '3':
1275         case '4': case '5': case '6': case '7':
1276             x = s[-1] - '0';
1277             if ('0' <= *s && *s <= '7') {
1278                 x = (x<<3) + *s++ - '0';
1279                 if ('0' <= *s && *s <= '7')
1280                     x = (x<<3) + *s++ - '0';
1281             }
1282             *p++ = x;
1283             break;
1284
1285         /* hex escapes */
1286         /* \xXX */
1287         case 'x':
1288             digits = 2;
1289             message = "truncated \\xXX escape";
1290             goto hexescape;
1291
1292         /* \uXXXX */
1293         case 'u':
1294             digits = 4;
1295             message = "truncated \\uXXXX escape";
1296             goto hexescape;
1297
1298         /* \UXXXXXXXX */
1299         case 'U':
1300             digits = 8;
1301             message = "truncated \\UXXXXXXXX escape";
1302         hexescape:
1303             chr = 0;
1304             for (i = 0; i < digits; i++) {
1305                 c = (unsigned char) s[i];
1306                 if (!isxdigit(c)) {
1307                     if (unicodeescape_decoding_error(&s, &x, errors, message))
1308                         goto onError;
1309                     chr = x;
1310                     i++;
1311                     break;
1312                 }
1313                 chr = (chr<<4) & ~0xF;
1314                 if (c >= '0' && c <= '9')
1315                     chr += c - '0';
1316                 else if (c >= 'a' && c <= 'f')
1317                     chr += 10 + c - 'a';
1318                 else
1319                     chr += 10 + c - 'A';
1320             }
1321             s += i;
1322         store:
1323             /* when we get here, chr is a 32-bit unicode character */
1324             if (chr <= 0xffff)
1325                 /* UCS-2 character */
1326                 *p++ = (Py_UNICODE) chr;
1327             else if (chr <= 0x10ffff) {
1328                 /* UCS-4 character. Either store directly, or as surrogate pair. */
1329 #ifdef Py_UNICODE_WIDE
1330                 *p++ = chr;
1331 #else
1332                 chr -= 0x10000L;
1333                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1334                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1335 #endif
1336             } else {
1337                 if (unicodeescape_decoding_error(
1338                     &s, &x, errors,
1339                     "illegal Unicode character")
1340                     )
1341                     goto onError;
1342                 *p++ = x; /* store replacement character */
1343             }
1344             break;
1345
1346         /* \N{name} */
1347         case 'N':
1348             message = "malformed \\N character escape";
1349             if (ucnhash_CAPI == NULL) {
1350                 /* load the unicode data module */
1351                 PyObject *m, *v;
1352                 m = PyImport_ImportModule("unicodedata");
1353                 if (m == NULL)
1354                     goto ucnhashError;
1355                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1356                 Py_DECREF(m);
1357                 if (v == NULL)
1358                     goto ucnhashError;
1359                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1360                 Py_DECREF(v);
1361                 if (ucnhash_CAPI == NULL)
1362                     goto ucnhashError;
1363             }
1364             if (*s == '{') {
1365                 const char *start = s+1;
1366                 /* look for the closing brace */
1367                 while (*s != '}' && s < end)
1368                     s++;
1369                 if (s > start && s < end && *s == '}') {
1370                     /* found a name.  look it up in the unicode database */
1371                     message = "unknown Unicode character name";
1372                     s++;
1373                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1374                         goto store;
1375                 }
1376             }
1377             if (unicodeescape_decoding_error(&s, &x, errors, message))
1378                 goto onError;
1379             *p++ = x;
1380             break;
1381
1382         default:
1383             *p++ = '\\';
1384             *p++ = (unsigned char)s[-1];
1385             break;
1386         }
1387     }
1388     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1389                 goto onError;
1390     return (PyObject *)v;
1391
1392 ucnhashError:
1393     PyErr_SetString(
1394         PyExc_UnicodeError,
1395         "\\N escapes not supported (can't load unicodedata module)"
1396         );
1397     return NULL;
1398
1399 onError:
1400     Py_XDECREF(v);
1401     return NULL;
1402 }
1403
1404 /* Return a Unicode-Escape string version of the Unicode object.
1405
1406    If quotes is true, the string is enclosed in u"" or u'' quotes as
1407    appropriate.
1408
1409 */
1410
1411 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1412                                   int size,
1413                                   Py_UNICODE ch);
1414
1415 static
1416 PyObject *unicodeescape_string(const Py_UNICODE *s,
1417                                int size,
1418                                int quotes)
1419 {
1420     PyObject *repr;
1421     char *p;
1422     char *q;
1423
1424     static const char *hexdigit = "0123456789abcdef";
1425
1426     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1427     if (repr == NULL)
1428         return NULL;
1429
1430     p = q = PyString_AS_STRING(repr);
1431
1432     if (quotes) {
1433         *p++ = 'u';
1434         *p++ = (findchar(s, size, '\'') &&
1435                 !findchar(s, size, '"')) ? '"' : '\'';
1436     }
1437     while (size-- > 0) {
1438         Py_UNICODE ch = *s++;
1439         /* Escape quotes */
1440         if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
1441             *p++ = '\\';
1442             *p++ = (char) ch;
1443         }
1444         /* Map 21-bit characters to '\U00xxxxxx' */
1445         else if (ch >= 0x10000) {
1446             *p++ = '\\';
1447             *p++ = 'U';
1448             *p++ = hexdigit[(ch >> 28) & 0xf];
1449             *p++ = hexdigit[(ch >> 24) & 0xf];
1450             *p++ = hexdigit[(ch >> 20) & 0xf];
1451             *p++ = hexdigit[(ch >> 16) & 0xf];
1452             *p++ = hexdigit[(ch >> 12) & 0xf];
1453             *p++ = hexdigit[(ch >> 8) & 0xf];
1454             *p++ = hexdigit[(ch >> 4) & 0xf];
1455             *p++ = hexdigit[ch & 15];
1456         }
1457         /* Map 16-bit characters to '\uxxxx' */
1458         else if (ch >= 256) {
1459             *p++ = '\\';
1460             *p++ = 'u';
1461             *p++ = hexdigit[(ch >> 12) & 0xf];
1462             *p++ = hexdigit[(ch >> 8) & 0xf];
1463             *p++ = hexdigit[(ch >> 4) & 0xf];
1464             *p++ = hexdigit[ch & 15];
1465         }
1466         /* Map special whitespace to '\t', \n', '\r' */
1467         else if (ch == '\t') {
1468             *p++ = '\\';
1469             *p++ = 't';
1470         }
1471         else if (ch == '\n') {
1472             *p++ = '\\';
1473             *p++ = 'n';
1474         }
1475         else if (ch == '\r') {
1476             *p++ = '\\';
1477             *p++ = 'r';
1478         }
1479         /* Map non-printable US ASCII to '\xhh' */
1480         else if (ch < ' ' || ch >= 128) {
1481             *p++ = '\\';
1482             *p++ = 'x';
1483             *p++ = hexdigit[(ch >> 4) & 0xf];
1484             *p++ = hexdigit[ch & 15];
1485         }
1486         /* Copy everything else as-is */
1487         else
1488             *p++ = (char) ch;
1489     }
1490     if (quotes)
1491         *p++ = q[1];
1492
1493     *p = '\0';
1494     if (_PyString_Resize(&repr, p - q))
1495         goto onError;
1496
1497     return repr;
1498
1499  onError:
1500     Py_DECREF(repr);
1501     return NULL;
1502 }
1503
1504 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1505                                         int size)
1506 {
1507     return unicodeescape_string(s, size, 0);
1508 }
1509
1510 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1511 {
1512     if (!PyUnicode_Check(unicode)) {
1513         PyErr_BadArgument();
1514         return NULL;
1515     }
1516     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1517                                          PyUnicode_GET_SIZE(unicode));
1518 }
1519
1520 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1521
1522 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1523                                            int size,
1524                                            const char *errors)
1525 {
1526     PyUnicodeObject *v;
1527     Py_UNICODE *p, *buf;
1528     const char *end;
1529     const char *bs;
1530
1531     /* Escaped strings will always be longer than the resulting
1532        Unicode string, so we start with size here and then reduce the
1533        length after conversion to the true value. */
1534     v = _PyUnicode_New(size);
1535     if (v == NULL)
1536         goto onError;
1537     if (size == 0)
1538         return (PyObject *)v;
1539     p = buf = PyUnicode_AS_UNICODE(v);
1540     end = s + size;
1541     while (s < end) {
1542         unsigned char c;
1543         Py_UNICODE x;
1544         int i;
1545
1546         /* Non-escape characters are interpreted as Unicode ordinals */
1547         if (*s != '\\') {
1548             *p++ = (unsigned char)*s++;
1549             continue;
1550         }
1551
1552         /* \u-escapes are only interpreted iff the number of leading
1553            backslashes if odd */
1554         bs = s;
1555         for (;s < end;) {
1556             if (*s != '\\')
1557                 break;
1558             *p++ = (unsigned char)*s++;
1559         }
1560         if (((s - bs) & 1) == 0 ||
1561             s >= end ||
1562             *s != 'u') {
1563             continue;
1564         }
1565         p--;
1566         s++;
1567
1568         /* \uXXXX with 4 hex digits */
1569         for (x = 0, i = 0; i < 4; i++) {
1570             c = (unsigned char)s[i];
1571             if (!isxdigit(c)) {
1572                 if (unicodeescape_decoding_error(&s, &x, errors,
1573                                                  "truncated \\uXXXX"))
1574                     goto onError;
1575                 i++;
1576                 break;
1577             }
1578             x = (x<<4) & ~0xF;
1579             if (c >= '0' && c <= '9')
1580                 x += c - '0';
1581             else if (c >= 'a' && c <= 'f')
1582                 x += 10 + c - 'a';
1583             else
1584                 x += 10 + c - 'A';
1585         }
1586         s += i;
1587         *p++ = x;
1588     }
1589     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1590         goto onError;
1591     return (PyObject *)v;
1592
1593  onError:
1594     Py_XDECREF(v);
1595     return NULL;
1596 }
1597
1598 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1599                                            int size)
1600 {
1601     PyObject *repr;
1602     char *p;
1603     char *q;
1604
1605     static const char *hexdigit = "0123456789abcdef";
1606
1607     repr = PyString_FromStringAndSize(NULL, 6 * size);
1608     if (repr == NULL)
1609         return NULL;
1610     if (size == 0)
1611         return repr;
1612
1613     p = q = PyString_AS_STRING(repr);
1614     while (size-- > 0) {
1615         Py_UNICODE ch = *s++;
1616         /* Map 16-bit characters to '\uxxxx' */
1617         if (ch >= 256) {
1618             *p++ = '\\';
1619             *p++ = 'u';
1620             *p++ = hexdigit[(ch >> 12) & 0xf];
1621             *p++ = hexdigit[(ch >> 8) & 0xf];
1622             *p++ = hexdigit[(ch >> 4) & 0xf];
1623             *p++ = hexdigit[ch & 15];
1624         }
1625         /* Copy everything else as-is */
1626         else
1627             *p++ = (char) ch;
1628     }
1629     *p = '\0';
1630     if (_PyString_Resize(&repr, p - q))
1631         goto onError;
1632
1633     return repr;
1634
1635  onError:
1636     Py_DECREF(repr);
1637     return NULL;
1638 }
1639
1640 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1641 {
1642     if (!PyUnicode_Check(unicode)) {
1643         PyErr_BadArgument();
1644         return NULL;
1645     }
1646     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1647                                             PyUnicode_GET_SIZE(unicode));
1648 }
1649
1650 /* --- Latin-1 Codec ------------------------------------------------------ */
1651
1652 PyObject *PyUnicode_DecodeLatin1(const char *s,
1653                                  int size,
1654                                  const char *errors)
1655 {
1656     PyUnicodeObject *v;
1657     Py_UNICODE *p;
1658
1659     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1660     if (size == 1 && *(unsigned char*)s < 256) {
1661         Py_UNICODE r = *(unsigned char*)s;
1662         return PyUnicode_FromUnicode(&r, 1);
1663     }
1664
1665     v = _PyUnicode_New(size);
1666     if (v == NULL)
1667         goto onError;
1668     if (size == 0)
1669         return (PyObject *)v;
1670     p = PyUnicode_AS_UNICODE(v);
1671     while (size-- > 0)
1672         *p++ = (unsigned char)*s++;
1673     return (PyObject *)v;
1674
1675  onError:
1676     Py_XDECREF(v);
1677     return NULL;
1678 }
1679
1680 static
1681 int latin1_encoding_error(const Py_UNICODE **source,
1682                           char **dest,
1683                           const char *errors,
1684                           const char *details)
1685 {
1686     if ((errors == NULL) ||
1687         (strcmp(errors,"strict") == 0)) {
1688         PyErr_Format(PyExc_UnicodeError,
1689                      "Latin-1 encoding error: %.400s",
1690                      details);
1691         return -1;
1692     }
1693     else if (strcmp(errors,"ignore") == 0) {
1694         return 0;
1695     }
1696     else if (strcmp(errors,"replace") == 0) {
1697         **dest = '?';
1698         (*dest)++;
1699         return 0;
1700     }
1701     else {
1702         PyErr_Format(PyExc_ValueError,
1703                      "Latin-1 encoding error; "
1704                      "unknown error handling code: %.400s",
1705                      errors);
1706         return -1;
1707     }
1708 }
1709
1710 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1711                                  int size,
1712                                  const char *errors)
1713 {
1714     PyObject *repr;
1715     char *s, *start;
1716
1717     repr = PyString_FromStringAndSize(NULL, size);
1718     if (repr == NULL)
1719         return NULL;
1720     if (size == 0)
1721         return repr;
1722
1723     s = PyString_AS_STRING(repr);
1724     start = s;
1725     while (size-- > 0) {
1726         Py_UNICODE ch = *p++;
1727         if (ch >= 256) {
1728             if (latin1_encoding_error(&p, &s, errors,
1729                                       "ordinal not in range(256)"))
1730                 goto onError;
1731         }
1732         else
1733             *s++ = (char)ch;
1734     }
1735     /* Resize if error handling skipped some characters */
1736     if (s - start < PyString_GET_SIZE(repr))
1737         if (_PyString_Resize(&repr, s - start))
1738             goto onError;
1739     return repr;
1740
1741  onError:
1742     Py_DECREF(repr);
1743     return NULL;
1744 }
1745
1746 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1747 {
1748     if (!PyUnicode_Check(unicode)) {
1749         PyErr_BadArgument();
1750         return NULL;
1751     }
1752     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1753                                   PyUnicode_GET_SIZE(unicode),
1754                                   NULL);
1755 }
1756
1757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1758
1759 static
1760 int ascii_decoding_error(const char **source,
1761                          Py_UNICODE **dest,
1762                          const char *errors,
1763                          const char *details)
1764 {
1765     if ((errors == NULL) ||
1766         (strcmp(errors,"strict") == 0)) {
1767         PyErr_Format(PyExc_UnicodeError,
1768                      "ASCII decoding error: %.400s",
1769                      details);
1770         return -1;
1771     }
1772     else if (strcmp(errors,"ignore") == 0) {
1773         return 0;
1774     }
1775     else if (strcmp(errors,"replace") == 0) {
1776         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1777         (*dest)++;
1778         return 0;
1779     }
1780     else {
1781         PyErr_Format(PyExc_ValueError,
1782                      "ASCII decoding error; "
1783                      "unknown error handling code: %.400s",
1784                      errors);
1785         return -1;
1786     }
1787 }
1788
1789 PyObject *PyUnicode_DecodeASCII(const char *s,
1790                                 int size,
1791                                 const char *errors)
1792 {
1793     PyUnicodeObject *v;
1794     Py_UNICODE *p;
1795
1796     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1797     if (size == 1 && *(unsigned char*)s < 128) {
1798         Py_UNICODE r = *(unsigned char*)s;
1799         return PyUnicode_FromUnicode(&r, 1);
1800     }
1801
1802     v = _PyUnicode_New(size);
1803     if (v == NULL)
1804         goto onError;
1805     if (size == 0)
1806         return (PyObject *)v;
1807     p = PyUnicode_AS_UNICODE(v);
1808     while (size-- > 0) {
1809         register unsigned char c;
1810
1811         c = (unsigned char)*s++;
1812         if (c < 128)
1813             *p++ = c;
1814         else if (ascii_decoding_error(&s, &p, errors,
1815                                       "ordinal not in range(128)"))
1816                 goto onError;
1817     }
1818     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1819         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1820             goto onError;
1821     return (PyObject *)v;
1822
1823  onError:
1824     Py_XDECREF(v);
1825     return NULL;
1826 }
1827
1828 static
1829 int ascii_encoding_error(const Py_UNICODE **source,
1830                          char **dest,
1831                          const char *errors,
1832                          const char *details)
1833 {
1834     if ((errors == NULL) ||
1835         (strcmp(errors,"strict") == 0)) {
1836         PyErr_Format(PyExc_UnicodeError,
1837                      "ASCII encoding error: %.400s",
1838                      details);
1839         return -1;
1840     }
1841     else if (strcmp(errors,"ignore") == 0) {
1842         return 0;
1843     }
1844     else if (strcmp(errors,"replace") == 0) {
1845         **dest = '?';
1846         (*dest)++;
1847         return 0;
1848     }
1849     else {
1850         PyErr_Format(PyExc_ValueError,
1851                      "ASCII encoding error; "
1852                      "unknown error handling code: %.400s",
1853                      errors);
1854         return -1;
1855     }
1856 }
1857
1858 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1859                                 int size,
1860                                 const char *errors)
1861 {
1862     PyObject *repr;
1863     char *s, *start;
1864
1865     repr = PyString_FromStringAndSize(NULL, size);
1866     if (repr == NULL)
1867         return NULL;
1868     if (size == 0)
1869         return repr;
1870
1871     s = PyString_AS_STRING(repr);
1872     start = s;
1873     while (size-- > 0) {
1874         Py_UNICODE ch = *p++;
1875         if (ch >= 128) {
1876             if (ascii_encoding_error(&p, &s, errors,
1877                                       "ordinal not in range(128)"))
1878                 goto onError;
1879         }
1880         else
1881             *s++ = (char)ch;
1882     }
1883     /* Resize if error handling skipped some characters */
1884     if (s - start < PyString_GET_SIZE(repr))
1885         if (_PyString_Resize(&repr, s - start))
1886             goto onError;
1887     return repr;
1888
1889  onError:
1890     Py_DECREF(repr);
1891     return NULL;
1892 }
1893
1894 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1895 {
1896     if (!PyUnicode_Check(unicode)) {
1897         PyErr_BadArgument();
1898         return NULL;
1899     }
1900     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1901                                  PyUnicode_GET_SIZE(unicode),
1902                                  NULL);
1903 }
1904
1905 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
1906
1907 /* --- MBCS codecs for Windows -------------------------------------------- */
1908
1909 PyObject *PyUnicode_DecodeMBCS(const char *s,
1910                                 int size,
1911                                 const char *errors)
1912 {
1913     PyUnicodeObject *v;
1914     Py_UNICODE *p;
1915
1916     /* First get the size of the result */
1917     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1918     if (size > 0 && usize==0)
1919         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1920
1921     v = _PyUnicode_New(usize);
1922     if (v == NULL)
1923         return NULL;
1924     if (usize == 0)
1925         return (PyObject *)v;
1926     p = PyUnicode_AS_UNICODE(v);
1927     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1928         Py_DECREF(v);
1929         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1930     }
1931
1932     return (PyObject *)v;
1933 }
1934
1935 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1936                                 int size,
1937                                 const char *errors)
1938 {
1939     PyObject *repr;
1940     char *s;
1941     DWORD mbcssize;
1942
1943     /* If there are no characters, bail now! */
1944     if (size==0)
1945             return PyString_FromString("");
1946
1947     /* First get the size of the result */
1948     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1949     if (mbcssize==0)
1950         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1951
1952     repr = PyString_FromStringAndSize(NULL, mbcssize);
1953     if (repr == NULL)
1954         return NULL;
1955     if (mbcssize == 0)
1956         return repr;
1957
1958     /* Do the conversion */
1959     s = PyString_AS_STRING(repr);
1960     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1961         Py_DECREF(repr);
1962         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1963     }
1964     return repr;
1965 }
1966
1967 #endif /* MS_WIN32 */
1968
1969 /* --- Character Mapping Codec -------------------------------------------- */
1970
1971 static
1972 int charmap_decoding_error(const char **source,
1973                          Py_UNICODE **dest,
1974                          const char *errors,
1975                          const char *details)
1976 {
1977     if ((errors == NULL) ||
1978         (strcmp(errors,"strict") == 0)) {
1979         PyErr_Format(PyExc_UnicodeError,
1980                      "charmap decoding error: %.400s",
1981                      details);
1982         return -1;
1983     }
1984     else if (strcmp(errors,"ignore") == 0) {
1985         return 0;
1986     }
1987     else if (strcmp(errors,"replace") == 0) {
1988         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1989         (*dest)++;
1990         return 0;
1991     }
1992     else {
1993         PyErr_Format(PyExc_ValueError,
1994                      "charmap decoding error; "
1995                      "unknown error handling code: %.400s",
1996                      errors);
1997         return -1;
1998     }
1999 }
2000
2001 PyObject *PyUnicode_DecodeCharmap(const char *s,
2002                                   int size,
2003                                   PyObject *mapping,
2004                                   const char *errors)
2005 {
2006     PyUnicodeObject *v;
2007     Py_UNICODE *p;
2008     int extrachars = 0;
2009
2010     /* Default to Latin-1 */
2011     if (mapping == NULL)
2012         return PyUnicode_DecodeLatin1(s, size, errors);
2013
2014     v = _PyUnicode_New(size);
2015     if (v == NULL)
2016         goto onError;
2017     if (size == 0)
2018         return (PyObject *)v;
2019     p = PyUnicode_AS_UNICODE(v);
2020     while (size-- > 0) {
2021         unsigned char ch = *s++;
2022         PyObject *w, *x;
2023
2024         /* Get mapping (char ordinal -> integer, Unicode char or None) */
2025         w = PyInt_FromLong((long)ch);
2026         if (w == NULL)
2027             goto onError;
2028         x = PyObject_GetItem(mapping, w);
2029         Py_DECREF(w);
2030         if (x == NULL) {
2031             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2032                 /* No mapping found means: mapping is undefined. */
2033                 PyErr_Clear();
2034                 x = Py_None;
2035                 Py_INCREF(x);
2036             } else
2037                 goto onError;
2038         }
2039
2040         /* Apply mapping */
2041         if (PyInt_Check(x)) {
2042             long value = PyInt_AS_LONG(x);
2043             if (value < 0 || value > 65535) {
2044                 PyErr_SetString(PyExc_TypeError,
2045                                 "character mapping must be in range(65536)");
2046                 Py_DECREF(x);
2047                 goto onError;
2048             }
2049             *p++ = (Py_UNICODE)value;
2050         }
2051         else if (x == Py_None) {
2052             /* undefined mapping */
2053             if (charmap_decoding_error(&s, &p, errors,
2054                                        "character maps to <undefined>")) {
2055                 Py_DECREF(x);
2056                 goto onError;
2057             }
2058         }
2059         else if (PyUnicode_Check(x)) {
2060             int targetsize = PyUnicode_GET_SIZE(x);
2061
2062             if (targetsize == 1)
2063                 /* 1-1 mapping */
2064                 *p++ = *PyUnicode_AS_UNICODE(x);
2065
2066             else if (targetsize > 1) {
2067                 /* 1-n mapping */
2068                 if (targetsize > extrachars) {
2069                     /* resize first */
2070                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2071                     int needed = (targetsize - extrachars) + \
2072                                  (targetsize << 2);
2073                     extrachars += needed;
2074                     if (_PyUnicode_Resize(&v,
2075                                          PyUnicode_GET_SIZE(v) + needed)) {
2076                         Py_DECREF(x);
2077                         goto onError;
2078                     }
2079                     p = PyUnicode_AS_UNICODE(v) + oldpos;
2080                 }
2081                 Py_UNICODE_COPY(p,
2082                                 PyUnicode_AS_UNICODE(x),
2083                                 targetsize);
2084                 p += targetsize;
2085                 extrachars -= targetsize;
2086             }
2087             /* 1-0 mapping: skip the character */
2088         }
2089         else {
2090             /* wrong return value */
2091             PyErr_SetString(PyExc_TypeError,
2092                   "character mapping must return integer, None or unicode");
2093             Py_DECREF(x);
2094             goto onError;
2095         }
2096         Py_DECREF(x);
2097     }
2098     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2099         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2100             goto onError;
2101     return (PyObject *)v;
2102
2103  onError:
2104     Py_XDECREF(v);
2105     return NULL;
2106 }
2107
2108 static
2109 int charmap_encoding_error(const Py_UNICODE **source,
2110                            char **dest,
2111                            const char *errors,
2112                            const char *details)
2113 {
2114     if ((errors == NULL) ||
2115         (strcmp(errors,"strict") == 0)) {
2116         PyErr_Format(PyExc_UnicodeError,
2117                      "charmap encoding error: %.400s",
2118                      details);
2119         return -1;
2120     }
2121     else if (strcmp(errors,"ignore") == 0) {
2122         return 0;
2123     }
2124     else if (strcmp(errors,"replace") == 0) {
2125         **dest = '?';
2126         (*dest)++;
2127         return 0;
2128     }
2129     else {
2130         PyErr_Format(PyExc_ValueError,
2131                      "charmap encoding error; "
2132                      "unknown error handling code: %.400s",
2133                      errors);
2134         return -1;
2135     }
2136 }
2137
2138 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2139                                   int size,
2140                                   PyObject *mapping,
2141                                   const char *errors)
2142 {
2143     PyObject *v;
2144     char *s;
2145     int extrachars = 0;
2146
2147     /* Default to Latin-1 */
2148     if (mapping == NULL)
2149         return PyUnicode_EncodeLatin1(p, size, errors);
2150
2151     v = PyString_FromStringAndSize(NULL, size);
2152     if (v == NULL)
2153         return NULL;
2154     if (size == 0)
2155         return v;
2156     s = PyString_AS_STRING(v);
2157     while (size-- > 0) {
2158         Py_UNICODE ch = *p++;
2159         PyObject *w, *x;
2160
2161         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2162         w = PyInt_FromLong((long)ch);
2163         if (w == NULL)
2164             goto onError;
2165         x = PyObject_GetItem(mapping, w);
2166         Py_DECREF(w);
2167         if (x == NULL) {
2168             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2169                 /* No mapping found means: mapping is undefined. */
2170                 PyErr_Clear();
2171                 x = Py_None;
2172                 Py_INCREF(x);
2173             } else
2174                 goto onError;
2175         }
2176
2177         /* Apply mapping */
2178         if (PyInt_Check(x)) {
2179             long value = PyInt_AS_LONG(x);
2180             if (value < 0 || value > 255) {
2181                 PyErr_SetString(PyExc_TypeError,
2182                                 "character mapping must be in range(256)");
2183                 Py_DECREF(x);
2184                 goto onError;
2185             }
2186             *s++ = (char)value;
2187         }
2188         else if (x == Py_None) {
2189             /* undefined mapping */
2190             if (charmap_encoding_error(&p, &s, errors,
2191                                        "character maps to <undefined>")) {
2192                 Py_DECREF(x);
2193                 goto onError;
2194             }
2195         }
2196         else if (PyString_Check(x)) {
2197             int targetsize = PyString_GET_SIZE(x);
2198
2199             if (targetsize == 1)
2200                 /* 1-1 mapping */
2201                 *s++ = *PyString_AS_STRING(x);
2202
2203             else if (targetsize > 1) {
2204                 /* 1-n mapping */
2205                 if (targetsize > extrachars) {
2206                     /* resize first */
2207                     int oldpos = (int)(s - PyString_AS_STRING(v));
2208                     int needed = (targetsize - extrachars) + \
2209                                  (targetsize << 2);
2210                     extrachars += needed;
2211                     if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2212                         Py_DECREF(x);
2213                         goto onError;
2214                     }
2215                     s = PyString_AS_STRING(v) + oldpos;
2216                 }
2217                 memcpy(s, PyString_AS_STRING(x), targetsize);
2218                 s += targetsize;
2219                 extrachars -= targetsize;
2220             }
2221             /* 1-0 mapping: skip the character */
2222         }
2223         else {
2224             /* wrong return value */
2225             PyErr_SetString(PyExc_TypeError,
2226                   "character mapping must return integer, None or unicode");
2227             Py_DECREF(x);
2228             goto onError;
2229         }
2230         Py_DECREF(x);
2231     }
2232     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2233         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2234             goto onError;
2235     return v;
2236
2237  onError:
2238     Py_DECREF(v);
2239     return NULL;
2240 }
2241
2242 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2243                                     PyObject *mapping)
2244 {
2245     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2246         PyErr_BadArgument();
2247         return NULL;
2248     }
2249     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2250                                    PyUnicode_GET_SIZE(unicode),
2251                                    mapping,
2252                                    NULL);
2253 }
2254
2255 static
2256 int translate_error(const Py_UNICODE **source,
2257                     Py_UNICODE **dest,
2258                     const char *errors,
2259                     const char *details)
2260 {
2261     if ((errors == NULL) ||
2262         (strcmp(errors,"strict") == 0)) {
2263         PyErr_Format(PyExc_UnicodeError,
2264                      "translate error: %.400s",
2265                      details);
2266         return -1;
2267     }
2268     else if (strcmp(errors,"ignore") == 0) {
2269         return 0;
2270     }
2271     else if (strcmp(errors,"replace") == 0) {
2272         **dest = '?';
2273         (*dest)++;
2274         return 0;
2275     }
2276     else {
2277         PyErr_Format(PyExc_ValueError,
2278                      "translate error; "
2279                      "unknown error handling code: %.400s",
2280                      errors);
2281         return -1;
2282     }
2283 }
2284
2285 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2286                                      int size,
2287                                      PyObject *mapping,
2288                                      const char *errors)
2289 {
2290     PyUnicodeObject *v;
2291     Py_UNICODE *p;
2292
2293     if (mapping == NULL) {
2294         PyErr_BadArgument();
2295         return NULL;
2296     }
2297
2298     /* Output will never be longer than input */
2299     v = _PyUnicode_New(size);
2300     if (v == NULL)
2301         goto onError;
2302     if (size == 0)
2303         goto done;
2304     p = PyUnicode_AS_UNICODE(v);
2305     while (size-- > 0) {
2306         Py_UNICODE ch = *s++;
2307         PyObject *w, *x;
2308
2309         /* Get mapping */
2310         w = PyInt_FromLong(ch);
2311         if (w == NULL)
2312             goto onError;
2313         x = PyObject_GetItem(mapping, w);
2314         Py_DECREF(w);
2315         if (x == NULL) {
2316             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2317                 /* No mapping found: default to 1-1 mapping */
2318                 PyErr_Clear();
2319                 *p++ = ch;
2320                 continue;
2321             }
2322             goto onError;
2323         }
2324
2325         /* Apply mapping */
2326         if (PyInt_Check(x))
2327             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2328         else if (x == Py_None) {
2329             /* undefined mapping */
2330             if (translate_error(&s, &p, errors,
2331                                 "character maps to <undefined>")) {
2332                 Py_DECREF(x);
2333                 goto onError;
2334             }
2335         }
2336         else if (PyUnicode_Check(x)) {
2337             if (PyUnicode_GET_SIZE(x) != 1) {
2338                 /* 1-n mapping */
2339                 PyErr_SetString(PyExc_NotImplementedError,
2340                                 "1-n mappings are currently not implemented");
2341                 Py_DECREF(x);
2342                 goto onError;
2343             }
2344             *p++ = *PyUnicode_AS_UNICODE(x);
2345         }
2346         else {
2347             /* wrong return value */
2348             PyErr_SetString(PyExc_TypeError,
2349                   "translate mapping must return integer, None or unicode");
2350             Py_DECREF(x);
2351             goto onError;
2352         }
2353         Py_DECREF(x);
2354     }
2355     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2356         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2357             goto onError;
2358
2359  done:
2360     return (PyObject *)v;
2361
2362  onError:
2363     Py_XDECREF(v);
2364     return NULL;
2365 }
2366
2367 PyObject *PyUnicode_Translate(PyObject *str,
2368                               PyObject *mapping,
2369                               const char *errors)
2370 {
2371     PyObject *result;
2372
2373     str = PyUnicode_FromObject(str);
2374     if (str == NULL)
2375         goto onError;
2376     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2377                                         PyUnicode_GET_SIZE(str),
2378                                         mapping,
2379                                         errors);
2380     Py_DECREF(str);
2381     return result;
2382
2383  onError:
2384     Py_XDECREF(str);
2385     return NULL;
2386 }
2387
2388 /* --- Decimal Encoder ---------------------------------------------------- */
2389
2390 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2391                             int length,
2392                             char *output,
2393                             const char *errors)
2394 {
2395     Py_UNICODE *p, *end;
2396
2397     if (output == NULL) {
2398         PyErr_BadArgument();
2399         return -1;
2400     }
2401
2402     p = s;
2403     end = s + length;
2404     while (p < end) {
2405         register Py_UNICODE ch = *p++;
2406         int decimal;
2407
2408         if (Py_UNICODE_ISSPACE(ch)) {
2409             *output++ = ' ';
2410             continue;
2411         }
2412         decimal = Py_UNICODE_TODECIMAL(ch);
2413         if (decimal >= 0) {
2414             *output++ = '0' + decimal;
2415             continue;
2416         }
2417         if (0 < ch && ch < 256) {
2418             *output++ = (char)ch;
2419             continue;
2420         }
2421         /* All other characters are considered invalid */
2422         if (errors == NULL || strcmp(errors, "strict") == 0) {
2423             PyErr_SetString(PyExc_ValueError,
2424                             "invalid decimal Unicode string");
2425             goto onError;
2426         }
2427         else if (strcmp(errors, "ignore") == 0)
2428             continue;
2429         else if (strcmp(errors, "replace") == 0) {
2430             *output++ = '?';
2431             continue;
2432         }
2433     }
2434     /* 0-terminate the output string */
2435     *output++ = '\0';
2436     return 0;
2437
2438  onError:
2439     return -1;
2440 }
2441
2442 /* --- Helpers ------------------------------------------------------------ */
2443
2444 static
2445 int count(PyUnicodeObject *self,
2446           int start,
2447           int end,
2448           PyUnicodeObject *substring)
2449 {
2450     int count = 0;
2451
2452     if (start < 0)
2453         start += self->length;
2454     if (start < 0)
2455         start = 0;
2456     if (end > self->length)
2457         end = self->length;
2458     if (end < 0)
2459         end += self->length;
2460     if (end < 0)
2461         end = 0;
2462
2463     if (substring->length == 0)
2464         return (end - start + 1);
2465
2466     end -= substring->length;
2467
2468     while (start <= end)
2469         if (Py_UNICODE_MATCH(self, start, substring)) {
2470             count++;
2471             start += substring->length;
2472         } else
2473             start++;
2474
2475     return count;
2476 }
2477
2478 int PyUnicode_Count(PyObject *str,
2479                     PyObject *substr,
2480                     int start,
2481                     int end)
2482 {
2483     int result;
2484
2485     str = PyUnicode_FromObject(str);
2486     if (str == NULL)
2487         return -1;
2488     substr = PyUnicode_FromObject(substr);
2489     if (substr == NULL) {
2490         Py_DECREF(str);
2491         return -1;
2492     }
2493
2494     result = count((PyUnicodeObject *)str,
2495                    start, end,
2496                    (PyUnicodeObject *)substr);
2497
2498     Py_DECREF(str);
2499     Py_DECREF(substr);
2500     return result;
2501 }
2502
2503 static
2504 int findstring(PyUnicodeObject *self,
2505                PyUnicodeObject *substring,
2506                int start,
2507                int end,
2508                int direction)
2509 {
2510     if (start < 0)
2511         start += self->length;
2512     if (start < 0)
2513         start = 0;
2514
2515     if (substring->length == 0)
2516         return start;
2517
2518     if (end > self->length)
2519         end = self->length;
2520     if (end < 0)
2521         end += self->length;
2522     if (end < 0)
2523         end = 0;
2524
2525     end -= substring->length;
2526
2527     if (direction < 0) {
2528         for (; end >= start; end--)
2529             if (Py_UNICODE_MATCH(self, end, substring))
2530                 return end;
2531     } else {
2532         for (; start <= end; start++)
2533             if (Py_UNICODE_MATCH(self, start, substring))
2534                 return start;
2535     }
2536
2537     return -1;
2538 }
2539
2540 int PyUnicode_Find(PyObject *str,
2541                    PyObject *substr,
2542                    int start,
2543                    int end,
2544                    int direction)
2545 {
2546     int result;
2547
2548     str = PyUnicode_FromObject(str);
2549     if (str == NULL)
2550         return -1;
2551     substr = PyUnicode_FromObject(substr);
2552     if (substr == NULL) {
2553         Py_DECREF(substr);
2554         return -1;
2555     }
2556
2557     result = findstring((PyUnicodeObject *)str,
2558                         (PyUnicodeObject *)substr,
2559                         start, end, direction);
2560     Py_DECREF(str);
2561     Py_DECREF(substr);
2562     return result;
2563 }
2564
2565 static
2566 int tailmatch(PyUnicodeObject *self,
2567               PyUnicodeObject *substring,
2568               int start,
2569               int end,
2570               int direction)
2571 {
2572     if (start < 0)
2573         start += self->length;
2574     if (start < 0)
2575         start = 0;
2576
2577     if (substring->length == 0)
2578         return 1;
2579
2580     if (end > self->length)
2581         end = self->length;
2582     if (end < 0)
2583         end += self->length;
2584     if (end < 0)
2585         end = 0;
2586
2587     end -= substring->length;
2588     if (end < start)
2589         return 0;
2590
2591     if (direction > 0) {
2592         if (Py_UNICODE_MATCH(self, end, substring))
2593             return 1;
2594     } else {
2595         if (Py_UNICODE_MATCH(self, start, substring))
2596             return 1;
2597     }
2598
2599     return 0;
2600 }
2601
2602 int PyUnicode_Tailmatch(PyObject *str,
2603                         PyObject *substr,
2604                         int start,
2605                         int end,
2606                         int direction)
2607 {
2608     int result;
2609
2610     str = PyUnicode_FromObject(str);
2611     if (str == NULL)
2612         return -1;
2613     substr = PyUnicode_FromObject(substr);
2614     if (substr == NULL) {
2615         Py_DECREF(substr);
2616         return -1;
2617     }
2618
2619     result = tailmatch((PyUnicodeObject *)str,
2620                        (PyUnicodeObject *)substr,
2621                        start, end, direction);
2622     Py_DECREF(str);
2623     Py_DECREF(substr);
2624     return result;
2625 }
2626
2627 static
2628 const Py_UNICODE *findchar(const Py_UNICODE *s,
2629                      int size,
2630                      Py_UNICODE ch)
2631 {
2632     /* like wcschr, but doesn't stop at NULL characters */
2633
2634     while (size-- > 0) {
2635         if (*s == ch)
2636             return s;
2637         s++;
2638     }
2639
2640     return NULL;
2641 }
2642
2643 /* Apply fixfct filter to the Unicode object self and return a
2644    reference to the modified object */
2645
2646 static
2647 PyObject *fixup(PyUnicodeObject *self,
2648                 int (*fixfct)(PyUnicodeObject *s))
2649 {
2650
2651     PyUnicodeObject *u;
2652
2653     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2654     if (u == NULL)
2655         return NULL;
2656
2657     Py_UNICODE_COPY(u->str, self->str, self->length);
2658
2659     if (!fixfct(u)) {
2660         /* fixfct should return TRUE if it modified the buffer. If
2661            FALSE, return a reference to the original buffer instead
2662            (to save space, not time) */
2663         Py_INCREF(self);
2664         Py_DECREF(u);
2665         return (PyObject*) self;
2666     }
2667     return (PyObject*) u;
2668 }
2669
2670 static
2671 int fixupper(PyUnicodeObject *self)
2672 {
2673     int len = self->length;
2674     Py_UNICODE *s = self->str;
2675     int status = 0;
2676
2677     while (len-- > 0) {
2678         register Py_UNICODE ch;
2679
2680         ch = Py_UNICODE_TOUPPER(*s);
2681         if (ch != *s) {
2682             status = 1;
2683             *s = ch;
2684         }
2685         s++;
2686     }
2687
2688     return status;
2689 }
2690
2691 static
2692 int fixlower(PyUnicodeObject *self)
2693 {
2694     int len = self->length;
2695     Py_UNICODE *s = self->str;
2696     int status = 0;
2697
2698     while (len-- > 0) {
2699         register Py_UNICODE ch;
2700
2701         ch = Py_UNICODE_TOLOWER(*s);
2702         if (ch != *s) {
2703             status = 1;
2704             *s = ch;
2705         }
2706         s++;
2707     }
2708
2709     return status;
2710 }
2711
2712 static
2713 int fixswapcase(PyUnicodeObject *self)
2714 {
2715     int len = self->length;
2716     Py_UNICODE *s = self->str;
2717     int status = 0;
2718
2719     while (len-- > 0) {
2720         if (Py_UNICODE_ISUPPER(*s)) {
2721             *s = Py_UNICODE_TOLOWER(*s);
2722             status = 1;
2723         } else if (Py_UNICODE_ISLOWER(*s)) {
2724             *s = Py_UNICODE_TOUPPER(*s);
2725             status = 1;
2726         }
2727         s++;
2728     }
2729
2730     return status;
2731 }
2732
2733 static
2734 int fixcapitalize(PyUnicodeObject *self)
2735 {
2736     int len = self->length;
2737     Py_UNICODE *s = self->str;
2738     int status = 0;
2739
2740     if (len == 0)
2741         return 0;
2742     if (Py_UNICODE_ISLOWER(*s)) {
2743         *s = Py_UNICODE_TOUPPER(*s);
2744         status = 1;
2745     }
2746     s++;
2747     while (--len > 0) {
2748         if (Py_UNICODE_ISUPPER(*s)) {
2749             *s = Py_UNICODE_TOLOWER(*s);
2750             status = 1;
2751         }
2752         s++;
2753     }
2754     return status;
2755 }
2756
2757 static
2758 int fixtitle(PyUnicodeObject *self)
2759 {
2760     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2761     register Py_UNICODE *e;
2762     int previous_is_cased;
2763
2764     /* Shortcut for single character strings */
2765     if (PyUnicode_GET_SIZE(self) == 1) {
2766         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2767         if (*p != ch) {
2768             *p = ch;
2769             return 1;
2770         }
2771         else
2772             return 0;
2773     }
2774
2775     e = p + PyUnicode_GET_SIZE(self);
2776     previous_is_cased = 0;
2777     for (; p < e; p++) {
2778         register const Py_UNICODE ch = *p;
2779
2780         if (previous_is_cased)
2781             *p = Py_UNICODE_TOLOWER(ch);
2782         else
2783             *p = Py_UNICODE_TOTITLE(ch);
2784
2785         if (Py_UNICODE_ISLOWER(ch) ||
2786             Py_UNICODE_ISUPPER(ch) ||
2787             Py_UNICODE_ISTITLE(ch))
2788             previous_is_cased = 1;
2789         else
2790             previous_is_cased = 0;
2791     }
2792     return 1;
2793 }
2794
2795 PyObject *PyUnicode_Join(PyObject *separator,
2796                          PyObject *seq)
2797 {
2798     Py_UNICODE *sep;
2799     int seplen;
2800     PyUnicodeObject *res = NULL;
2801     int reslen = 0;
2802     Py_UNICODE *p;
2803     int sz = 100;
2804     int i;
2805     PyObject *it;
2806
2807     it = PyObject_GetIter(seq);
2808     if (it == NULL)
2809         return NULL;
2810
2811     if (separator == NULL) {
2812         Py_UNICODE blank = ' ';
2813         sep = &blank;
2814         seplen = 1;
2815     }
2816     else {
2817         separator = PyUnicode_FromObject(separator);
2818         if (separator == NULL)
2819             goto onError;
2820         sep = PyUnicode_AS_UNICODE(separator);
2821         seplen = PyUnicode_GET_SIZE(separator);
2822     }
2823
2824     res = _PyUnicode_New(sz);
2825     if (res == NULL)
2826         goto onError;
2827     p = PyUnicode_AS_UNICODE(res);
2828     reslen = 0;
2829
2830     for (i = 0; ; ++i) {
2831         int itemlen;
2832         PyObject *item = PyIter_Next(it);
2833         if (item == NULL) {
2834             if (PyErr_Occurred())
2835                 goto onError;
2836             break;
2837         }
2838         if (!PyUnicode_Check(item)) {
2839             PyObject *v;
2840             v = PyUnicode_FromObject(item);
2841             Py_DECREF(item);
2842             item = v;
2843             if (item == NULL)
2844                 goto onError;
2845         }
2846         itemlen = PyUnicode_GET_SIZE(item);
2847         while (reslen + itemlen + seplen >= sz) {
2848             if (_PyUnicode_Resize(&res, sz*2))
2849                 goto onError;
2850             sz *= 2;
2851             p = PyUnicode_AS_UNICODE(res) + reslen;
2852         }
2853         if (i > 0) {
2854             Py_UNICODE_COPY(p, sep, seplen);
2855             p += seplen;
2856             reslen += seplen;
2857         }
2858         Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
2859         p += itemlen;
2860         reslen += itemlen;
2861         Py_DECREF(item);
2862     }
2863     if (_PyUnicode_Resize(&res, reslen))
2864         goto onError;
2865
2866     Py_XDECREF(separator);
2867     Py_DECREF(it);
2868     return (PyObject *)res;
2869
2870  onError:
2871     Py_XDECREF(separator);
2872     Py_XDECREF(res);
2873     Py_DECREF(it);
2874     return NULL;
2875 }
2876
2877 static
2878 PyUnicodeObject *pad(PyUnicodeObject *self,
2879                      int left,
2880                      int right,
2881                      Py_UNICODE fill)
2882 {
2883     PyUnicodeObject *u;
2884
2885     if (left < 0)
2886         left = 0;
2887     if (right < 0)
2888         right = 0;
2889
2890     if (left == 0 && right == 0) {
2891         Py_INCREF(self);
2892         return self;
2893     }
2894
2895     u = _PyUnicode_New(left + self->length + right);
2896     if (u) {
2897         if (left)
2898             Py_UNICODE_FILL(u->str, fill, left);
2899         Py_UNICODE_COPY(u->str + left, self->str, self->length);
2900         if (right)
2901             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2902     }
2903
2904     return u;
2905 }
2906
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion relies on the enclosing function providing a PyObject*
   variable `str`, a list object `list` and an `onError` label, which is
   jumped to when creating the slice or appending it fails.  The macro
   expands to several statements and ends in an else clause, so it must
   be used as a statement of its own inside a braced block.  Arguments
   are parenthesized so expression arguments expand safely. */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);
2917
2918 static
2919 PyObject *split_whitespace(PyUnicodeObject *self,
2920                            PyObject *list,
2921                            int maxcount)
2922 {
2923     register int i;
2924     register int j;
2925     int len = self->length;
2926     PyObject *str;
2927
2928     for (i = j = 0; i < len; ) {
2929         /* find a token */
2930         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2931             i++;
2932         j = i;
2933         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2934             i++;
2935         if (j < i) {
2936             if (maxcount-- <= 0)
2937                 break;
2938             SPLIT_APPEND(self->str, j, i);
2939             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2940                 i++;
2941             j = i;
2942         }
2943     }
2944     if (j < len) {
2945         SPLIT_APPEND(self->str, j, len);
2946     }
2947     return list;
2948
2949  onError:
2950     Py_DECREF(list);
2951     return NULL;
2952 }
2953
2954 PyObject *PyUnicode_Splitlines(PyObject *string,
2955                                int keepends)
2956 {
2957     register int i;
2958     register int j;
2959     int len;
2960     PyObject *list;
2961     PyObject *str;
2962     Py_UNICODE *data;
2963
2964     string = PyUnicode_FromObject(string);
2965     if (string == NULL)
2966         return NULL;
2967     data = PyUnicode_AS_UNICODE(string);
2968     len = PyUnicode_GET_SIZE(string);
2969
2970     list = PyList_New(0);
2971     if (!list)
2972         goto onError;
2973
2974     for (i = j = 0; i < len; ) {
2975         int eol;
2976
2977         /* Find a line and append it */
2978         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2979             i++;
2980
2981         /* Skip the line break reading CRLF as one line break */
2982         eol = i;
2983         if (i < len) {
2984             if (data[i] == '\r' && i + 1 < len &&
2985                 data[i+1] == '\n')
2986                 i += 2;
2987             else
2988                 i++;
2989             if (keepends)
2990                 eol = i;
2991         }
2992         SPLIT_APPEND(data, j, eol);
2993         j = i;
2994     }
2995     if (j < len) {
2996         SPLIT_APPEND(data, j, len);
2997     }
2998
2999     Py_DECREF(string);
3000     return list;
3001
3002  onError:
3003     Py_DECREF(list);
3004     Py_DECREF(string);
3005     return NULL;
3006 }
3007
3008 static
3009 PyObject *split_char(PyUnicodeObject *self,
3010                      PyObject *list,
3011                      Py_UNICODE ch,
3012                      int maxcount)
3013 {
3014     register int i;
3015     register int j;
3016     int len = self->length;
3017     PyObject *str;
3018
3019     for (i = j = 0; i < len; ) {
3020         if (self->str[i] == ch) {
3021             if (maxcount-- <= 0)
3022                 break;
3023             SPLIT_APPEND(self->str, j, i);
3024             i = j = i + 1;
3025         } else
3026             i++;
3027     }
3028     if (j <= len) {
3029         SPLIT_APPEND(self->str, j, len);
3030     }
3031     return list;
3032
3033  onError:
3034     Py_DECREF(list);
3035     return NULL;
3036 }
3037
3038 static
3039 PyObject *split_substring(PyUnicodeObject *self,
3040                           PyObject *list,
3041                           PyUnicodeObject *substring,
3042                           int maxcount)
3043 {
3044     register int i;
3045     register int j;
3046     int len = self->length;
3047     int sublen = substring->length;
3048     PyObject *str;
3049
3050     for (i = j = 0; i <= len - sublen; ) {
3051         if (Py_UNICODE_MATCH(self, i, substring)) {
3052             if (maxcount-- <= 0)
3053                 break;
3054             SPLIT_APPEND(self->str, j, i);
3055             i = j = i + sublen;
3056         } else
3057             i++;
3058     }
3059     if (j <= len) {
3060         SPLIT_APPEND(self->str, j, len);
3061     }
3062     return list;
3063
3064  onError:
3065     Py_DECREF(list);
3066     return NULL;
3067 }
3068
3069 #undef SPLIT_APPEND
3070
3071 static
3072 PyObject *split(PyUnicodeObject *self,
3073                 PyUnicodeObject *substring,
3074                 int maxcount)
3075 {
3076     PyObject *list;
3077
3078     if (maxcount < 0)
3079         maxcount = INT_MAX;
3080
3081     list = PyList_New(0);
3082     if (!list)
3083         return NULL;
3084
3085     if (substring == NULL)
3086         return split_whitespace(self,list,maxcount);
3087
3088     else if (substring->length == 1)
3089         return split_char(self,list,substring->str[0],maxcount);
3090
3091     else if (substring->length == 0) {
3092         Py_DECREF(list);
3093         PyErr_SetString(PyExc_ValueError, "empty separator");
3094         return NULL;
3095     }
3096     else
3097         return split_substring(self,list,substring,maxcount);
3098 }
3099
3100 static
3101 PyObject *strip(PyUnicodeObject *self,
3102                 int left,
3103                 int right)
3104 {
3105     Py_UNICODE *p = self->str;
3106     int start = 0;
3107     int end = self->length;
3108
3109     if (left)
3110         while (start < end && Py_UNICODE_ISSPACE(p[start]))
3111             start++;
3112
3113     if (right)
3114         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3115             end--;
3116
3117     if (start == 0 && end == self->length) {
3118         /* couldn't strip anything off, return original string */
3119         Py_INCREF(self);
3120         return (PyObject*) self;
3121     }
3122
3123     return (PyObject*) PyUnicode_FromUnicode(
3124         self->str + start,
3125         end - start
3126         );
3127 }
3128
3129 static
3130 PyObject *replace(PyUnicodeObject *self,
3131                   PyUnicodeObject *str1,
3132                   PyUnicodeObject *str2,
3133                   int maxcount)
3134 {
3135     PyUnicodeObject *u;
3136
3137     if (maxcount < 0)
3138         maxcount = INT_MAX;
3139
3140     if (str1->length == 1 && str2->length == 1) {
3141         int i;
3142
3143         /* replace characters */
3144         if (!findchar(self->str, self->length, str1->str[0])) {
3145             /* nothing to replace, return original string */
3146             Py_INCREF(self);
3147             u = self;
3148         } else {
3149             Py_UNICODE u1 = str1->str[0];
3150             Py_UNICODE u2 = str2->str[0];
3151
3152             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3153                 NULL,
3154                 self->length
3155                 );
3156             if (u != NULL) {
3157                 Py_UNICODE_COPY(u->str, self->str,
3158                                 self->length);
3159                 for (i = 0; i < u->length; i++)
3160                     if (u->str[i] == u1) {
3161                         if (--maxcount < 0)
3162                             break;
3163                         u->str[i] = u2;
3164                     }
3165         }
3166         }
3167
3168     } else {
3169         int n, i;
3170         Py_UNICODE *p;
3171
3172         /* replace strings */
3173         n = count(self, 0, self->length, str1);
3174         if (n > maxcount)
3175             n = maxcount;
3176         if (n == 0) {
3177             /* nothing to replace, return original string */
3178             Py_INCREF(self);
3179             u = self;
3180         } else {
3181             u = _PyUnicode_New(
3182                 self->length + n * (str2->length - str1->length));
3183             if (u) {
3184                 i = 0;
3185                 p = u->str;
3186                 while (i <= self->length - str1->length)
3187                     if (Py_UNICODE_MATCH(self, i, str1)) {
3188                         /* replace string segment */
3189                         Py_UNICODE_COPY(p, str2->str, str2->length);
3190                         p += str2->length;
3191                         i += str1->length;
3192                         if (--n <= 0) {
3193                             /* copy remaining part */
3194                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3195                             break;
3196                         }
3197                     } else
3198                         *p++ = self->str[i++];
3199             }
3200         }
3201     }
3202
3203     return (PyObject *) u;
3204 }
3205
3206 /* --- Unicode Object Methods --------------------------------------------- */
3207
3208 static char title__doc__[] =
3209 "S.title() -> unicode\n\
3210 \n\
3211 Return a titlecased version of S, i.e. words start with title case\n\
3212 characters, all remaining cased characters have lower case.";
3213
3214 static PyObject*
3215 unicode_title(PyUnicodeObject *self, PyObject *args)
3216 {
3217     if (!PyArg_NoArgs(args))
3218         return NULL;
3219     return fixup(self, fixtitle);
3220 }
3221
3222 static char capitalize__doc__[] =
3223 "S.capitalize() -> unicode\n\
3224 \n\
3225 Return a capitalized version of S, i.e. make the first character\n\
3226 have upper case.";
3227
3228 static PyObject*
3229 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3230 {
3231     if (!PyArg_NoArgs(args))
3232         return NULL;
3233     return fixup(self, fixcapitalize);
3234 }
3235
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split on whitespace, capitalize
   every word in place in the list, then join the words again with
   PyUnicode_Join() using a NULL separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                  fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3276
3277 static char center__doc__[] =
3278 "S.center(width) -> unicode\n\
3279 \n\
3280 Return S centered in a Unicode string of length width. Padding is done\n\
3281 using spaces.";
3282
3283 static PyObject *
3284 unicode_center(PyUnicodeObject *self, PyObject *args)
3285 {
3286     int marg, left;
3287     int width;
3288
3289     if (!PyArg_ParseTuple(args, "i:center", &width))
3290         return NULL;
3291
3292     if (self->length >= width) {
3293         Py_INCREF(self);
3294         return (PyObject*) self;
3295     }
3296
3297     marg = width - self->length;
3298     left = marg / 2 + (marg & width & 1);
3299
3300     return (PyObject*) pad(self, left, marg - left, ' ');
3301 }
3302
3303 #if 0
3304
3305 /* This code should go into some future Unicode collation support
3306    module. The basic comparison should compare ordinals on a naive
3307    basis (this is what Java does and thus JPython too). */
3308
3309 /* speedy UTF-16 code point order comparison */
3310 /* gleaned from: */
3311 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3312
3313 static short utf16Fixup[32] =
3314 {
3315     0, 0, 0, 0, 0, 0, 0, 0,
3316     0, 0, 0, 0, 0, 0, 0, 0,
3317     0, 0, 0, 0, 0, 0, 0, 0,
3318     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3319 };
3320
3321 static int
3322 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3323 {
3324     int len1, len2;
3325
3326     Py_UNICODE *s1 = str1->str;
3327     Py_UNICODE *s2 = str2->str;
3328
3329     len1 = str1->length;
3330     len2 = str2->length;
3331
3332     while (len1 > 0 && len2 > 0) {
3333         Py_UNICODE c1, c2;
3334
3335         c1 = *s1++;
3336         c2 = *s2++;
3337
3338         if (c1 > (1<<11) * 26)
3339             c1 += utf16Fixup[c1>>11];
3340         if (c2 > (1<<11) * 26)
3341             c2 += utf16Fixup[c2>>11];
3342         /* now c1 and c2 are in UTF-32-compatible order */
3343
3344         if (c1 != c2)
3345             return (c1 < c2) ? -1 : 1;
3346
3347         len1--; len2--;
3348     }
3349
3350     return (len1 < len2) ? -1 : (len1 != len2);
3351 }
3352
3353 #else
3354
3355 static int
3356 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3357 {
3358     register int len1, len2;
3359
3360     Py_UNICODE *s1 = str1->str;
3361     Py_UNICODE *s2 = str2->str;
3362
3363     len1 = str1->length;
3364     len2 = str2->length;
3365
3366     while (len1 > 0 && len2 > 0) {
3367         Py_UNICODE c1, c2;
3368
3369         c1 = *s1++;
3370         c2 = *s2++;
3371
3372         if (c1 != c2)
3373             return (c1 < c2) ? -1 : 1;
3374
3375         len1--; len2--;
3376     }
3377
3378     return (len1 < len2) ? -1 : (len1 != len2);
3379 }
3380
3381 #endif
3382
3383 int PyUnicode_Compare(PyObject *left,
3384                       PyObject *right)
3385 {
3386     PyUnicodeObject *u = NULL, *v = NULL;
3387     int result;
3388
3389     /* Coerce the two arguments */
3390     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3391     if (u == NULL)
3392         goto onError;
3393     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3394     if (v == NULL)
3395         goto onError;
3396
3397     /* Shortcut for empty or interned objects */
3398     if (v == u) {
3399         Py_DECREF(u);
3400         Py_DECREF(v);
3401         return 0;
3402     }
3403
3404     result = unicode_compare(u, v);
3405
3406     Py_DECREF(u);
3407     Py_DECREF(v);
3408     return result;
3409
3410 onError:
3411     Py_XDECREF(u);
3412     Py_XDECREF(v);
3413     return -1;
3414 }
3415
3416 int PyUnicode_Contains(PyObject *container,
3417                        PyObject *element)
3418 {
3419     PyUnicodeObject *u = NULL, *v = NULL;
3420     int result;
3421     register const Py_UNICODE *p, *e;
3422     register Py_UNICODE ch;
3423
3424     /* Coerce the two arguments */
3425     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3426     if (v == NULL) {
3427         PyErr_SetString(PyExc_TypeError,
3428             "'in <string>' requires character as left operand");
3429         goto onError;
3430     }
3431     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3432     if (u == NULL) {
3433         Py_DECREF(v);
3434         goto onError;
3435     }
3436
3437     /* Check v in u */
3438     if (PyUnicode_GET_SIZE(v) != 1) {
3439         PyErr_SetString(PyExc_TypeError,
3440             "'in <string>' requires character as left operand");
3441         goto onError;
3442     }
3443     ch = *PyUnicode_AS_UNICODE(v);
3444     p = PyUnicode_AS_UNICODE(u);
3445     e = p + PyUnicode_GET_SIZE(u);
3446     result = 0;
3447     while (p < e) {
3448         if (*p++ == ch) {
3449             result = 1;
3450             break;
3451         }
3452     }
3453
3454     Py_DECREF(u);
3455     Py_DECREF(v);
3456     return result;
3457
3458 onError:
3459     Py_XDECREF(u);
3460     Py_XDECREF(v);
3461     return -1;
3462 }
3463
3464 /* Concat to string or Unicode object giving a new Unicode object. */
3465
3466 PyObject *PyUnicode_Concat(PyObject *left,
3467                            PyObject *right)
3468 {
3469     PyUnicodeObject *u = NULL, *v = NULL, *w;
3470
3471     /* Coerce the two arguments */
3472     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3473     if (u == NULL)
3474         goto onError;
3475     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3476     if (v == NULL)
3477         goto onError;
3478
3479     /* Shortcuts */
3480     if (v == unicode_empty) {
3481         Py_DECREF(v);
3482         return (PyObject *)u;
3483     }
3484     if (u == unicode_empty) {
3485         Py_DECREF(u);
3486         return (PyObject *)v;
3487     }
3488
3489     /* Concat the two Unicode strings */
3490     w = _PyUnicode_New(u->length + v->length);
3491     if (w == NULL)
3492         goto onError;
3493     Py_UNICODE_COPY(w->str, u->str, u->length);
3494     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3495
3496     Py_DECREF(u);
3497     Py_DECREF(v);
3498     return (PyObject *)w;
3499
3500 onError:
3501     Py_XDECREF(u);
3502     Py_XDECREF(v);
3503     return NULL;
3504 }
3505
3506 static char count__doc__[] =
3507 "S.count(sub[, start[, end]]) -> int\n\
3508 \n\
3509 Return the number of occurrences of substring sub in Unicode string\n\
3510 S[start:end].  Optional arguments start and end are\n\
3511 interpreted as in slice notation.";
3512
3513 static PyObject *
3514 unicode_count(PyUnicodeObject *self, PyObject *args)
3515 {
3516     PyUnicodeObject *substring;
3517     int start = 0;
3518     int end = INT_MAX;
3519     PyObject *result;
3520
3521     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3522                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3523         return NULL;
3524
3525     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3526                                                 (PyObject *)substring);
3527     if (substring == NULL)
3528         return NULL;
3529
3530     if (start < 0)
3531         start += self->length;
3532     if (start < 0)
3533         start = 0;
3534     if (end > self->length)
3535         end = self->length;
3536     if (end < 0)
3537         end += self->length;
3538     if (end < 0)
3539         end = 0;
3540
3541     result = PyInt_FromLong((long) count(self, start, end, substring));
3542
3543     Py_DECREF(substring);
3544     return result;
3545 }
3546
3547 static char encode__doc__[] =
3548 "S.encode([encoding[,errors]]) -> string\n\
3549 \n\
3550 Return an encoded string version of S. Default encoding is the current\n\
3551 default string encoding. errors may be given to set a different error\n\
3552 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3553 a ValueError. Other possible values are 'ignore' and 'replace'.";
3554
3555 static PyObject *
3556 unicode_encode(PyUnicodeObject *self, PyObject *args)
3557 {
3558     char *encoding = NULL;
3559     char *errors = NULL;
3560     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3561         return NULL;
3562     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3563 }
3564
3565 static char expandtabs__doc__[] =
3566 "S.expandtabs([tabsize]) -> unicode\n\
3567 \n\
3568 Return a copy of S where all tab characters are expanded using spaces.\n\
3569 If tabsize is not given, a tab size of 8 characters is assumed.";
3570
3571 static PyObject*
3572 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3573 {
3574     Py_UNICODE *e;
3575     Py_UNICODE *p;
3576     Py_UNICODE *q;
3577     int i, j;
3578     PyUnicodeObject *u;
3579     int tabsize = 8;
3580
3581     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3582         return NULL;
3583
3584     /* First pass: determine size of output string */
3585     i = j = 0;
3586     e = self->str + self->length;
3587     for (p = self->str; p < e; p++)
3588         if (*p == '\t') {
3589             if (tabsize > 0)
3590                 j += tabsize - (j % tabsize);
3591         }
3592         else {
3593             j++;
3594             if (*p == '\n' || *p == '\r') {
3595                 i += j;
3596                 j = 0;
3597             }
3598         }
3599
3600     /* Second pass: create output string and fill it */
3601     u = _PyUnicode_New(i + j);
3602     if (!u)
3603         return NULL;
3604
3605     j = 0;
3606     q = u->str;
3607
3608     for (p = self->str; p < e; p++)
3609         if (*p == '\t') {
3610             if (tabsize > 0) {
3611                 i = tabsize - (j % tabsize);
3612                 j += i;
3613                 while (i--)
3614                     *q++ = ' ';
3615             }
3616         }
3617         else {
3618             j++;
3619             *q++ = *p;
3620             if (*p == '\n' || *p == '\r')
3621                 j = 0;
3622         }
3623
3624     return (PyObject*) u;
3625 }
3626
3627 static char find__doc__[] =
3628 "S.find(sub [,start [,end]]) -> int\n\
3629 \n\
3630 Return the lowest index in S where substring sub is found,\n\
3631 such that sub is contained within s[start,end].  Optional\n\
3632 arguments start and end are interpreted as in slice notation.\n\
3633 \n\
3634 Return -1 on failure.";
3635
3636 static PyObject *
3637 unicode_find(PyUnicodeObject *self, PyObject *args)
3638 {
3639     PyUnicodeObject *substring;
3640     int start = 0;
3641     int end = INT_MAX;
3642     PyObject *result;
3643
3644     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3645                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3646         return NULL;
3647     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3648                                                 (PyObject *)substring);
3649     if (substring == NULL)
3650         return NULL;
3651
3652     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3653
3654     Py_DECREF(substring);
3655     return result;
3656 }
3657
3658 static PyObject *
3659 unicode_getitem(PyUnicodeObject *self, int index)
3660 {
3661     if (index < 0 || index >= self->length) {
3662         PyErr_SetString(PyExc_IndexError, "string index out of range");
3663         return NULL;
3664     }
3665
3666     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3667 }
3668
3669 static long
3670 unicode_hash(PyUnicodeObject *self)
3671 {
3672     /* Since Unicode objects compare equal to their ASCII string
3673        counterparts, they should use the individual character values
3674        as basis for their hash value.  This is needed to assure that
3675        strings and Unicode objects behave in the same way as
3676        dictionary keys. */
3677
3678     register int len;
3679     register Py_UNICODE *p;
3680     register long x;
3681
3682     if (self->hash != -1)
3683         return self->hash;
3684     len = PyUnicode_GET_SIZE(self);
3685     p = PyUnicode_AS_UNICODE(self);
3686     x = *p << 7;
3687     while (--len >= 0)
3688         x = (1000003*x) ^ *p++;
3689     x ^= PyUnicode_GET_SIZE(self);
3690     if (x == -1)
3691         x = -2;
3692     self->hash = x;
3693     return x;
3694 }
3695
3696 static char index__doc__[] =
3697 "S.index(sub [,start [,end]]) -> int\n\
3698 \n\
3699 Like S.find() but raise ValueError when the substring is not found.";
3700
3701 static PyObject *
3702 unicode_index(PyUnicodeObject *self, PyObject *args)
3703 {
3704     int result;
3705     PyUnicodeObject *substring;
3706     int start = 0;
3707     int end = INT_MAX;
3708
3709     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3710                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3711         return NULL;
3712
3713     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3714                                                 (PyObject *)substring);
3715     if (substring == NULL)
3716         return NULL;
3717
3718     result = findstring(self, substring, start, end, 1);
3719
3720     Py_DECREF(substring);
3721     if (result < 0) {
3722         PyErr_SetString(PyExc_ValueError, "substring not found");
3723         return NULL;
3724     }
3725     return PyInt_FromLong(result);
3726 }
3727
3728 static char islower__doc__[] =
3729 "S.islower() -> int\n\
3730 \n\
3731 Return 1 if  all cased characters in S are lowercase and there is\n\
3732 at least one cased character in S, 0 otherwise.";
3733
3734 static PyObject*
3735 unicode_islower(PyUnicodeObject *self, PyObject *args)
3736 {
3737     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3738     register const Py_UNICODE *e;
3739     int cased;
3740
3741     if (!PyArg_NoArgs(args))
3742         return NULL;
3743
3744     /* Shortcut for single character strings */
3745     if (PyUnicode_GET_SIZE(self) == 1)
3746         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3747
3748     /* Special case for empty strings */
3749     if (PyString_GET_SIZE(self) == 0)
3750         return PyInt_FromLong(0);
3751
3752     e = p + PyUnicode_GET_SIZE(self);
3753     cased = 0;
3754     for (; p < e; p++) {
3755         register const Py_UNICODE ch = *p;
3756
3757         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3758             return PyInt_FromLong(0);
3759         else if (!cased && Py_UNICODE_ISLOWER(ch))
3760             cased = 1;
3761     }
3762     return PyInt_FromLong(cased);
3763 }
3764
3765 static char isupper__doc__[] =
3766 "S.isupper() -> int\n\
3767 \n\
3768 Return 1 if  all cased characters in S are uppercase and there is\n\
3769 at least one cased character in S, 0 otherwise.";
3770
3771 static PyObject*
3772 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3773 {
3774     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3775     register const Py_UNICODE *e;
3776     int cased;
3777
3778     if (!PyArg_NoArgs(args))
3779         return NULL;
3780
3781     /* Shortcut for single character strings */
3782     if (PyUnicode_GET_SIZE(self) == 1)
3783         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3784
3785     /* Special case for empty strings */
3786     if (PyString_GET_SIZE(self) == 0)
3787         return PyInt_FromLong(0);
3788
3789     e = p + PyUnicode_GET_SIZE(self);
3790     cased = 0;
3791     for (; p < e; p++) {
3792         register const Py_UNICODE ch = *p;
3793
3794         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3795             return PyInt_FromLong(0);
3796         else if (!cased && Py_UNICODE_ISUPPER(ch))
3797             cased = 1;
3798     }
3799     return PyInt_FromLong(cased);
3800 }
3801
3802 static char istitle__doc__[] =
3803 "S.istitle() -> int\n\
3804 \n\
3805 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3806 may only follow uncased characters and lowercase characters only cased\n\
3807 ones. Return 0 otherwise.";
3808
3809 static PyObject*
3810 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3811 {
3812     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813     register const Py_UNICODE *e;
3814     int cased, previous_is_cased;
3815
3816     if (!PyArg_NoArgs(args))
3817         return NULL;
3818
3819     /* Shortcut for single character strings */
3820     if (PyUnicode_GET_SIZE(self) == 1)
3821         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3822                               (Py_UNICODE_ISUPPER(*p) != 0));
3823
3824     /* Special case for empty strings */
3825     if (PyString_GET_SIZE(self) == 0)
3826         return PyInt_FromLong(0);
3827
3828     e = p + PyUnicode_GET_SIZE(self);
3829     cased = 0;
3830     previous_is_cased = 0;
3831     for (; p < e; p++) {
3832         register const Py_UNICODE ch = *p;
3833
3834         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3835             if (previous_is_cased)
3836                 return PyInt_FromLong(0);
3837             previous_is_cased = 1;
3838             cased = 1;
3839         }
3840         else if (Py_UNICODE_ISLOWER(ch)) {
3841             if (!previous_is_cased)
3842                 return PyInt_FromLong(0);
3843             previous_is_cased = 1;
3844             cased = 1;
3845         }
3846         else
3847             previous_is_cased = 0;
3848     }
3849     return PyInt_FromLong(cased);
3850 }
3851
3852 static char isspace__doc__[] =
3853 "S.isspace() -> int\n\
3854 \n\
3855 Return 1 if there are only whitespace characters in S,\n\
3856 0 otherwise.";
3857
3858 static PyObject*
3859 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3860 {
3861     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862     register const Py_UNICODE *e;
3863
3864     if (!PyArg_NoArgs(args))
3865         return NULL;
3866
3867     /* Shortcut for single character strings */
3868     if (PyUnicode_GET_SIZE(self) == 1 &&
3869         Py_UNICODE_ISSPACE(*p))
3870         return PyInt_FromLong(1);
3871
3872     /* Special case for empty strings */
3873     if (PyString_GET_SIZE(self) == 0)
3874         return PyInt_FromLong(0);
3875
3876     e = p + PyUnicode_GET_SIZE(self);
3877     for (; p < e; p++) {
3878         if (!Py_UNICODE_ISSPACE(*p))
3879             return PyInt_FromLong(0);
3880     }
3881     return PyInt_FromLong(1);
3882 }
3883
3884 static char isalpha__doc__[] =
3885 "S.isalpha() -> int\n\
3886 \n\
3887 Return 1 if  all characters in S are alphabetic\n\
3888 and there is at least one character in S, 0 otherwise.";
3889
3890 static PyObject*
3891 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3892 {
3893     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894     register const Py_UNICODE *e;
3895
3896     if (!PyArg_NoArgs(args))
3897         return NULL;
3898
3899     /* Shortcut for single character strings */
3900     if (PyUnicode_GET_SIZE(self) == 1 &&
3901         Py_UNICODE_ISALPHA(*p))
3902         return PyInt_FromLong(1);
3903
3904     /* Special case for empty strings */
3905     if (PyString_GET_SIZE(self) == 0)
3906         return PyInt_FromLong(0);
3907
3908     e = p + PyUnicode_GET_SIZE(self);
3909     for (; p < e; p++) {
3910         if (!Py_UNICODE_ISALPHA(*p))
3911             return PyInt_FromLong(0);
3912     }
3913     return PyInt_FromLong(1);
3914 }
3915
3916 static char isalnum__doc__[] =
3917 "S.isalnum() -> int\n\
3918 \n\
3919 Return 1 if  all characters in S are alphanumeric\n\
3920 and there is at least one character in S, 0 otherwise.";
3921
3922 static PyObject*
3923 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3924 {
3925     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3926     register const Py_UNICODE *e;
3927
3928     if (!PyArg_NoArgs(args))
3929         return NULL;
3930
3931     /* Shortcut for single character strings */
3932     if (PyUnicode_GET_SIZE(self) == 1 &&
3933         Py_UNICODE_ISALNUM(*p))
3934         return PyInt_FromLong(1);
3935
3936     /* Special case for empty strings */
3937     if (PyString_GET_SIZE(self) == 0)
3938         return PyInt_FromLong(0);
3939
3940     e = p + PyUnicode_GET_SIZE(self);
3941     for (; p < e; p++) {
3942         if (!Py_UNICODE_ISALNUM(*p))
3943             return PyInt_FromLong(0);
3944     }
3945     return PyInt_FromLong(1);
3946 }
3947
3948 static char isdecimal__doc__[] =
3949 "S.isdecimal() -> int\n\
3950 \n\
3951 Return 1 if there are only decimal characters in S,\n\
3952 0 otherwise.";
3953
3954 static PyObject*
3955 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3956 {
3957     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3958     register const Py_UNICODE *e;
3959
3960     if (!PyArg_NoArgs(args))
3961         return NULL;
3962
3963     /* Shortcut for single character strings */
3964     if (PyUnicode_GET_SIZE(self) == 1 &&
3965         Py_UNICODE_ISDECIMAL(*p))
3966         return PyInt_FromLong(1);
3967
3968     /* Special case for empty strings */
3969     if (PyString_GET_SIZE(self) == 0)
3970         return PyInt_FromLong(0);
3971
3972     e = p + PyUnicode_GET_SIZE(self);
3973     for (; p < e; p++) {
3974         if (!Py_UNICODE_ISDECIMAL(*p))
3975             return PyInt_FromLong(0);
3976     }
3977     return PyInt_FromLong(1);
3978 }
3979
3980 static char isdigit__doc__[] =
3981 "S.isdigit() -> int\n\
3982 \n\
3983 Return 1 if there are only digit characters in S,\n\
3984 0 otherwise.";
3985
3986 static PyObject*
3987 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3988 {
3989     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3990     register const Py_UNICODE *e;
3991
3992     if (!PyArg_NoArgs(args))
3993         return NULL;
3994
3995     /* Shortcut for single character strings */
3996     if (PyUnicode_GET_SIZE(self) == 1 &&
3997         Py_UNICODE_ISDIGIT(*p))
3998         return PyInt_FromLong(1);
3999
4000     /* Special case for empty strings */
4001     if (PyString_GET_SIZE(self) == 0)
4002         return PyInt_FromLong(0);
4003
4004     e = p + PyUnicode_GET_SIZE(self);
4005     for (; p < e; p++) {
4006         if (!Py_UNICODE_ISDIGIT(*p))
4007             return PyInt_FromLong(0);
4008     }
4009     return PyInt_FromLong(1);
4010 }
4011
4012 static char isnumeric__doc__[] =
4013 "S.isnumeric() -> int\n\
4014 \n\
4015 Return 1 if there are only numeric characters in S,\n\
4016 0 otherwise.";
4017
4018 static PyObject*
4019 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4020 {
4021     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4022     register const Py_UNICODE *e;
4023
4024     if (!PyArg_NoArgs(args))
4025         return NULL;
4026
4027     /* Shortcut for single character strings */
4028     if (PyUnicode_GET_SIZE(self) == 1 &&
4029         Py_UNICODE_ISNUMERIC(*p))
4030         return PyInt_FromLong(1);
4031
4032     /* Special case for empty strings */
4033     if (PyString_GET_SIZE(self) == 0)
4034         return PyInt_FromLong(0);
4035
4036     e = p + PyUnicode_GET_SIZE(self);
4037     for (; p < e; p++) {
4038         if (!Py_UNICODE_ISNUMERIC(*p))
4039             return PyInt_FromLong(0);
4040     }
4041     return PyInt_FromLong(1);
4042 }
4043
4044 static char join__doc__[] =
4045 "S.join(sequence) -> unicode\n\
4046 \n\
4047 Return a string which is the concatenation of the strings in the\n\
4048 sequence.  The separator between elements is S.";
4049
4050 static PyObject*
4051 unicode_join(PyUnicodeObject *self, PyObject *args)
4052 {
4053     PyObject *data;
4054     if (!PyArg_ParseTuple(args, "O:join", &data))
4055         return NULL;
4056
4057     return PyUnicode_Join((PyObject *)self, data);
4058 }
4059
4060 static int
4061 unicode_length(PyUnicodeObject *self)
4062 {
4063     return self->length;
4064 }
4065
4066 static char ljust__doc__[] =
4067 "S.ljust(width) -> unicode\n\
4068 \n\
4069 Return S left justified in a Unicode string of length width. Padding is\n\
4070 done using spaces.";
4071
4072 static PyObject *
4073 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4074 {
4075     int width;
4076     if (!PyArg_ParseTuple(args, "i:ljust", &width))
4077         return NULL;
4078
4079     if (self->length >= width) {
4080         Py_INCREF(self);
4081         return (PyObject*) self;
4082     }
4083
4084     return (PyObject*) pad(self, 0, width - self->length, ' ');
4085 }
4086
4087 static char lower__doc__[] =
4088 "S.lower() -> unicode\n\
4089 \n\
4090 Return a copy of the string S converted to lowercase.";
4091
4092 static PyObject*
4093 unicode_lower(PyUnicodeObject *self, PyObject *args)
4094 {
4095     if (!PyArg_NoArgs(args))
4096         return NULL;
4097     return fixup(self, fixlower);
4098 }
4099
4100 static char lstrip__doc__[] =
4101 "S.lstrip() -> unicode\n\
4102 \n\
4103 Return a copy of the string S with leading whitespace removed.";
4104
4105 static PyObject *
4106 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4107 {
4108     if (!PyArg_NoArgs(args))
4109         return NULL;
4110     return strip(self, 1, 0);
4111 }
4112
4113 static PyObject*
4114 unicode_repeat(PyUnicodeObject *str, int len)
4115 {
4116     PyUnicodeObject *u;
4117     Py_UNICODE *p;
4118     int nchars;
4119     size_t nbytes;
4120
4121     if (len < 0)
4122         len = 0;
4123
4124     if (len == 1) {
4125         /* no repeat, return original string */
4126         Py_INCREF(str);
4127         return (PyObject*) str;
4128     }
4129
4130     /* ensure # of chars needed doesn't overflow int and # of bytes
4131      * needed doesn't overflow size_t
4132      */
4133     nchars = len * str->length;
4134     if (len && nchars / len != str->length) {
4135         PyErr_SetString(PyExc_OverflowError,
4136                         "repeated string is too long");
4137         return NULL;
4138     }
4139     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4140     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4141         PyErr_SetString(PyExc_OverflowError,
4142                         "repeated string is too long");
4143         return NULL;
4144     }
4145     u = _PyUnicode_New(nchars);
4146     if (!u)
4147         return NULL;
4148
4149     p = u->str;
4150
4151     while (len-- > 0) {
4152         Py_UNICODE_COPY(p, str->str, str->length);
4153         p += str->length;
4154     }
4155
4156     return (PyObject*) u;
4157 }
4158
4159 PyObject *PyUnicode_Replace(PyObject *obj,
4160                             PyObject *subobj,
4161                             PyObject *replobj,
4162                             int maxcount)
4163 {
4164     PyObject *self;
4165     PyObject *str1;
4166     PyObject *str2;
4167     PyObject *result;
4168
4169     self = PyUnicode_FromObject(obj);
4170     if (self == NULL)
4171         return NULL;
4172     str1 = PyUnicode_FromObject(subobj);
4173     if (str1 == NULL) {
4174         Py_DECREF(self);
4175         return NULL;
4176     }
4177     str2 = PyUnicode_FromObject(replobj);
4178     if (str2 == NULL) {
4179         Py_DECREF(self);
4180         Py_DECREF(str1);
4181         return NULL;
4182     }
4183     result = replace((PyUnicodeObject *)self,
4184                      (PyUnicodeObject *)str1,
4185                      (PyUnicodeObject *)str2,
4186                      maxcount);
4187     Py_DECREF(self);
4188     Py_DECREF(str1);
4189     Py_DECREF(str2);
4190     return result;
4191 }
4192
4193 static char replace__doc__[] =
4194 "S.replace (old, new[, maxsplit]) -> unicode\n\
4195 \n\
4196 Return a copy of S with all occurrences of substring\n\
4197 old replaced by new.  If the optional argument maxsplit is\n\
4198 given, only the first maxsplit occurrences are replaced.";
4199
4200 static PyObject*
4201 unicode_replace(PyUnicodeObject *self, PyObject *args)
4202 {
4203     PyUnicodeObject *str1;
4204     PyUnicodeObject *str2;
4205     int maxcount = -1;
4206     PyObject *result;
4207
4208     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4209         return NULL;
4210     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4211     if (str1 == NULL)
4212         return NULL;
4213     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4214     if (str2 == NULL)
4215         return NULL;
4216
4217     result = replace(self, str1, str2, maxcount);
4218
4219     Py_DECREF(str1);
4220     Py_DECREF(str2);
4221     return result;
4222 }
4223
4224 static
4225 PyObject *unicode_repr(PyObject *unicode)
4226 {
4227     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4228                                 PyUnicode_GET_SIZE(unicode),
4229                                 1);
4230 }
4231
4232 static char rfind__doc__[] =
4233 "S.rfind(sub [,start [,end]]) -> int\n\
4234 \n\
4235 Return the highest index in S where substring sub is found,\n\
4236 such that sub is contained within s[start,end].  Optional\n\
4237 arguments start and end are interpreted as in slice notation.\n\
4238 \n\
4239 Return -1 on failure.";
4240
4241 static PyObject *
4242 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4243 {
4244     PyUnicodeObject *substring;
4245     int start = 0;
4246     int end = INT_MAX;
4247     PyObject *result;
4248
4249     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4250                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4251         return NULL;
4252     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4253                                                 (PyObject *)substring);
4254     if (substring == NULL)
4255         return NULL;
4256
4257     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4258
4259     Py_DECREF(substring);
4260     return result;
4261 }
4262
4263 static char rindex__doc__[] =
4264 "S.rindex(sub [,start [,end]]) -> int\n\
4265 \n\
4266 Like S.rfind() but raise ValueError when the substring is not found.";
4267
4268 static PyObject *
4269 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4270 {
4271     int result;
4272     PyUnicodeObject *substring;
4273     int start = 0;
4274     int end = INT_MAX;
4275
4276     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4277                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4278         return NULL;
4279     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4280                                                 (PyObject *)substring);
4281     if (substring == NULL)
4282         return NULL;
4283
4284     result = findstring(self, substring, start, end, -1);
4285
4286     Py_DECREF(substring);
4287     if (result < 0) {
4288         PyErr_SetString(PyExc_ValueError, "substring not found");
4289         return NULL;
4290     }
4291     return PyInt_FromLong(result);
4292 }
4293
4294 static char rjust__doc__[] =
4295 "S.rjust(width) -> unicode\n\
4296 \n\
4297 Return S right justified in a Unicode string of length width. Padding is\n\
4298 done using spaces.";
4299
4300 static PyObject *
4301 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4302 {
4303     int width;
4304     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4305         return NULL;
4306
4307     if (self->length >= width) {
4308         Py_INCREF(self);
4309         return (PyObject*) self;
4310     }
4311
4312     return (PyObject*) pad(self, width - self->length, 0, ' ');
4313 }
4314
4315 static char rstrip__doc__[] =
4316 "S.rstrip() -> unicode\n\
4317 \n\
4318 Return a copy of the string S with trailing whitespace removed.";
4319
4320 static PyObject *
4321 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4322 {
4323     if (!PyArg_NoArgs(args))
4324         return NULL;
4325     return strip(self, 0, 1);
4326 }
4327
4328 static PyObject*
4329 unicode_slice(PyUnicodeObject *self, int start, int end)
4330 {
4331     /* standard clamping */
4332     if (start < 0)
4333         start = 0;
4334     if (end < 0)
4335         end = 0;
4336     if (end > self->length)
4337         end = self->length;
4338     if (start == 0 && end == self->length) {
4339         /* full slice, return original string */
4340         Py_INCREF(self);
4341         return (PyObject*) self;
4342     }
4343     if (start > end)
4344         start = end;
4345     /* copy slice */
4346     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4347                                              end - start);
4348 }
4349
4350 PyObject *PyUnicode_Split(PyObject *s,
4351                           PyObject *sep,
4352                           int maxsplit)
4353 {
4354     PyObject *result;
4355
4356     s = PyUnicode_FromObject(s);
4357     if (s == NULL)
4358         return NULL;
4359     if (sep != NULL) {
4360         sep = PyUnicode_FromObject(sep);
4361         if (sep == NULL) {
4362             Py_DECREF(s);
4363             return NULL;
4364         }
4365     }
4366
4367     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4368
4369     Py_DECREF(s);
4370     Py_XDECREF(sep);
4371     return result;
4372 }
4373
4374 static char split__doc__[] =
4375 "S.split([sep [,maxsplit]]) -> list of strings\n\
4376 \n\
4377 Return a list of the words in S, using sep as the\n\
4378 delimiter string.  If maxsplit is given, at most maxsplit\n\
4379 splits are done. If sep is not specified, any whitespace string\n\
4380 is a separator.";
4381
4382 static PyObject*
4383 unicode_split(PyUnicodeObject *self, PyObject *args)
4384 {
4385     PyObject *substring = Py_None;
4386     int maxcount = -1;
4387
4388     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4389         return NULL;
4390
4391     if (substring == Py_None)
4392         return split(self, NULL, maxcount);
4393     else if (PyUnicode_Check(substring))
4394         return split(self, (PyUnicodeObject *)substring, maxcount);
4395     else
4396         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4397 }
4398
4399 static char splitlines__doc__[] =
4400 "S.splitlines([keepends]]) -> list of strings\n\
4401 \n\
4402 Return a list of the lines in S, breaking at line boundaries.\n\
4403 Line breaks are not included in the resulting list unless keepends\n\
4404 is given and true.";
4405
4406 static PyObject*
4407 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4408 {
4409     int keepends = 0;
4410
4411     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4412         return NULL;
4413
4414     return PyUnicode_Splitlines((PyObject *)self, keepends);
4415 }
4416
4417 static
4418 PyObject *unicode_str(PyUnicodeObject *self)
4419 {
4420     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4421 }
4422
4423 static char strip__doc__[] =
4424 "S.strip() -> unicode\n\
4425 \n\
4426 Return a copy of S with leading and trailing whitespace removed.";
4427
4428 static PyObject *
4429 unicode_strip(PyUnicodeObject *self, PyObject *args)
4430 {
4431     if (!PyArg_NoArgs(args))
4432         return NULL;
4433     return strip(self, 1, 1);
4434 }
4435
4436 static char swapcase__doc__[] =
4437 "S.swapcase() -> unicode\n\
4438 \n\
4439 Return a copy of S with uppercase characters converted to lowercase\n\
4440 and vice versa.";
4441
4442 static PyObject*
4443 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4444 {
4445     if (!PyArg_NoArgs(args))
4446         return NULL;
4447     return fixup(self, fixswapcase);
4448 }
4449
4450 static char translate__doc__[] =
4451 "S.translate(table) -> unicode\n\
4452 \n\
4453 Return a copy of the string S, where all characters have been mapped\n\
4454 through the given translation table, which must be a mapping of\n\
4455 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4456 are left untouched. Characters mapped to None are deleted.";
4457
4458 static PyObject*
4459 unicode_translate(PyUnicodeObject *self, PyObject *args)
4460 {
4461     PyObject *table;
4462
4463     if (!PyArg_ParseTuple(args, "O:translate", &table))
4464         return NULL;
4465     return PyUnicode_TranslateCharmap(self->str,
4466                                       self->length,
4467                                       table,
4468                                       "ignore");
4469 }
4470
4471 static char upper__doc__[] =
4472 "S.upper() -> unicode\n\
4473 \n\
4474 Return a copy of S converted to uppercase.";
4475
4476 static PyObject*
4477 unicode_upper(PyUnicodeObject *self, PyObject *args)
4478 {
4479     if (!PyArg_NoArgs(args))
4480         return NULL;
4481     return fixup(self, fixupper);
4482 }
4483
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.
 *
 * Returns S itself (new reference) when it is already at least width
 * long.  Otherwise pads with '0' on the left via pad(); if the first
 * original character is '+' or '-', the sign is moved to the front of
 * the padded result.  Returns NULL when argument parsing or pad()
 * fails.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() can fail; do not touch u->str in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4519
#if 0
/* Debugging aid (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
4529
4530 static char startswith__doc__[] =
4531 "S.startswith(prefix[, start[, end]]) -> int\n\
4532 \n\
4533 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4534 optional start, test S beginning at that position.  With optional end, stop\n\
4535 comparing S at that position.";
4536
4537 static PyObject *
4538 unicode_startswith(PyUnicodeObject *self,
4539                    PyObject *args)
4540 {
4541     PyUnicodeObject *substring;
4542     int start = 0;
4543     int end = INT_MAX;
4544     PyObject *result;
4545
4546     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4547                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4548         return NULL;
4549     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4550                                                 (PyObject *)substring);
4551     if (substring == NULL)
4552         return NULL;
4553
4554     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4555
4556     Py_DECREF(substring);
4557     return result;
4558 }
4559
4560
4561 static char endswith__doc__[] =
4562 "S.endswith(suffix[, start[, end]]) -> int\n\
4563 \n\
4564 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4565 optional start, test S beginning at that position.  With optional end, stop\n\
4566 comparing S at that position.";
4567
4568 static PyObject *
4569 unicode_endswith(PyUnicodeObject *self,
4570                  PyObject *args)
4571 {
4572     PyUnicodeObject *substring;
4573     int start = 0;
4574     int end = INT_MAX;
4575     PyObject *result;
4576
4577     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4578                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4579         return NULL;
4580     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4581                                                 (PyObject *)substring);
4582     if (substring == NULL)
4583         return NULL;
4584
4585     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4586
4587     Py_DECREF(substring);
4588     return result;
4589 }
4590
4591
4592 static PyMethodDef unicode_methods[] = {
4593
4594     /* Order is according to common usage: often used methods should
4595        appear first, since lookup is done sequentially. */
4596
4597     {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4598     {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4599     {"split", (PyCFunction) unicode_split, 1, split__doc__},
4600     {"join", (PyCFunction) unicode_join, 1, join__doc__},
4601     {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4602     {"title", (PyCFunction) unicode_title, 0, title__doc__},
4603     {"center", (PyCFunction) unicode_center, 1, center__doc__},
4604     {"count", (PyCFunction) unicode_count, 1, count__doc__},
4605     {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4606     {"find", (PyCFunction) unicode_find, 1, find__doc__},
4607     {"index", (PyCFunction) unicode_index, 1, index__doc__},
4608     {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4609     {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4610     {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4611 /*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4612     {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4613     {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4614     {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4615     {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4616     {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4617     {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4618     {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4619     {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4620     {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4621     {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4622     {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4623     {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4624     {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4625     {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4626     {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4627     {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4628     {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4629     {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4630     {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4631     {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4632 #if 0
4633     {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4634     {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4635 #endif
4636
4637 #if 0
4638     /* This one is just used for debugging the implementation. */
4639     {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4640 #endif
4641
4642     {NULL, NULL}
4643 };
4644
4645 static PyObject *
4646 unicode_getattr(PyUnicodeObject *self, char *name)
4647 {
4648     return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4649 }
4650
4651 static PySequenceMethods unicode_as_sequence = {
4652     (inquiry) unicode_length,           /* sq_length */
4653     (binaryfunc) PyUnicode_Concat,      /* sq_concat */
4654     (intargfunc) unicode_repeat,        /* sq_repeat */
4655     (intargfunc) unicode_getitem,       /* sq_item */
4656     (intintargfunc) unicode_slice,      /* sq_slice */
4657     0,                                  /* sq_ass_item */
4658     0,                                  /* sq_ass_slice */
4659     (objobjproc)PyUnicode_Contains,     /*sq_contains*/
4660 };
4661
4662 static int
4663 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4664                           int index,
4665                           const void **ptr)
4666 {
4667     if (index != 0) {
4668         PyErr_SetString(PyExc_SystemError,
4669                         "accessing non-existent unicode segment");
4670         return -1;
4671     }
4672     *ptr = (void *) self->str;
4673     return PyUnicode_GET_DATA_SIZE(self);
4674 }
4675
4676 static int
4677 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4678                            const void **ptr)
4679 {
4680     PyErr_SetString(PyExc_TypeError,
4681                     "cannot use unicode as modifyable buffer");
4682     return -1;
4683 }
4684
4685 static int
4686 unicode_buffer_getsegcount(PyUnicodeObject *self,
4687                            int *lenp)
4688 {
4689     if (lenp)
4690         *lenp = PyUnicode_GET_DATA_SIZE(self);
4691     return 1;
4692 }
4693
4694 static int
4695 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4696                           int index,
4697                           const void **ptr)
4698 {
4699     PyObject *str;
4700
4701     if (index != 0) {
4702         PyErr_SetString(PyExc_SystemError,
4703                         "accessing non-existent unicode segment");
4704         return -1;
4705     }
4706     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4707     if (str == NULL)
4708         return -1;
4709     *ptr = (void *) PyString_AS_STRING(str);
4710     return PyString_GET_SIZE(str);
4711 }
4712
4713 /* Helpers for PyUnicode_Format() */
4714
4715 static PyObject *
4716 getnextarg(PyObject *args, int arglen, int *p_argidx)
4717 {
4718     int argidx = *p_argidx;
4719     if (argidx < arglen) {
4720         (*p_argidx)++;
4721         if (arglen < 0)
4722             return args;
4723         else
4724             return PyTuple_GetItem(args, argidx);
4725     }
4726     PyErr_SetString(PyExc_TypeError,
4727                     "not enough arguments for format string");
4728     return NULL;
4729 }
4730
/* Bit flags collected while scanning a format specifier. */
#define F_LJUST 0x01    /* '-' seen (also set for a negative '*' width) */
#define F_SIGN  0x02    /* '+' seen */
#define F_BLANK 0x04    /* ' ' seen */
#define F_ALT   0x08    /* '#' seen */
#define F_ZERO  0x10    /* '0' seen */
4736
4737 static
4738 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4739 {
4740     register int i;
4741     int len;
4742     va_list va;
4743     char *charbuffer;
4744     va_start(va, format);
4745
4746     /* First, format the string as char array, then expand to Py_UNICODE
4747        array. */
4748     charbuffer = (char *)buffer;
4749     len = vsprintf(charbuffer, format, va);
4750     for (i = len - 1; i >= 0; i--)
4751         buffer[i] = (Py_UNICODE) charbuffer[i];
4752
4753     va_end(va);
4754     return len;
4755 }
4756
4757 static int
4758 formatfloat(Py_UNICODE *buf,
4759             size_t buflen,
4760             int flags,
4761             int prec,
4762             int type,
4763             PyObject *v)
4764 {
4765     /* fmt = '%#.' + `prec` + `type`
4766        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4767     char fmt[20];
4768     double x;
4769
4770     x = PyFloat_AsDouble(v);
4771     if (x == -1.0 && PyErr_Occurred())
4772         return -1;
4773     if (prec < 0)
4774         prec = 6;
4775     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4776         type = 'g';
4777     sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4778     /* worst case length calc to ensure no buffer overrun:
4779          fmt = %#.<prec>g
4780          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4781             for any double rep.)
4782          len = 1 + prec + 1 + 2 + 5 = 9 + prec
4783        If prec=0 the effective precision is 1 (the leading digit is
4784        always given), therefore increase by one to 10+prec. */
4785     if (buflen <= (size_t)10 + (size_t)prec) {
4786         PyErr_SetString(PyExc_OverflowError,
4787             "formatted float is too long (precision too long?)");
4788         return -1;
4789     }
4790     return usprintf(buf, fmt, x);
4791 }
4792
4793 static PyObject*
4794 formatlong(PyObject *val, int flags, int prec, int type)
4795 {
4796         char *buf;
4797         int i, len;
4798         PyObject *str; /* temporary string object. */
4799         PyUnicodeObject *result;
4800
4801         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4802         if (!str)
4803                 return NULL;
4804         result = _PyUnicode_New(len);
4805         for (i = 0; i < len; i++)
4806                 result->str[i] = buf[i];
4807         result->str[len] = 0;
4808         Py_DECREF(str);
4809         return (PyObject*)result;
4810 }
4811
4812 static int
4813 formatint(Py_UNICODE *buf,
4814           size_t buflen,
4815           int flags,
4816           int prec,
4817           int type,
4818           PyObject *v)
4819 {
4820     /* fmt = '%#.' + `prec` + 'l' + `type`
4821        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4822        + 1 + 1 = 24*/
4823     char fmt[64]; /* plenty big enough! */
4824     long x;
4825     int use_native_c_format = 1;
4826
4827     x = PyInt_AsLong(v);
4828     if (x == -1 && PyErr_Occurred())
4829         return -1;
4830     if (prec < 0)
4831         prec = 1;
4832     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4833        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4834     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4835         PyErr_SetString(PyExc_OverflowError,
4836             "formatted integer is too long (precision too long?)");
4837         return -1;
4838     }
4839     /* When converting 0 under %#x or %#X, C leaves off the base marker,
4840      * but we want it (for consistency with other %#x conversions, and
4841      * for consistency with Python's hex() function).
4842      * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
4843      * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4844      * So add it only if the platform doesn't already.
4845      */
4846     if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4847         /* Only way to know what the platform does is to try it. */
4848         sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4849         if (fmt[1] != (char)type) {
4850             /* Supply our own leading 0x/0X -- needed under std C */
4851             use_native_c_format = 0;
4852             sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4853         }
4854     }
4855     if (use_native_c_format)
4856          sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4857     return usprintf(buf, fmt, x);
4858 }
4859
4860 static int
4861 formatchar(Py_UNICODE *buf,
4862            size_t buflen,
4863            PyObject *v)
4864 {
4865     /* presume that the buffer is at least 2 characters long */
4866     if (PyUnicode_Check(v)) {
4867         if (PyUnicode_GET_SIZE(v) != 1)
4868             goto onError;
4869         buf[0] = PyUnicode_AS_UNICODE(v)[0];
4870     }
4871
4872     else if (PyString_Check(v)) {
4873         if (PyString_GET_SIZE(v) != 1)
4874             goto onError;
4875         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4876     }
4877
4878     else {
4879         /* Integer input truncated to a character */
4880         long x;
4881         x = PyInt_AsLong(v);
4882         if (x == -1 && PyErr_Occurred())
4883             goto onError;
4884         buf[0] = (char) x;
4885     }
4886     buf[1] = '\0';
4887     return 1;
4888
4889  onError:
4890     PyErr_SetString(PyExc_TypeError,
4891                     "%c requires int or char");
4892     return -1;
4893 }
4894
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does its own bounds checking to
   ensure no overflow, but a better solution may be to malloc a buffer
   of appropriate size for each format.  For now, the current solution
   is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
4904
4905 PyObject *PyUnicode_Format(PyObject *format,
4906                            PyObject *args)
4907 {
4908     Py_UNICODE *fmt, *res;
4909     int fmtcnt, rescnt, reslen, arglen, argidx;
4910     int args_owned = 0;
4911     PyUnicodeObject *result = NULL;
4912     PyObject *dict = NULL;
4913     PyObject *uformat;
4914
4915     if (format == NULL || args == NULL) {
4916         PyErr_BadInternalCall();
4917         return NULL;
4918     }
4919     uformat = PyUnicode_FromObject(format);
4920     if (uformat == NULL)
4921         return NULL;
4922     fmt = PyUnicode_AS_UNICODE(uformat);
4923     fmtcnt = PyUnicode_GET_SIZE(uformat);
4924
4925     reslen = rescnt = fmtcnt + 100;
4926     result = _PyUnicode_New(reslen);
4927     if (result == NULL)
4928         goto onError;
4929     res = PyUnicode_AS_UNICODE(result);
4930
4931     if (PyTuple_Check(args)) {
4932         arglen = PyTuple_Size(args);
4933         argidx = 0;
4934     }
4935     else {
4936         arglen = -1;
4937         argidx = -2;
4938     }
4939     if (args->ob_type->tp_as_mapping)
4940         dict = args;
4941
4942     while (--fmtcnt >= 0) {
4943         if (*fmt != '%') {
4944             if (--rescnt < 0) {
4945                 rescnt = fmtcnt + 100;
4946                 reslen += rescnt;
4947                 if (_PyUnicode_Resize(&result, reslen) < 0)
4948                     return NULL;
4949                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4950                 --rescnt;
4951             }
4952             *res++ = *fmt++;
4953         }
4954         else {
4955             /* Got a format specifier */
4956             int flags = 0;
4957             int width = -1;
4958             int prec = -1;
4959             Py_UNICODE c = '\0';
4960             Py_UNICODE fill;
4961             PyObject *v = NULL;
4962             PyObject *temp = NULL;
4963             Py_UNICODE *pbuf;
4964             Py_UNICODE sign;
4965             int len;
4966             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4967
4968             fmt++;
4969             if (*fmt == '(') {
4970                 Py_UNICODE *keystart;
4971                 int keylen;
4972                 PyObject *key;
4973                 int pcount = 1;
4974
4975                 if (dict == NULL) {
4976                     PyErr_SetString(PyExc_TypeError,
4977                                     "format requires a mapping");
4978                     goto onError;
4979                 }
4980                 ++fmt;
4981                 --fmtcnt;
4982                 keystart = fmt;
4983                 /* Skip over balanced parentheses */
4984                 while (pcount > 0 && --fmtcnt >= 0) {
4985                     if (*fmt == ')')
4986                         --pcount;
4987                     else if (*fmt == '(')
4988                         ++pcount;
4989                     fmt++;
4990                 }
4991                 keylen = fmt - keystart - 1;
4992                 if (fmtcnt < 0 || pcount > 0) {
4993                     PyErr_SetString(PyExc_ValueError,
4994                                     "incomplete format key");
4995                     goto onError;
4996                 }
4997                 /* keys are converted to strings using UTF-8 and
4998                    then looked up since Python uses strings to hold
4999                    variables names etc. in its namespaces and we
5000                    wouldn't want to break common idioms. */
5001                 key = PyUnicode_EncodeUTF8(keystart,
5002                                            keylen,
5003                                            NULL);
5004                 if (key == NULL)
5005                     goto onError;
5006                 if (args_owned) {
5007                     Py_DECREF(args);
5008                     args_owned = 0;
5009                 }
5010                 args = PyObject_GetItem(dict, key);
5011                 Py_DECREF(key);
5012                 if (args == NULL) {
5013                     goto onError;
5014                 }
5015                 args_owned = 1;
5016                 arglen = -1;
5017                 argidx = -2;
5018             }
5019             while (--fmtcnt >= 0) {
5020                 switch (c = *fmt++) {
5021                 case '-': flags |= F_LJUST; continue;
5022                 case '+': flags |= F_SIGN; continue;
5023                 case ' ': flags |= F_BLANK; continue;
5024                 case '#': flags |= F_ALT; continue;
5025                 case '0': flags |= F_ZERO; continue;
5026                 }
5027                 break;
5028             }
5029             if (c == '*') {
5030                 v = getnextarg(args, arglen, &argidx);
5031                 if (v == NULL)
5032                     goto onError;
5033                 if (!PyInt_Check(v)) {
5034                     PyErr_SetString(PyExc_TypeError,
5035                                     "* wants int");
5036                     goto onError;
5037                 }
5038                 width = PyInt_AsLong(v);
5039                 if (width < 0) {
5040                     flags |= F_LJUST;
5041                     width = -width;
5042                 }
5043                 if (--fmtcnt >= 0)
5044                     c = *fmt++;
5045             }
5046             else if (c >= '0' && c <= '9') {
5047                 width = c - '0';
5048                 while (--fmtcnt >= 0) {
5049                     c = *fmt++;
5050                     if (c < '0' || c > '9')
5051                         break;
5052                     if ((width*10) / 10 != width) {
5053                         PyErr_SetString(PyExc_ValueError,
5054                                         "width too big");
5055                         goto onError;
5056                     }
5057                     width = width*10 + (c - '0');
5058                 }
5059             }
5060             if (c == '.') {
5061                 prec = 0;
5062                 if (--fmtcnt >= 0)
5063                     c = *fmt++;
5064                 if (c == '*') {
5065                     v = getnextarg(args, arglen, &argidx);
5066                     if (v == NULL)
5067                         goto onError;
5068                     if (!PyInt_Check(v)) {
5069                         PyErr_SetString(PyExc_TypeError,
5070                                         "* wants int");
5071                         goto onError;
5072                     }
5073                     prec = PyInt_AsLong(v);
5074                     if (prec < 0)
5075                         prec = 0;
5076                     if (--fmtcnt >= 0)
5077                         c = *fmt++;
5078                 }
5079                 else if (c >= '0' && c <= '9') {
5080                     prec = c - '0';
5081                     while (--fmtcnt >= 0) {
5082                         c = Py_CHARMASK(*fmt++);
5083                         if (c < '0' || c > '9')
5084                             break;
5085                         if ((prec*10) / 10 != prec) {
5086                             PyErr_SetString(PyExc_ValueError,
5087                                             "prec too big");
5088                             goto onError;
5089                         }
5090                         prec = prec*10 + (c - '0');
5091                     }
5092                 }
5093             } /* prec */
5094             if (fmtcnt >= 0) {
5095                 if (c == 'h' || c == 'l' || c == 'L') {
5096                     if (--fmtcnt >= 0)
5097                         c = *fmt++;
5098                 }
5099             }
5100             if (fmtcnt < 0) {
5101                 PyErr_SetString(PyExc_ValueError,
5102                                 "incomplete format");
5103                 goto onError;
5104             }
5105             if (c != '%') {
5106                 v = getnextarg(args, arglen, &argidx);
5107                 if (v == NULL)
5108                     goto onError;
5109             }
5110             sign = 0;
5111             fill = ' ';
5112             switch (c) {
5113
5114             case '%':
5115                 pbuf = formatbuf;
5116                 /* presume that buffer length is at least 1 */
5117                 pbuf[0] = '%';
5118                 len = 1;
5119                 break;
5120
5121             case 's':
5122             case 'r':
5123                 if (PyUnicode_Check(v) && c == 's') {
5124                     temp = v;
5125                     Py_INCREF(temp);
5126                 }
5127                 else {
5128                     PyObject *unicode;
5129                     if (c == 's')
5130                         temp = PyObject_Str(v);
5131                     else
5132                         temp = PyObject_Repr(v);
5133                     if (temp == NULL)
5134                         goto onError;
5135                     if (!PyString_Check(temp)) {
5136                         /* XXX Note: this should never happen, since
5137                                PyObject_Repr() and PyObject_Str() assure
5138                                this */
5139                         Py_DECREF(temp);
5140                         PyErr_SetString(PyExc_TypeError,
5141                                         "%s argument has non-string str()");
5142                         goto onError;
5143                     }
5144                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5145                                                    PyString_GET_SIZE(temp),
5146                                                NULL,
5147                                                    "strict");
5148                     Py_DECREF(temp);
5149                     temp = unicode;
5150                     if (temp == NULL)
5151                         goto onError;
5152                 }
5153                 pbuf = PyUnicode_AS_UNICODE(temp);
5154                 len = PyUnicode_GET_SIZE(temp);
5155                 if (prec >= 0 && len > prec)
5156                     len = prec;
5157                 break;
5158
5159             case 'i':
5160             case 'd':
5161             case 'u':
5162             case 'o':
5163             case 'x':
5164             case 'X':
5165                 if (c == 'i')
5166                     c = 'd';
5167                 if (PyLong_Check(v)) {
5168                     temp = formatlong(v, flags, prec, c);
5169                     if (!temp)
5170                         goto onError;
5171                     pbuf = PyUnicode_AS_UNICODE(temp);
5172                     len = PyUnicode_GET_SIZE(temp);
5173                     /* unbounded ints can always produce
5174                        a sign character! */
5175                     sign = 1;
5176                 }
5177                 else {
5178                     pbuf = formatbuf;
5179                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5180                                     flags, prec, c, v);
5181                     if (len < 0)
5182                         goto onError;
5183                     /* only d conversion is signed */
5184                     sign = c == 'd';
5185                 }
5186                 if (flags & F_ZERO)
5187                     fill = '0';
5188                 break;
5189
5190             case 'e':
5191             case 'E':
5192             case 'f':
5193             case 'g':
5194             case 'G':
5195                 pbuf = formatbuf;
5196                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5197                         flags, prec, c, v);
5198                 if (len < 0)
5199                     goto onError;
5200                 sign = 1;
5201                 if (flags & F_ZERO)
5202                     fill = '0';
5203                 break;
5204
5205             case 'c':
5206                 pbuf = formatbuf;
5207                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5208                 if (len < 0)
5209                     goto onError;
5210                 break;
5211
5212             default:
5213                 PyErr_Format(PyExc_ValueError,
5214                              "unsupported format character '%c' (0x%x) "
5215                              "at index %i",
5216                              (31<=c && c<=126) ? c : '?',
5217                              c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5218                 goto onError;
5219             }
5220             if (sign) {
5221                 if (*pbuf == '-' || *pbuf == '+') {
5222                     sign = *pbuf++;
5223                     len--;
5224                 }
5225                 else if (flags & F_SIGN)
5226                     sign = '+';
5227                 else if (flags & F_BLANK)
5228                     sign = ' ';
5229                 else
5230                     sign = 0;
5231             }
5232             if (width < len)
5233                 width = len;
5234             if (rescnt < width + (sign != 0)) {
5235                 reslen -= rescnt;
5236                 rescnt = width + fmtcnt + 100;
5237                 reslen += rescnt;
5238                 if (_PyUnicode_Resize(&result, reslen) < 0)
5239                     return NULL;
5240                 res = PyUnicode_AS_UNICODE(result)
5241                     + reslen - rescnt;
5242             }
5243             if (sign) {
5244                 if (fill != ' ')
5245                     *res++ = sign;
5246                 rescnt--;
5247                 if (width > len)
5248                     width--;
5249             }
5250             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5251                 assert(pbuf[0] == '0');
5252                 assert(pbuf[1] == c);
5253                 if (fill != ' ') {
5254                     *res++ = *pbuf++;
5255                     *res++ = *pbuf++;
5256                 }
5257                 rescnt -= 2;
5258                 width -= 2;
5259                 if (width < 0)
5260                     width = 0;
5261                 len -= 2;
5262             }
5263             if (width > len && !(flags & F_LJUST)) {
5264                 do {
5265                     --rescnt;
5266                     *res++ = fill;
5267                 } while (--width > len);
5268             }
5269             if (fill == ' ') {
5270                 if (sign)
5271                     *res++ = sign;
5272                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5273                     assert(pbuf[0] == '0');
5274                     assert(pbuf[1] == c);
5275                     *res++ = *pbuf++;
5276                     *res++ = *pbuf++;
5277                 }
5278             }
5279             Py_UNICODE_COPY(res, pbuf, len);
5280             res += len;
5281             rescnt -= len;
5282             while (--width >= len) {
5283                 --rescnt;
5284                 *res++ = ' ';
5285             }
5286             if (dict && (argidx < arglen) && c != '%') {
5287                 PyErr_SetString(PyExc_TypeError,
5288                                 "not all arguments converted");
5289                 goto onError;
5290             }
5291             Py_XDECREF(temp);
5292         } /* '%' */
5293     } /* until end */
5294     if (argidx < arglen && !dict) {
5295         PyErr_SetString(PyExc_TypeError,
5296                         "not all arguments converted");
5297         goto onError;
5298     }
5299
5300     if (args_owned) {
5301         Py_DECREF(args);
5302     }
5303     Py_DECREF(uformat);
5304     if (_PyUnicode_Resize(&result, reslen - rescnt))
5305         goto onError;
5306     return (PyObject *)result;
5307
5308  onError:
5309     Py_XDECREF(result);
5310     Py_DECREF(uformat);
5311     if (args_owned) {
5312         Py_DECREF(args);
5313     }
5314     return NULL;
5315 }
5316
5317 static PyBufferProcs unicode_as_buffer = {
5318     (getreadbufferproc) unicode_buffer_getreadbuf,
5319     (getwritebufferproc) unicode_buffer_getwritebuf,
5320     (getsegcountproc) unicode_buffer_getsegcount,
5321     (getcharbufferproc) unicode_buffer_getcharbuf,
5322 };
5323
5324 PyTypeObject PyUnicode_Type = {
5325     PyObject_HEAD_INIT(&PyType_Type)
5326     0,                                  /* ob_size */
5327     "unicode",                          /* tp_name */
5328     sizeof(PyUnicodeObject),            /* tp_size */
5329     0,                                  /* tp_itemsize */
5330     /* Slots */
5331     (destructor)_PyUnicode_Free,        /* tp_dealloc */
5332     0,                                  /* tp_print */
5333     (getattrfunc)unicode_getattr,       /* tp_getattr */
5334     0,                                  /* tp_setattr */
5335     (cmpfunc) unicode_compare,          /* tp_compare */
5336     (reprfunc) unicode_repr,            /* tp_repr */
5337     0,                                  /* tp_as_number */
5338     &unicode_as_sequence,               /* tp_as_sequence */
5339     0,                                  /* tp_as_mapping */
5340     (hashfunc) unicode_hash,            /* tp_hash*/
5341     0,                                  /* tp_call*/
5342     (reprfunc) unicode_str,             /* tp_str */
5343     (getattrofunc) NULL,                /* tp_getattro */
5344     (setattrofunc) NULL,                /* tp_setattro */
5345     &unicode_as_buffer,                 /* tp_as_buffer */
5346     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
5347 };
5348
5349 /* Initialize the Unicode implementation */
5350
5351 void _PyUnicode_Init(void)
5352 {
5353     int i;
5354
5355     /* Init the implementation */
5356     unicode_freelist = NULL;
5357     unicode_freelist_size = 0;
5358     unicode_empty = _PyUnicode_New(0);
5359     strcpy(unicode_default_encoding, "ascii");
5360     for (i = 0; i < 256; i++)
5361         unicode_latin1[i] = NULL;
5362 }
5363
5364 /* Finalize the Unicode implementation */
5365
5366 void
5367 _PyUnicode_Fini(void)
5368 {
5369     PyUnicodeObject *u;
5370     int i;
5371
5372     Py_XDECREF(unicode_empty);
5373     unicode_empty = NULL;
5374
5375     for (i = 0; i < 256; i++) {
5376         if (unicode_latin1[i]) {
5377             Py_DECREF(unicode_latin1[i]);
5378             unicode_latin1[i] = NULL;
5379         }
5380     }
5381
5382     for (u = unicode_freelist; u != NULL;) {
5383         PyUnicodeObject *v = u;
5384         u = *(PyUnicodeObject **)u;
5385         if (v->str)
5386             PyMem_DEL(v->str);
5387         Py_XDECREF(v->defenc);
5388         PyObject_DEL(v);
5389     }
5390     unicode_freelist = NULL;
5391     unicode_freelist_size = 0;
5392 }