Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WINDOWS
  45 #include <windows.h>
  46 #endif
  47
  48 /* Limit for the Unicode object free list */
  49
  50 #define MAX_UNICODE_FREELIST_SIZE       1024
  51
  52 /* Limit for the Unicode object free list stay alive optimization.
  53
  54    The implementation will keep allocated Unicode memory intact for
  55    all objects on the free list having a size less than this
  56    limit. This reduces malloc() overhead for small Unicode objects.
  57
  58    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  59    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  60    malloc()-overhead) bytes of unused garbage.
  61
  62    Setting the limit to 0 effectively turns the feature off.
  63
  64    Note: This is an experimental feature ! If you get core dumps when
  65    using Unicode objects, turn this feature off.
  66
  67 */
  68
  69 #define KEEPALIVE_SIZE_LIMIT       9
  70
  71 /* Endianness switches; defaults to little endian */
  72
  73 #ifdef WORDS_BIGENDIAN
  74 # define BYTEORDER_IS_BIG_ENDIAN
  75 #else
  76 # define BYTEORDER_IS_LITTLE_ENDIAN
  77 #endif
  78
  79 /* --- Globals ------------------------------------------------------------
  80
  81    The globals are initialized by the _PyUnicode_Init() API and should
  82    not be used before calling that API.
  83
  84 */
  85
  86 /* Free list for Unicode objects */
  87 static PyUnicodeObject *unicode_freelist;
  88 static int unicode_freelist_size;
  89
  90 /* The empty Unicode object is shared to improve performance. */
  91 static PyUnicodeObject *unicode_empty;
  92
  93 /* Single character Unicode strings in the Latin-1 range are being
  94    shared as well. */
  95 static PyUnicodeObject *unicode_latin1[256];
  96
  97 /* Default encoding to use and assume when NULL is passed as encoding
  98    parameter; it is initialized by _PyUnicode_Init().
  99
 100    Always use the PyUnicode_SetDefaultEncoding() and
 101    PyUnicode_GetDefaultEncoding() APIs to access this global.
 102
 103 */
 104 static char unicode_default_encoding[100];
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax(void)
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          unicode->str[0] < 256 &&
 136          unicode_latin1[unicode->str[0]] == unicode)) {
 137         PyErr_SetString(PyExc_SystemError,
 138                         "can't resize shared unicode objects");
 139         return -1;
 140     }
 141
 142     /* We allocate one more byte to make sure the string is
 143        Ux0000 terminated -- XXX is this needed ? */
 144     oldstr = unicode->str;
 145     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 146     if (!unicode->str) {
 147         unicode->str = oldstr;
 148         PyErr_NoMemory();
 149         return -1;
 150     }
 151     unicode->str[length] = 0;
 152     unicode->length = length;
 153
 154  reset:
 155     /* Reset the object caches */
 156     if (unicode->defenc) {
 157         Py_DECREF(unicode->defenc);
 158         unicode->defenc = NULL;
 159     }
 160     unicode->hash = -1;
 161
 162     return 0;
 163 }
 164
 165 /* We allocate one more byte to make sure the string is
 166    Ux0000 terminated -- XXX is this needed ?
 167
 168    XXX This allocator could further be enhanced by assuring that the
 169        free list never reduces its size below 1.
 170
 171 */
 172
 173 static
 174 PyUnicodeObject *_PyUnicode_New(int length)
 175 {
 176     register PyUnicodeObject *unicode;
 177
 178     /* Optimization for empty strings */
 179     if (length == 0 && unicode_empty != NULL) {
 180         Py_INCREF(unicode_empty);
 181         return unicode_empty;
 182     }
 183
 184     /* Unicode freelist & memory allocation */
 185     if (unicode_freelist) {
 186         unicode = unicode_freelist;
 187         unicode_freelist = *(PyUnicodeObject **)unicode;
 188         unicode_freelist_size--;
 189         if (unicode->str) {
 190             /* Keep-Alive optimization: we only upsize the buffer,
 191                never downsize it. */
 192             if ((unicode->length < length) &&
 193                 unicode_resize(unicode, length)) {
 194                 PyMem_DEL(unicode->str);
 195                 goto onError;
 196             }
 197         }
 198         else {
 199             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 200         }
 201         PyObject_INIT(unicode, &PyUnicode_Type);
 202     }
 203     else {
 204         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 205         if (unicode == NULL)
 206             return NULL;
 207         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 208     }
 209
 210     if (!unicode->str) {
 211         PyErr_NoMemory();
 212         goto onError;
 213     }
 214     unicode->str[length] = 0;
 215     unicode->length = length;
 216     unicode->hash = -1;
 217     unicode->defenc = NULL;
 218     return unicode;
 219
 220  onError:
 221     _Py_ForgetReference((PyObject *)unicode);
 222     PyObject_Del(unicode);
 223     return NULL;
 224 }
 225
 226 static
 227 void unicode_dealloc(register PyUnicodeObject *unicode)
 228 {
 229     if (PyUnicode_CheckExact(unicode) &&
 230         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 231         /* Keep-Alive optimization */
 232         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 233             PyMem_DEL(unicode->str);
 234             unicode->str = NULL;
 235             unicode->length = 0;
 236         }
 237         if (unicode->defenc) {
 238             Py_DECREF(unicode->defenc);
 239             unicode->defenc = NULL;
 240         }
 241         /* Add to free list */
 242         *(PyUnicodeObject **)unicode = unicode_freelist;
 243         unicode_freelist = unicode;
 244         unicode_freelist_size++;
 245     }
 246     else {
 247         PyMem_DEL(unicode->str);
 248         Py_XDECREF(unicode->defenc);
 249         unicode->ob_type->tp_free((PyObject *)unicode);
 250     }
 251 }
 252
 253 int PyUnicode_Resize(PyObject **unicode,
 254                      int length)
 255 {
 256     register PyUnicodeObject *v;
 257
 258     /* Argument checks */
 259     if (unicode == NULL) {
 260         PyErr_BadInternalCall();
 261         return -1;
 262     }
 263     v = (PyUnicodeObject *)*unicode;
 264     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
 265         PyErr_BadInternalCall();
 266         return -1;
 267     }
 268
 269     /* Resizing unicode_empty and single character objects is not
 270        possible since these are being shared. We simply return a fresh
 271        copy with the same Unicode content. */
 272     if (v->length != length &&
 273         (v == unicode_empty || v->length == 1)) {
 274         PyUnicodeObject *w = _PyUnicode_New(length);
 275         if (w == NULL)
 276             return -1;
 277         Py_UNICODE_COPY(w->str, v->str,
 278                         length < v->length ? length : v->length);
 279         Py_DECREF(*unicode);
 280         *unicode = (PyObject *)w;
 281         return 0;
 282     }
 283
 284     /* Note that we don't have to modify *unicode for unshared Unicode
 285        objects, since we can modify them in-place. */
 286     return unicode_resize(v, length);
 287 }
 288
 289 /* Internal API for use in unicodeobject.c only ! */
 290 #define _PyUnicode_Resize(unicodevar, length) \
 291         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 292
 293 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 294                                 int size)
 295 {
 296     PyUnicodeObject *unicode;
 297
 298     /* If the Unicode data is known at construction time, we can apply
 299        some optimizations which share commonly used objects. */
 300     if (u != NULL) {
 301
 302         /* Optimization for empty strings */
 303         if (size == 0 && unicode_empty != NULL) {
 304             Py_INCREF(unicode_empty);
 305             return (PyObject *)unicode_empty;
 306         }
 307
 308         /* Single character Unicode objects in the Latin-1 range are
 309            shared when using this constructor */
 310         if (size == 1 && *u < 256) {
 311             unicode = unicode_latin1[*u];
 312             if (!unicode) {
 313                 unicode = _PyUnicode_New(1);
 314                 if (!unicode)
 315                     return NULL;
 316                 unicode->str[0] = *u;
 317                 unicode_latin1[*u] = unicode;
 318             }
 319             Py_INCREF(unicode);
 320             return (PyObject *)unicode;
 321         }
 322     }
 323
 324     unicode = _PyUnicode_New(size);
 325     if (!unicode)
 326         return NULL;
 327
 328     /* Copy the Unicode data into the new object */
 329     if (u != NULL)
 330         Py_UNICODE_COPY(unicode->str, u, size);
 331
 332     return (PyObject *)unicode;
 333 }
 334
 335 #ifdef HAVE_WCHAR_H
 336
 337 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 338                                  int size)
 339 {
 340     PyUnicodeObject *unicode;
 341
 342     if (w == NULL) {
 343         PyErr_BadInternalCall();
 344         return NULL;
 345     }
 346
 347     unicode = _PyUnicode_New(size);
 348     if (!unicode)
 349         return NULL;
 350
 351     /* Copy the wchar_t data into the new object */
 352 #ifdef HAVE_USABLE_WCHAR_T
 353     memcpy(unicode->str, w, size * sizeof(wchar_t));
 354 #else
 355     {
 356         register Py_UNICODE *u;
 357         register int i;
 358         u = PyUnicode_AS_UNICODE(unicode);
 359         for (i = size; i >= 0; i--)
 360             *u++ = *w++;
 361     }
 362 #endif
 363
 364     return (PyObject *)unicode;
 365 }
 366
 367 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 368                          register wchar_t *w,
 369                          int size)
 370 {
 371     if (unicode == NULL) {
 372         PyErr_BadInternalCall();
 373         return -1;
 374     }
 375     if (size > PyUnicode_GET_SIZE(unicode))
 376         size = PyUnicode_GET_SIZE(unicode);
 377 #ifdef HAVE_USABLE_WCHAR_T
 378     memcpy(w, unicode->str, size * sizeof(wchar_t));
 379 #else
 380     {
 381         register Py_UNICODE *u;
 382         register int i;
 383         u = PyUnicode_AS_UNICODE(unicode);
 384         for (i = size; i >= 0; i--)
 385             *w++ = *u++;
 386     }
 387 #endif
 388
 389     return size;
 390 }
 391
 392 #endif
 393
 394 PyObject *PyUnicode_FromOrdinal(int ordinal)
 395 {
 396     Py_UNICODE s[2];
 397
 398 #ifdef Py_UNICODE_WIDE
 399     if (ordinal < 0 || ordinal > 0x10ffff) {
 400         PyErr_SetString(PyExc_ValueError,
 401                         "unichr() arg not in range(0x110000) "
 402                         "(wide Python build)");
 403         return NULL;
 404     }
 405 #else
 406     if (ordinal < 0 || ordinal > 0xffff) {
 407         PyErr_SetString(PyExc_ValueError,
 408                         "unichr() arg not in range(0x10000) "
 409                         "(narrow Python build)");
 410         return NULL;
 411     }
 412 #endif
 413
 414     if (ordinal <= 0xffff) {
 415         /* UCS-2 character */
 416         s[0] = (Py_UNICODE) ordinal;
 417         return PyUnicode_FromUnicode(s, 1);
 418     }
 419     else {
 420 #ifndef Py_UNICODE_WIDE
 421         /* UCS-4 character.  store as two surrogate characters */
 422         ordinal -= 0x10000L;
 423         s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
 424         s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
 425         return PyUnicode_FromUnicode(s, 2);
 426 #else
 427         s[0] = (Py_UNICODE)ordinal;
 428         return PyUnicode_FromUnicode(s, 1);
 429 #endif
 430     }
 431 }
 432
 433 PyObject *PyUnicode_FromObject(register PyObject *obj)
 434 {
 435     /* XXX Perhaps we should make this API an alias of
 436            PyObject_Unicode() instead ?! */
 437     if (PyUnicode_CheckExact(obj)) {
 438         Py_INCREF(obj);
 439         return obj;
 440     }
 441     if (PyUnicode_Check(obj)) {
 442         /* For a Unicode subtype that's not a Unicode object,
 443            return a true Unicode object with the same data. */
 444         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 445                                      PyUnicode_GET_SIZE(obj));
 446     }
 447     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 448 }
 449
 450 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 451                                       const char *encoding,
 452                                       const char *errors)
 453 {
 454     const char *s = NULL;
 455     int len;
 456     PyObject *v;
 457
 458     if (obj == NULL) {
 459         PyErr_BadInternalCall();
 460         return NULL;
 461     }
 462
 463 #if 0
 464     /* For b/w compatibility we also accept Unicode objects provided
 465        that no encodings is given and then redirect to
 466        PyObject_Unicode() which then applies the additional logic for
 467        Unicode subclasses.
 468
 469        NOTE: This API should really only be used for object which
 470              represent *encoded* Unicode !
 471
 472     */
 473         if (PyUnicode_Check(obj)) {
 474             if (encoding) {
 475                 PyErr_SetString(PyExc_TypeError,
 476                                 "decoding Unicode is not supported");
 477             return NULL;
 478             }
 479         return PyObject_Unicode(obj);
 480             }
 481 #else
 482     if (PyUnicode_Check(obj)) {
 483         PyErr_SetString(PyExc_TypeError,
 484                         "decoding Unicode is not supported");
 485         return NULL;
 486         }
 487 #endif
 488
 489     /* Coerce object */
 490     if (PyString_Check(obj)) {
 491             s = PyString_AS_STRING(obj);
 492             len = PyString_GET_SIZE(obj);
 493             }
 494     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 495         /* Overwrite the error message with something more useful in
 496            case of a TypeError. */
 497         if (PyErr_ExceptionMatches(PyExc_TypeError))
 498         PyErr_Format(PyExc_TypeError,
 499                          "coercing to Unicode: need string or buffer, "
 500                          "%.80s found",
 501                      obj->ob_type->tp_name);
 502         goto onError;
 503     }
 504
 505     /* Convert to Unicode */
 506     if (len == 0) {
 507         Py_INCREF(unicode_empty);
 508         v = (PyObject *)unicode_empty;
 509     }
 510     else
 511         v = PyUnicode_Decode(s, len, encoding, errors);
 512
 513     return v;
 514
 515  onError:
 516     return NULL;
 517 }
 518
 519 PyObject *PyUnicode_Decode(const char *s,
 520                            int size,
 521                            const char *encoding,
 522                            const char *errors)
 523 {
 524     PyObject *buffer = NULL, *unicode;
 525
 526     if (encoding == NULL)
 527         encoding = PyUnicode_GetDefaultEncoding();
 528
 529     /* Shortcuts for common default encodings */
 530     if (strcmp(encoding, "utf-8") == 0)
 531         return PyUnicode_DecodeUTF8(s, size, errors);
 532     else if (strcmp(encoding, "latin-1") == 0)
 533         return PyUnicode_DecodeLatin1(s, size, errors);
 534     else if (strcmp(encoding, "ascii") == 0)
 535         return PyUnicode_DecodeASCII(s, size, errors);
 536
 537     /* Decode via the codec registry */
 538     buffer = PyBuffer_FromMemory((void *)s, size);
 539     if (buffer == NULL)
 540         goto onError;
 541     unicode = PyCodec_Decode(buffer, encoding, errors);
 542     if (unicode == NULL)
 543         goto onError;
 544     if (!PyUnicode_Check(unicode)) {
 545         PyErr_Format(PyExc_TypeError,
 546                      "decoder did not return an unicode object (type=%.400s)",
 547                      unicode->ob_type->tp_name);
 548         Py_DECREF(unicode);
 549         goto onError;
 550     }
 551     Py_DECREF(buffer);
 552     return unicode;
 553
 554  onError:
 555     Py_XDECREF(buffer);
 556     return NULL;
 557 }
 558
 559 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 560                            int size,
 561                            const char *encoding,
 562                            const char *errors)
 563 {
 564     PyObject *v, *unicode;
 565
 566     unicode = PyUnicode_FromUnicode(s, size);
 567     if (unicode == NULL)
 568         return NULL;
 569     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 570     Py_DECREF(unicode);
 571     return v;
 572 }
 573
 574 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 575                                     const char *encoding,
 576                                     const char *errors)
 577 {
 578     PyObject *v;
 579
 580     if (!PyUnicode_Check(unicode)) {
 581         PyErr_BadArgument();
 582         goto onError;
 583     }
 584
 585     if (encoding == NULL)
 586         encoding = PyUnicode_GetDefaultEncoding();
 587
 588     /* Shortcuts for common default encodings */
 589     if (errors == NULL) {
 590         if (strcmp(encoding, "utf-8") == 0)
 591             return PyUnicode_AsUTF8String(unicode);
 592         else if (strcmp(encoding, "latin-1") == 0)
 593             return PyUnicode_AsLatin1String(unicode);
 594         else if (strcmp(encoding, "ascii") == 0)
 595             return PyUnicode_AsASCIIString(unicode);
 596     }
 597
 598     /* Encode via the codec registry */
 599     v = PyCodec_Encode(unicode, encoding, errors);
 600     if (v == NULL)
 601         goto onError;
 602     /* XXX Should we really enforce this ? */
 603     if (!PyString_Check(v)) {
 604         PyErr_Format(PyExc_TypeError,
 605                      "encoder did not return a string object (type=%.400s)",
 606                      v->ob_type->tp_name);
 607         Py_DECREF(v);
 608         goto onError;
 609     }
 610     return v;
 611
 612  onError:
 613     return NULL;
 614 }
 615
 616 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 617                                             const char *errors)
 618 {
 619     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 620
 621     if (v)
 622         return v;
 623     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 624     if (v && errors == NULL)
 625         ((PyUnicodeObject *)unicode)->defenc = v;
 626     return v;
 627 }
 628
 629 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 630 {
 631     if (!PyUnicode_Check(unicode)) {
 632         PyErr_BadArgument();
 633         goto onError;
 634     }
 635     return PyUnicode_AS_UNICODE(unicode);
 636
 637  onError:
 638     return NULL;
 639 }
 640
 641 int PyUnicode_GetSize(PyObject *unicode)
 642 {
 643     if (!PyUnicode_Check(unicode)) {
 644         PyErr_BadArgument();
 645         goto onError;
 646     }
 647     return PyUnicode_GET_SIZE(unicode);
 648
 649  onError:
 650     return -1;
 651 }
 652
 653 const char *PyUnicode_GetDefaultEncoding(void)
 654 {
 655     return unicode_default_encoding;
 656 }
 657
 658 int PyUnicode_SetDefaultEncoding(const char *encoding)
 659 {
 660     PyObject *v;
 661
 662     /* Make sure the encoding is valid. As side effect, this also
 663        loads the encoding into the codec registry cache. */
 664     v = _PyCodec_Lookup(encoding);
 665     if (v == NULL)
 666         goto onError;
 667     Py_DECREF(v);
 668     strncpy(unicode_default_encoding,
 669             encoding,
 670             sizeof(unicode_default_encoding));
 671     return 0;
 672
 673  onError:
 674     return -1;
 675 }
 676
 677 /* error handling callback helper:
 678    build arguments, call the callback and check the arguments,
 679    if no exception occured, copy the replacement to the output
 680    and adjust various state variables.
 681    return 0 on success, -1 on error
 682 */
 683
 684 static
 685 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
 686                  const char *encoding, const char *reason,
 687                  const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
 688                  PyObject **output, int *outpos, Py_UNICODE **outptr)
 689 {
 690     static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
 691
 692     PyObject *restuple = NULL;
 693     PyObject *repunicode = NULL;
 694     int outsize = PyUnicode_GET_SIZE(*output);
 695     int requiredsize;
 696     int newpos;
 697     Py_UNICODE *repptr;
 698     int repsize;
 699     int res = -1;
 700
 701     if (*errorHandler == NULL) {
 702         *errorHandler = PyCodec_LookupError(errors);
 703         if (*errorHandler == NULL)
 704            goto onError;
 705     }
 706
 707     if (*exceptionObject == NULL) {
 708         *exceptionObject = PyUnicodeDecodeError_Create(
 709             encoding, input, insize, *startinpos, *endinpos, reason);
 710         if (*exceptionObject == NULL)
 711            goto onError;
 712     }
 713     else {
 714         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
 715             goto onError;
 716         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
 717             goto onError;
 718         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
 719             goto onError;
 720     }
 721
 722     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
 723     if (restuple == NULL)
 724         goto onError;
 725     if (!PyTuple_Check(restuple)) {
 726         PyErr_Format(PyExc_TypeError, &argparse[4]);
 727         goto onError;
 728     }
 729     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
 730         goto onError;
 731     if (newpos<0)
 732         newpos = insize+newpos;
 733     if (newpos<0 || newpos>insize) {
 734         PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
 735         goto onError;
 736     }
 737
 738     /* need more space? (at least enough for what we
 739        have+the replacement+the rest of the string (starting
 740        at the new input position), so we won't have to check space
 741        when there are no errors in the rest of the string) */
 742     repptr = PyUnicode_AS_UNICODE(repunicode);
 743     repsize = PyUnicode_GET_SIZE(repunicode);
 744     requiredsize = *outpos + repsize + insize-newpos;
 745     if (requiredsize > outsize) {
 746         if (requiredsize<2*outsize)
 747             requiredsize = 2*outsize;
 748         if (PyUnicode_Resize(output, requiredsize))
 749             goto onError;
 750         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
 751     }
 752     *endinpos = newpos;
 753     *inptr = input + newpos;
 754     Py_UNICODE_COPY(*outptr, repptr, repsize);
 755     *outptr += repsize;
 756     *outpos += repsize;
 757     /* we made it! */
 758     res = 0;
 759
 760     onError:
 761     Py_XDECREF(restuple);
 762     return res;
 763 }
 764
 765 /* --- UTF-7 Codec -------------------------------------------------------- */
 766
 767 /* see RFC2152 for details */
 768
 /* Classification of the 128 ASCII characters for the UTF-7 codec.
    Each entry tells whether the character is special, i.e. cannot be
    encoded directly:
        0 - not special
        1 - special
        2 - whitespace (optional)
        3 - RFC2152 Set O (optional) */
 static char utf7_special[128] = {
     /* 0x00 - 0x0f */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
     /* 0x10 - 0x1f */
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     /* 0x20 - 0x2f */
     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
     /* 0x30 - 0x3f */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
     /* 0x40 - 0x4f */
     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     /* 0x50 - 0x5f */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
     /* 0x60 - 0x6f */
     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     /* 0x70 - 0x7f */
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 };
 787
 /* True if character c may not appear directly in UTF-7 output:
    everything above 127, every class 1 entry of utf7_special, and,
    depending on the two flags, whitespace (class 2) and RFC2152 Set O
    (class 3) characters. */
 #define SPECIAL(c, encodeO, encodeWS) \
         (((c)>127 || utf7_special[(c)] == 1) || \
          (encodeWS && (utf7_special[(c)] == 2)) || \
          (encodeO && (utf7_special[(c)] == 3)))

 /* Base64 character for the low six bits of n */
 #define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

 /* True if c belongs to the base64 alphabet */
 #define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')

 /* Six bit value of the base64 character c; c must satisfy B64CHAR() */
 #define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

 /* Emit base64 characters to `out` for as long as at least six of the
    `bits` pending bits of `ch` are available; `out` and `bits` are
    updated in place. */
 #define ENCODE(out, ch, bits) \
     while (bits >= 6) { \
         *out++ = B64(ch >> (bits-6)); \
         bits -= 6; \
     }

 /* Emit 16-bit characters to `out` for as long as at least sixteen of
    the `bits` pending bits of `ch` are available.  The expansion
    assigns to a variable named `errmsg` and jumps to a label named
    `utf7Error`; both must exist at the point of use. */
 #define DECODE(out, ch, bits, surrogate) \
     while (bits >= 16) { \
         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
         bits -= 16; \
         if (surrogate) { \
             /* We have already generated an error for the high surrogate \
                so let's not bother seeing if the low surrogate is correct or not */ \
             surrogate = 0; \
         } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
             /* This is a surrogate pair. Unfortunately we can't represent \
                it in a 16-bit character */ \
             surrogate = 1; \
             errmsg = "code pairs are not supported"; \
             goto utf7Error; \
         } else { \
             *out++ = outCh; \
         } \
     }
 823 PyObject *PyUnicode_DecodeUTF7(const char *s,
 824                                int size,
 825                                const char *errors)
 826 {
 827     const char *starts = s;
 828     int startinpos;
 829     int endinpos;
 830     int outpos;
 831     const char *e;
 832     PyUnicodeObject *unicode;
 833     Py_UNICODE *p;
 834     const char *errmsg = "";
 835     int inShift = 0;
 836     unsigned int bitsleft = 0;
 837     unsigned long charsleft = 0;
 838     int surrogate = 0;
 839     PyObject *errorHandler = NULL;
 840     PyObject *exc = NULL;
 841
 842     unicode = _PyUnicode_New(size);
 843     if (!unicode)
 844         return NULL;
 845     if (size == 0)
 846         return (PyObject *)unicode;
 847
 848     p = unicode->str;
 849     e = s + size;
 850
 851     while (s < e) {
 852         Py_UNICODE ch;
 853         restart:
 854         ch = *s;
 855
 856         if (inShift) {
 857             if ((ch == '-') || !B64CHAR(ch)) {
 858                 inShift = 0;
 859                 s++;
 860
 861                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 862                 if (bitsleft >= 6) {
 863                     /* The shift sequence has a partial character in it. If
 864                        bitsleft < 6 then we could just classify it as padding
 865                        but that is not the case here */
 866
 867                     errmsg = "partial character in shift sequence";
 868                     goto utf7Error;
 869                 }
 870                 /* According to RFC2152 the remaining bits should be zero. We
 871                    choose to signal an error/insert a replacement character
 872                    here so indicate the potential of a misencoded character. */
 873
 874                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 875                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 876                     errmsg = "non-zero padding bits in shift sequence";
 877                     goto utf7Error;
 878                 }
 879
 880                 if (ch == '-') {
 881                     if ((s < e) && (*(s) == '-')) {
 882                         *p++ = '-';
 883                         inShift = 1;
 884                     }
 885                 } else if (SPECIAL(ch,0,0)) {
 886                     errmsg = "unexpected special character";
 887                         goto utf7Error;
 888                 } else  {
 889                     *p++ = ch;
 890                 }
 891             } else {
 892                 charsleft = (charsleft << 6) | UB64(ch);
 893                 bitsleft += 6;
 894                 s++;
 895                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 896             }
 897         }
 898         else if ( ch == '+' ) {
 899             startinpos = s-starts;
 900             s++;
 901             if (s < e && *s == '-') {
 902                 s++;
 903                 *p++ = '+';
 904             } else
 905             {
 906                 inShift = 1;
 907                 bitsleft = 0;
 908             }
 909         }
 910         else if (SPECIAL(ch,0,0)) {
 911             errmsg = "unexpected special character";
 912             s++;
 913                 goto utf7Error;
 914         }
 915         else {
 916             *p++ = ch;
 917             s++;
 918         }
 919         continue;
 920     utf7Error:
 921         outpos = p-PyUnicode_AS_UNICODE(unicode);
 922         endinpos = s-starts;
 923         if (unicode_decode_call_errorhandler(
 924              errors, &errorHandler,
 925              "utf7", errmsg,
 926              starts, size, &startinpos, &endinpos, &exc, &s,
 927              (PyObject **)&unicode, &outpos, &p))
 928         goto onError;
 929     }
 930
 931     if (inShift) {
 932         outpos = p-PyUnicode_AS_UNICODE(unicode);
 933         endinpos = size;
 934         if (unicode_decode_call_errorhandler(
 935              errors, &errorHandler,
 936              "utf7", "unterminated shift sequence",
 937              starts, size, &startinpos, &endinpos, &exc, &s,
 938              (PyObject **)&unicode, &outpos, &p))
 939             goto onError;
 940         if (s < e)
 941            goto restart;
 942     }
 943
 944     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
 945         goto onError;
 946
 947     Py_XDECREF(errorHandler);
 948     Py_XDECREF(exc);
 949     return (PyObject *)unicode;
 950
 951 onError:
 952     Py_XDECREF(errorHandler);
 953     Py_XDECREF(exc);
 954     Py_DECREF(unicode);
 955     return NULL;
 956 }
 957
 958
 959 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
 960                    int size,
 961                    int encodeSetO,
 962                    int encodeWhiteSpace,
 963                    const char *errors)
 964 {
 965     PyObject *v;
 966     /* It might be possible to tighten this worst case */
 967     unsigned int cbAllocated = 5 * size;
 968     int inShift = 0;
 969     int i = 0;
 970     unsigned int bitsleft = 0;
 971     unsigned long charsleft = 0;
 972     char * out;
 973     char * start;
 974
 975     if (size == 0)
 976                 return PyString_FromStringAndSize(NULL, 0);
 977
 978     v = PyString_FromStringAndSize(NULL, cbAllocated);
 979     if (v == NULL)
 980         return NULL;
 981
 982     start = out = PyString_AS_STRING(v);
 983     for (;i < size; ++i) {
 984         Py_UNICODE ch = s[i];
 985
 986         if (!inShift) {
 987                         if (ch == '+') {
 988                                 *out++ = '+';
 989                 *out++ = '-';
 990             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 991                 charsleft = ch;
 992                 bitsleft = 16;
 993                 *out++ = '+';
 994                                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 995                 inShift = bitsleft > 0;
 996                         } else {
 997                                 *out++ = (char) ch;
 998                         }
 999                 } else {
1000             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1001                 *out++ = B64(charsleft << (6-bitsleft));
1002                 charsleft = 0;
1003                 bitsleft = 0;
1004                 /* Characters not in the BASE64 set implicitly unshift the sequence
1005                    so no '-' is required, except if the character is itself a '-' */
1006                 if (B64CHAR(ch) || ch == '-') {
1007                     *out++ = '-';
1008                 }
1009                 inShift = 0;
1010                 *out++ = (char) ch;
1011             } else {
1012                 bitsleft += 16;
1013                 charsleft = (charsleft << 16) | ch;
1014                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015
1016                 /* If the next character is special then we dont' need to terminate
1017                    the shift sequence. If the next character is not a BASE64 character
1018                    or '-' then the shift sequence will be terminated implicitly and we
1019                    don't have to insert a '-'. */
1020
1021                 if (bitsleft == 0) {
1022                     if (i + 1 < size) {
1023                         Py_UNICODE ch2 = s[i+1];
1024
1025                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1026
1027                         } else if (B64CHAR(ch2) || ch2 == '-') {
1028                             *out++ = '-';
1029                             inShift = 0;
1030                         } else {
1031                             inShift = 0;
1032                         }
1033
1034                     }
1035                     else {
1036                         *out++ = '-';
1037                         inShift = 0;
1038                     }
1039                 }
1040             }
1041         }
1042         }
1043     if (bitsleft) {
1044         *out++= B64(charsleft << (6-bitsleft) );
1045         *out++ = '-';
1046     }
1047
1048     _PyString_Resize(&v, out - start);
1049     return v;
1050 }
1051
1052 #undef SPECIAL
1053 #undef B64
1054 #undef B64CHAR
1055 #undef UB64
1056 #undef ENCODE
1057 #undef DECODE
1058
1059 /* --- UTF-8 Codec -------------------------------------------------------- */
1060
/* Map a UTF-8 lead (prefix) byte to the total length in bytes of the
   sequence it introduces.  Zero means the byte is an illegal prefix.
   See RFC 2279 for details.  The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as the first byte of a sequence */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1082
1083 PyObject *PyUnicode_DecodeUTF8(const char *s,
1084                                int size,
1085                                const char *errors)
1086 {
1087     const char *starts = s;
1088     int n;
1089     int startinpos;
1090     int endinpos;
1091     int outpos;
1092     const char *e;
1093     PyUnicodeObject *unicode;
1094     Py_UNICODE *p;
1095     const char *errmsg = "";
1096     PyObject *errorHandler = NULL;
1097     PyObject *exc = NULL;
1098
1099     /* Note: size will always be longer than the resulting Unicode
1100        character count */
1101     unicode = _PyUnicode_New(size);
1102     if (!unicode)
1103         return NULL;
1104     if (size == 0)
1105         return (PyObject *)unicode;
1106
1107     /* Unpack UTF-8 encoded data */
1108     p = unicode->str;
1109     e = s + size;
1110
1111     while (s < e) {
1112         Py_UCS4 ch = (unsigned char)*s;
1113
1114         if (ch < 0x80) {
1115             *p++ = (Py_UNICODE)ch;
1116             s++;
1117             continue;
1118         }
1119
1120         n = utf8_code_length[ch];
1121
1122         if (s + n > e) {
1123             errmsg = "unexpected end of data";
1124             startinpos = s-starts;
1125             endinpos = size;
1126             goto utf8Error;
1127         }
1128
1129         switch (n) {
1130
1131         case 0:
1132             errmsg = "unexpected code byte";
1133             startinpos = s-starts;
1134             endinpos = startinpos+1;
1135             goto utf8Error;
1136
1137         case 1:
1138             errmsg = "internal error";
1139             startinpos = s-starts;
1140             endinpos = startinpos+1;
1141             goto utf8Error;
1142
1143         case 2:
1144             if ((s[1] & 0xc0) != 0x80) {
1145                 errmsg = "invalid data";
1146                 startinpos = s-starts;
1147                 endinpos = startinpos+2;
1148                 goto utf8Error;
1149             }
1150             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1151             if (ch < 0x80) {
1152                 startinpos = s-starts;
1153                 endinpos = startinpos+2;
1154                 errmsg = "illegal encoding";
1155                 goto utf8Error;
1156             }
1157             else
1158                 *p++ = (Py_UNICODE)ch;
1159             break;
1160
1161         case 3:
1162             if ((s[1] & 0xc0) != 0x80 ||
1163                 (s[2] & 0xc0) != 0x80) {
1164                 errmsg = "invalid data";
1165                 startinpos = s-starts;
1166                 endinpos = startinpos+3;
1167                 goto utf8Error;
1168             }
1169             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1170             if (ch < 0x0800) {
1171                 /* Note: UTF-8 encodings of surrogates are considered
1172                    legal UTF-8 sequences;
1173
1174                    XXX For wide builds (UCS-4) we should probably try
1175                        to recombine the surrogates into a single code
1176                        unit.
1177                 */
1178                 errmsg = "illegal encoding";
1179                 startinpos = s-starts;
1180                 endinpos = startinpos+3;
1181                 goto utf8Error;
1182             }
1183             else
1184                 *p++ = (Py_UNICODE)ch;
1185             break;
1186
1187         case 4:
1188             if ((s[1] & 0xc0) != 0x80 ||
1189                 (s[2] & 0xc0) != 0x80 ||
1190                 (s[3] & 0xc0) != 0x80) {
1191                 errmsg = "invalid data";
1192                 startinpos = s-starts;
1193                 endinpos = startinpos+4;
1194                 goto utf8Error;
1195             }
1196             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1197                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1198             /* validate and convert to UTF-16 */
1199             if ((ch < 0x10000)        /* minimum value allowed for 4
1200                                          byte encoding */
1201                 || (ch > 0x10ffff))   /* maximum value allowed for
1202                                          UTF-16 */
1203             {
1204                 errmsg = "illegal encoding";
1205                 startinpos = s-starts;
1206                 endinpos = startinpos+4;
1207                 goto utf8Error;
1208             }
1209 #ifdef Py_UNICODE_WIDE
1210             *p++ = (Py_UNICODE)ch;
1211 #else
1212             /*  compute and append the two surrogates: */
1213
1214             /*  translate from 10000..10FFFF to 0..FFFF */
1215             ch -= 0x10000;
1216
1217             /*  high surrogate = top 10 bits added to D800 */
1218             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1219
1220             /*  low surrogate = bottom 10 bits added to DC00 */
1221             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1222 #endif
1223             break;
1224
1225         default:
1226             /* Other sizes are only needed for UCS-4 */
1227             errmsg = "unsupported Unicode code range";
1228             startinpos = s-starts;
1229             endinpos = startinpos+n;
1230             goto utf8Error;
1231         }
1232         s += n;
1233         continue;
1234
1235     utf8Error:
1236     outpos = p-PyUnicode_AS_UNICODE(unicode);
1237     if (unicode_decode_call_errorhandler(
1238              errors, &errorHandler,
1239              "utf8", errmsg,
1240              starts, size, &startinpos, &endinpos, &exc, &s,
1241              (PyObject **)&unicode, &outpos, &p))
1242         goto onError;
1243     }
1244
1245     /* Adjust length */
1246     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1247         goto onError;
1248
1249     Py_XDECREF(errorHandler);
1250     Py_XDECREF(exc);
1251     return (PyObject *)unicode;
1252
1253 onError:
1254     Py_XDECREF(errorHandler);
1255     Py_XDECREF(exc);
1256     Py_DECREF(unicode);
1257     return NULL;
1258 }
1259
1260 /* Allocation strategy:  if the string is short, convert into a stack buffer
1261    and allocate exactly as much space needed at the end.  Else allocate the
1262    maximum possible needed (4 result bytes per Unicode character), and return
1263    the excess memory at the end.
1264 */
1265 PyObject *
1266 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1267                      int size,
1268                      const char *errors)
1269 {
1270 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
1271
1272     int i;              /* index into s of next input byte */
1273     PyObject *v;        /* result string object */
1274     char *p;            /* next free byte in output buffer */
1275     int nallocated;     /* number of result bytes allocated */
1276     int nneeded;        /* number of result bytes needed */
1277     char stackbuf[MAX_SHORT_UNICHARS * 4];
1278
1279     assert(s != NULL);
1280     assert(size >= 0);
1281
1282     if (size <= MAX_SHORT_UNICHARS) {
1283         /* Write into the stack buffer; nallocated can't overflow.
1284          * At the end, we'll allocate exactly as much heap space as it
1285          * turns out we need.
1286          */
1287         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1288         v = NULL;   /* will allocate after we're done */
1289         p = stackbuf;
1290     }
1291     else {
1292         /* Overallocate on the heap, and give the excess back at the end. */
1293         nallocated = size * 4;
1294         if (nallocated / 4 != size)  /* overflow! */
1295             return PyErr_NoMemory();
1296         v = PyString_FromStringAndSize(NULL, nallocated);
1297         if (v == NULL)
1298             return NULL;
1299         p = PyString_AS_STRING(v);
1300     }
1301
1302     for (i = 0; i < size;) {
1303         Py_UCS4 ch = s[i++];
1304
1305         if (ch < 0x80)
1306             /* Encode ASCII */
1307             *p++ = (char) ch;
1308
1309         else if (ch < 0x0800) {
1310             /* Encode Latin-1 */
1311             *p++ = (char)(0xc0 | (ch >> 6));
1312             *p++ = (char)(0x80 | (ch & 0x3f));
1313         }
1314         else {
1315             /* Encode UCS2 Unicode ordinals */
1316             if (ch < 0x10000) {
1317                 /* Special case: check for high surrogate */
1318                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1319                     Py_UCS4 ch2 = s[i];
1320                     /* Check for low surrogate and combine the two to
1321                        form a UCS4 value */
1322                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1323                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1324                         i++;
1325                         goto encodeUCS4;
1326                     }
1327                     /* Fall through: handles isolated high surrogates */
1328                 }
1329                 *p++ = (char)(0xe0 | (ch >> 12));
1330                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1331                 *p++ = (char)(0x80 | (ch & 0x3f));
1332                 continue;
1333             }
1334 encodeUCS4:
1335             /* Encode UCS4 Unicode ordinals */
1336             *p++ = (char)(0xf0 | (ch >> 18));
1337             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1338             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1339             *p++ = (char)(0x80 | (ch & 0x3f));
1340         }
1341     }
1342
1343     if (v == NULL) {
1344         /* This was stack allocated. */
1345         nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1346         assert(nneeded <= nallocated);
1347         v = PyString_FromStringAndSize(stackbuf, nneeded);
1348     }
1349     else {
1350         /* Cut back to size actually needed. */
1351         nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1352         assert(nneeded <= nallocated);
1353         _PyString_Resize(&v, nneeded);
1354     }
1355     return v;
1356
1357 #undef MAX_SHORT_UNICHARS
1358 }
1359
1360 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1361 {
1362     if (!PyUnicode_Check(unicode)) {
1363         PyErr_BadArgument();
1364         return NULL;
1365     }
1366     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1367                                 PyUnicode_GET_SIZE(unicode),
1368                                 NULL);
1369 }
1370
1371 /* --- UTF-16 Codec ------------------------------------------------------- */
1372
1373 PyObject *
1374 PyUnicode_DecodeUTF16(const char *s,
1375                       int size,
1376                       const char *errors,
1377                       int *byteorder)
1378 {
1379     const char *starts = s;
1380     int startinpos;
1381     int endinpos;
1382     int outpos;
1383     PyUnicodeObject *unicode;
1384     Py_UNICODE *p;
1385     const unsigned char *q, *e;
1386     int bo = 0;       /* assume native ordering by default */
1387     const char *errmsg = "";
1388     /* Offsets from q for retrieving byte pairs in the right order. */
1389 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1390     int ihi = 1, ilo = 0;
1391 #else
1392     int ihi = 0, ilo = 1;
1393 #endif
1394     PyObject *errorHandler = NULL;
1395     PyObject *exc = NULL;
1396
1397     /* Note: size will always be longer than the resulting Unicode
1398        character count */
1399     unicode = _PyUnicode_New(size);
1400     if (!unicode)
1401         return NULL;
1402     if (size == 0)
1403         return (PyObject *)unicode;
1404
1405     /* Unpack UTF-16 encoded data */
1406     p = unicode->str;
1407     q = (unsigned char *)s;
1408     e = q + size;
1409
1410     if (byteorder)
1411         bo = *byteorder;
1412
1413     /* Check for BOM marks (U+FEFF) in the input and adjust current
1414        byte order setting accordingly. In native mode, the leading BOM
1415        mark is skipped, in all other modes, it is copied to the output
1416        stream as-is (giving a ZWNBSP character). */
1417     if (bo == 0) {
1418         const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1419 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1420         if (bom == 0xFEFF) {
1421             q += 2;
1422             bo = -1;
1423         }
1424         else if (bom == 0xFFFE) {
1425             q += 2;
1426             bo = 1;
1427         }
1428 #else
1429         if (bom == 0xFEFF) {
1430             q += 2;
1431             bo = 1;
1432         }
1433         else if (bom == 0xFFFE) {
1434             q += 2;
1435             bo = -1;
1436         }
1437 #endif
1438     }
1439
1440     if (bo == -1) {
1441         /* force LE */
1442         ihi = 1;
1443         ilo = 0;
1444     }
1445     else if (bo == 1) {
1446         /* force BE */
1447         ihi = 0;
1448         ilo = 1;
1449     }
1450
1451     while (q < e) {
1452         Py_UNICODE ch;
1453         /* remaing bytes at the end? (size should be even) */
1454         if (e-q<2) {
1455             errmsg = "truncated data";
1456             startinpos = ((const char *)q)-starts;
1457             endinpos = ((const char *)e)-starts;
1458             goto utf16Error;
1459             /* The remaining input chars are ignored if the callback
1460                chooses to skip the input */
1461         }
1462         ch = (q[ihi] << 8) | q[ilo];
1463
1464         q += 2;
1465
1466         if (ch < 0xD800 || ch > 0xDFFF) {
1467             *p++ = ch;
1468             continue;
1469         }
1470
1471         /* UTF-16 code pair: */
1472         if (q >= e) {
1473             errmsg = "unexpected end of data";
1474             startinpos = (((const char *)q)-2)-starts;
1475             endinpos = ((const char *)e)-starts;
1476             goto utf16Error;
1477         }
1478         if (0xD800 <= ch && ch <= 0xDBFF) {
1479             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1480             q += 2;
1481             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1482 #ifndef Py_UNICODE_WIDE
1483                 *p++ = ch;
1484                 *p++ = ch2;
1485 #else
1486                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1487 #endif
1488                 continue;
1489             }
1490             else {
1491                 errmsg = "illegal UTF-16 surrogate";
1492                 startinpos = (((const char *)q)-4)-starts;
1493                 endinpos = startinpos+2;
1494                 goto utf16Error;
1495             }
1496
1497         }
1498         errmsg = "illegal encoding";
1499         startinpos = (((const char *)q)-2)-starts;
1500         endinpos = startinpos+2;
1501         /* Fall through to report the error */
1502
1503     utf16Error:
1504         outpos = p-PyUnicode_AS_UNICODE(unicode);
1505         if (unicode_decode_call_errorhandler(
1506                  errors, &errorHandler,
1507                  "utf16", errmsg,
1508                  starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1509                  (PyObject **)&unicode, &outpos, &p))
1510             goto onError;
1511     }
1512
1513     if (byteorder)
1514         *byteorder = bo;
1515
1516     /* Adjust length */
1517     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1518         goto onError;
1519
1520     Py_XDECREF(errorHandler);
1521     Py_XDECREF(exc);
1522     return (PyObject *)unicode;
1523
1524 onError:
1525     Py_DECREF(unicode);
1526     Py_XDECREF(errorHandler);
1527     Py_XDECREF(exc);
1528     return NULL;
1529 }
1530
1531 PyObject *
1532 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1533                       int size,
1534                       const char *errors,
1535                       int byteorder)
1536 {
1537     PyObject *v;
1538     unsigned char *p;
1539     int i, pairs;
1540     /* Offsets from p for storing byte pairs in the right order. */
1541 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1542     int ihi = 1, ilo = 0;
1543 #else
1544     int ihi = 0, ilo = 1;
1545 #endif
1546
1547 #define STORECHAR(CH)                   \
1548     do {                                \
1549         p[ihi] = ((CH) >> 8) & 0xff;    \
1550         p[ilo] = (CH) & 0xff;           \
1551         p += 2;                         \
1552     } while(0)
1553
1554     for (i = pairs = 0; i < size; i++)
1555         if (s[i] >= 0x10000)
1556             pairs++;
1557     v = PyString_FromStringAndSize(NULL,
1558                   2 * (size + pairs + (byteorder == 0)));
1559     if (v == NULL)
1560         return NULL;
1561
1562     p = (unsigned char *)PyString_AS_STRING(v);
1563     if (byteorder == 0)
1564         STORECHAR(0xFEFF);
1565     if (size == 0)
1566         return v;
1567
1568     if (byteorder == -1) {
1569         /* force LE */
1570         ihi = 1;
1571         ilo = 0;
1572     }
1573     else if (byteorder == 1) {
1574         /* force BE */
1575         ihi = 0;
1576         ilo = 1;
1577     }
1578
1579     while (size-- > 0) {
1580         Py_UNICODE ch = *s++;
1581         Py_UNICODE ch2 = 0;
1582         if (ch >= 0x10000) {
1583             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1584             ch  = 0xD800 | ((ch-0x10000) >> 10);
1585         }
1586         STORECHAR(ch);
1587         if (ch2)
1588             STORECHAR(ch2);
1589     }
1590     return v;
1591 #undef STORECHAR
1592 }
1593
1594 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1595 {
1596     if (!PyUnicode_Check(unicode)) {
1597         PyErr_BadArgument();
1598         return NULL;
1599     }
1600     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1601                                  PyUnicode_GET_SIZE(unicode),
1602                                  NULL,
1603                                  0);
1604 }
1605
1606 /* --- Unicode Escape Codec ----------------------------------------------- */
1607
1608 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1609
1610 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1611                                         int size,
1612                                         const char *errors)
1613 {
1614     const char *starts = s;
1615     int startinpos;
1616     int endinpos;
1617     int outpos;
1618     int i;
1619     PyUnicodeObject *v;
1620     Py_UNICODE *p;
1621     const char *end;
1622     char* message;
1623     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1624     PyObject *errorHandler = NULL;
1625     PyObject *exc = NULL;
1626
1627     /* Escaped strings will always be longer than the resulting
1628        Unicode string, so we start with size here and then reduce the
1629        length after conversion to the true value.
1630        (but if the error callback returns a long replacement string
1631        we'll have to allocate more space) */
1632     v = _PyUnicode_New(size);
1633     if (v == NULL)
1634         goto onError;
1635     if (size == 0)
1636         return (PyObject *)v;
1637
1638     p = PyUnicode_AS_UNICODE(v);
1639     end = s + size;
1640
1641     while (s < end) {
1642         unsigned char c;
1643         Py_UNICODE x;
1644         int digits;
1645
1646         /* Non-escape characters are interpreted as Unicode ordinals */
1647         if (*s != '\\') {
1648             *p++ = (unsigned char) *s++;
1649             continue;
1650         }
1651
1652         startinpos = s-starts;
1653         /* \ - Escapes */
1654         s++;
1655         switch (*s++) {
1656
1657         /* \x escapes */
1658         case '\n': break;
1659         case '\\': *p++ = '\\'; break;
1660         case '\'': *p++ = '\''; break;
1661         case '\"': *p++ = '\"'; break;
1662         case 'b': *p++ = '\b'; break;
1663         case 'f': *p++ = '\014'; break; /* FF */
1664         case 't': *p++ = '\t'; break;
1665         case 'n': *p++ = '\n'; break;
1666         case 'r': *p++ = '\r'; break;
1667         case 'v': *p++ = '\013'; break; /* VT */
1668         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1669
1670         /* \OOO (octal) escapes */
1671         case '0': case '1': case '2': case '3':
1672         case '4': case '5': case '6': case '7':
1673             x = s[-1] - '0';
1674             if ('0' <= *s && *s <= '7') {
1675                 x = (x<<3) + *s++ - '0';
1676                 if ('0' <= *s && *s <= '7')
1677                     x = (x<<3) + *s++ - '0';
1678             }
1679             *p++ = x;
1680             break;
1681
1682         /* hex escapes */
1683         /* \xXX */
1684         case 'x':
1685             digits = 2;
1686             message = "truncated \\xXX escape";
1687             goto hexescape;
1688
1689         /* \uXXXX */
1690         case 'u':
1691             digits = 4;
1692             message = "truncated \\uXXXX escape";
1693             goto hexescape;
1694
1695         /* \UXXXXXXXX */
1696         case 'U':
1697             digits = 8;
1698             message = "truncated \\UXXXXXXXX escape";
1699         hexescape:
1700             chr = 0;
1701             outpos = p-PyUnicode_AS_UNICODE(v);
1702             if (s+digits>end) {
1703                 endinpos = size;
1704                 if (unicode_decode_call_errorhandler(
1705                     errors, &errorHandler,
1706                     "unicodeescape", "end of string in escape sequence",
1707                     starts, size, &startinpos, &endinpos, &exc, &s,
1708                     (PyObject **)&v, &outpos, &p))
1709                     goto onError;
1710                 goto nextByte;
1711             }
1712             for (i = 0; i < digits; ++i) {
1713                 c = (unsigned char) s[i];
1714                 if (!isxdigit(c)) {
1715                     endinpos = (s+i+1)-starts;
1716                     if (unicode_decode_call_errorhandler(
1717                         errors, &errorHandler,
1718                         "unicodeescape", message,
1719                         starts, size, &startinpos, &endinpos, &exc, &s,
1720                         (PyObject **)&v, &outpos, &p))
1721                         goto onError;
1722                     goto nextByte;
1723                 }
1724                 chr = (chr<<4) & ~0xF;
1725                 if (c >= '0' && c <= '9')
1726                     chr += c - '0';
1727                 else if (c >= 'a' && c <= 'f')
1728                     chr += 10 + c - 'a';
1729                 else
1730                     chr += 10 + c - 'A';
1731             }
1732             s += i;
1733             if (chr == 0xffffffff)
1734                 /* _decoding_error will have already written into the
1735                    target buffer. */
1736                 break;
1737         store:
1738             /* when we get here, chr is a 32-bit unicode character */
1739             if (chr <= 0xffff)
1740                 /* UCS-2 character */
1741                 *p++ = (Py_UNICODE) chr;
1742             else if (chr <= 0x10ffff) {
1743                 /* UCS-4 character. Either store directly, or as
1744                    surrogate pair. */
1745 #ifdef Py_UNICODE_WIDE
1746                 *p++ = chr;
1747 #else
1748                 chr -= 0x10000L;
1749                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1750                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1751 #endif
1752             } else {
1753                 endinpos = s-starts;
1754                 outpos = p-PyUnicode_AS_UNICODE(v);
1755                 if (unicode_decode_call_errorhandler(
1756                     errors, &errorHandler,
1757                     "unicodeescape", "illegal Unicode character",
1758                     starts, size, &startinpos, &endinpos, &exc, &s,
1759                     (PyObject **)&v, &outpos, &p))
1760                     goto onError;
1761             }
1762             break;
1763
1764         /* \N{name} */
1765         case 'N':
1766             message = "malformed \\N character escape";
1767             if (ucnhash_CAPI == NULL) {
1768                 /* load the unicode data module */
1769                 PyObject *m, *v;
1770                 m = PyImport_ImportModule("unicodedata");
1771                 if (m == NULL)
1772                     goto ucnhashError;
1773                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1774                 Py_DECREF(m);
1775                 if (v == NULL)
1776                     goto ucnhashError;
1777                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1778                 Py_DECREF(v);
1779                 if (ucnhash_CAPI == NULL)
1780                     goto ucnhashError;
1781             }
1782             if (*s == '{') {
1783                 const char *start = s+1;
1784                 /* look for the closing brace */
1785                 while (*s != '}' && s < end)
1786                     s++;
1787                 if (s > start && s < end && *s == '}') {
1788                     /* found a name.  look it up in the unicode database */
1789                     message = "unknown Unicode character name";
1790                     s++;
1791                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1792                         goto store;
1793                 }
1794             }
1795             endinpos = s-starts;
1796             outpos = p-PyUnicode_AS_UNICODE(v);
1797             if (unicode_decode_call_errorhandler(
1798                 errors, &errorHandler,
1799                 "unicodeescape", message,
1800                 starts, size, &startinpos, &endinpos, &exc, &s,
1801                 (PyObject **)&v, &outpos, &p))
1802                 goto onError;
1803             break;
1804
1805         default:
1806             if (s > end) {
1807                 message = "\\ at end of string";
1808                 s--;
1809                 endinpos = s-starts;
1810                 outpos = p-PyUnicode_AS_UNICODE(v);
1811                 if (unicode_decode_call_errorhandler(
1812                     errors, &errorHandler,
1813                     "unicodeescape", message,
1814                     starts, size, &startinpos, &endinpos, &exc, &s,
1815                     (PyObject **)&v, &outpos, &p))
1816                     goto onError;
1817             }
1818             else {
1819                 *p++ = '\\';
1820                 *p++ = (unsigned char)s[-1];
1821             }
1822             break;
1823         }
1824         nextByte:
1825         ;
1826     }
1827     if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1828         goto onError;
1829     return (PyObject *)v;
1830
1831 ucnhashError:
1832     PyErr_SetString(
1833         PyExc_UnicodeError,
1834         "\\N escapes not supported (can't load unicodedata module)"
1835         );
1836     Py_XDECREF(errorHandler);
1837     Py_XDECREF(exc);
1838     return NULL;
1839
1840 onError:
1841     Py_XDECREF(v);
1842     Py_XDECREF(errorHandler);
1843     Py_XDECREF(exc);
1844     return NULL;
1845 }
1846
1847 /* Return a Unicode-Escape string version of the Unicode object.
1848
1849    If quotes is true, the string is enclosed in u"" or u'' quotes as
1850    appropriate: double quotes are used only when the text contains a
        single quote and no double quote.  The chosen quote character is
        backslash-escaped wherever it occurs in the text.

        A backslash is always written as two backslashes, with or without
        quotes, so that it cannot be mistaken for the start of an escape.

        s and size give the Py_UNICODE buffer and its length in code
        units.  Returns a new string object, or NULL with an exception set
        if size is too large for the output length to fit in an int or
        the output buffer cannot be allocated.
1851
1852 */
1853
1854 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1855                                   int size,
1856                                   Py_UNICODE ch);
1857
1858 static
1859 PyObject *unicodeescape_string(const Py_UNICODE *s,
1860                                int size,
1861                                int quotes)
1862 {
1863     PyObject *repr;
1864     char *p;
1865
1866     static const char *hexdigit = "0123456789abcdef";
         /* Largest number of output bytes a single input code unit can
            need: '\Uxxxxxxxx' (10) on wide builds, '\uxxxx' (6) otherwise.
            A surrogate pair is two units and yields 10 bytes, which fits
            either budget. */
     #ifdef Py_UNICODE_WIDE
         const int expandsize = 10;
     #else
         const int expandsize = 6;
     #endif
1867
         /* The buffer length below must not overflow an int: 2 bytes are
            reserved for the u' prefix and 1 for the closing quote. */
         if (size > (INT_MAX - 2 - 1) / expandsize)
             return PyErr_NoMemory();

1868     repr = PyString_FromStringAndSize(NULL, 2 + expandsize*size + 1);
1869     if (repr == NULL)
1870         return NULL;
1871
1872     p = PyString_AS_STRING(repr);
1873
1874     if (quotes) {
1875         *p++ = 'u';
1876         *p++ = (findchar(s, size, '\'') &&
1877                 !findchar(s, size, '"')) ? '"' : '\'';
1878     }
1879     while (size-- > 0) {
1880         Py_UNICODE ch = *s++;
1881
1882         /* Escape backslashes always, and the quote character (stored at
                index 1 of the output) when quoting */
1883         if ((quotes &&
1884              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
1885             *p++ = '\\';
1886             *p++ = (char) ch;
1887             continue;
1888         }
1889
1890 #ifdef Py_UNICODE_WIDE
1891         /* Map 21-bit characters to '\U00xxxxxx'; the buffer already
                has 10 bytes reserved for every unit, so no resize is
                needed here */
1892         else if (ch >= 0x10000) {
1902             *p++ = '\\';
1903             *p++ = 'U';
1904             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1905             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1906             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1907             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1908             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1909             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1910             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1911             *p++ = hexdigit[ch & 0x0000000F];
1912             continue;
1913         }
1914 #endif
1915         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes.
                The next unit is only inspected when one is left in the
                buffer; a high surrogate at the very end is isolated. */
1916         else if (ch >= 0xD800 && ch < 0xDC00 && size > 0) {
1917             Py_UNICODE ch2 = *s;
1918             Py_UCS4 ucs;
1919
1922             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
                     /* consume the low surrogate */
                     s++;
                     size--;
1923                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1924                 *p++ = '\\';
1925                 *p++ = 'U';
1926                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1927                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1928                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1929                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1930                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1931                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1932                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1933                 *p++ = hexdigit[ucs & 0x0000000F];
1934                 continue;
1935             }
1936             /* Fall through: an isolated surrogate gets a '\uxxxx'
                    escape from the branch below */
1939         }
1940
1941         /* Map 16-bit characters to '\uxxxx' */
1942         if (ch >= 256) {
1943             *p++ = '\\';
1944             *p++ = 'u';
1945             *p++ = hexdigit[(ch >> 12) & 0x000F];
1946             *p++ = hexdigit[(ch >> 8) & 0x000F];
1947             *p++ = hexdigit[(ch >> 4) & 0x000F];
1948             *p++ = hexdigit[ch & 0x000F];
1949         }
1950
1951         /* Map special whitespace to '\t', '\n', '\r' */
1952         else if (ch == '\t') {
1953             *p++ = '\\';
1954             *p++ = 't';
1955         }
1956         else if (ch == '\n') {
1957             *p++ = '\\';
1958             *p++ = 'n';
1959         }
1960         else if (ch == '\r') {
1961             *p++ = '\\';
1962             *p++ = 'r';
1963         }
1964
1965         /* Map non-printable US ASCII to '\xhh' */
1966         else if (ch < ' ' || ch >= 0x7F) {
1967             *p++ = '\\';
1968             *p++ = 'x';
1969             *p++ = hexdigit[(ch >> 4) & 0x000F];
1970             *p++ = hexdigit[ch & 0x000F];
1971         }
1972
1973         /* Copy everything else as-is */
1974         else
1975             *p++ = (char) ch;
1976     }
1977     if (quotes)
1978         *p++ = PyString_AS_STRING(repr)[1];
1979
1980     *p = '\0';
         /* Trim the string to the bytes actually written.  The status is
            not checked here; NOTE(review): presumably _PyString_Resize
            leaves repr NULL on failure, making the return below report
            the error -- confirm. */
1981     _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
1982     return repr;
1983 }
1984
     /* Encode the size code units at s as a Unicode-Escape string.

        Thin wrapper: forwards to unicodeescape_string() with quotes == 0,
        i.e. without the surrounding u'' quoting, and returns its result
        unchanged (including NULL). */
1985 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1986                                         int size)
1987 {
1988     return unicodeescape_string(s, size, 0);
1989 }
1990
     /* Unicode-Escape encode a Unicode object.

        If unicode does not pass PyUnicode_Check(), PyErr_BadArgument() is
        called and NULL is returned.  Otherwise the object's buffer and
        length are handed to PyUnicode_EncodeUnicodeEscape() and its result
        is returned. */
1991 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1992 {
1993     if (!PyUnicode_Check(unicode)) {
1994         PyErr_BadArgument();
1995         return NULL;
1996     }
1997     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1998                                          PyUnicode_GET_SIZE(unicode));
1999 }
2000
2001 /* --- Raw Unicode Escape Codec ------------------------------------------- */
2002
     /* Decode the size bytes at s from Raw-Unicode-Escape into a new
        Unicode object.

        Every byte other than a backslash becomes the code unit with the
        same ordinal.  A run of backslashes is copied through; when the run
        has odd length and is followed by 'u', the last backslash and the
        'u' are replaced by the value of the four hex digits that follow.
        If fewer than four hex digits are available (including when the
        input ends first), the error handler selected by errors is called
        with the reason "truncated \uXXXX".

        Returns the new object, or NULL with an exception set.  No byte at
        or beyond s + size is read. */
2003 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2004                                            int size,
2005                                            const char *errors)
2006 {
2007     const char *starts = s;
2008     int startinpos;
2009     int endinpos;
2010     int outpos;
2011     PyUnicodeObject *v;
2012     Py_UNICODE *p;
2013     const char *end;
2014     const char *bs;
2015     PyObject *errorHandler = NULL;
2016     PyObject *exc = NULL;
2017
2018     /* Escaped strings will always be longer than the resulting
2019        Unicode string, so we start with size here and then reduce the
2020        length after conversion to the true value. (But decoding error
2021        handler might have to resize the string) */
2022     v = _PyUnicode_New(size);
2023     if (v == NULL)
2024         goto onError;
2025     if (size == 0)
2026         return (PyObject *)v;
2027     p = PyUnicode_AS_UNICODE(v);
2028     end = s + size;
2029     while (s < end) {
2030         unsigned char c;
2031         Py_UCS4 x;
2032         int i;
2033
2034         /* Non-escape characters are interpreted as Unicode ordinals */
2035         if (*s != '\\') {
2036             *p++ = (unsigned char)*s++;
2037             continue;
2038         }
2039         startinpos = s-starts;
2040
2041         /* \u-escapes are only interpreted iff the number of leading
2042            backslashes is odd */
2043         bs = s;
2044         for (;s < end;) {
2045             if (*s != '\\')
2046                 break;
2047             *p++ = (unsigned char)*s++;
2048         }
2049         if (((s - bs) & 1) == 0 ||
2050             s >= end ||
2051             *s != 'u') {
2052             continue;
2053         }
             /* drop the backslash that introduces the escape and skip 'u' */
2054         p--;
2055         s++;
2056
2057         /* \uXXXX with 4 hex digits */
2058         outpos = p-PyUnicode_AS_UNICODE(v);
2059         for (x = 0, i = 0; i < 4; ++i, ++s) {
                 /* Treat the end of the input like a non-hex byte so that
                    a truncated escape never reads past s + size. */
2060             c = (s < end) ? (unsigned char)*s : 0;
2061             if (s >= end || !isxdigit(c)) {
2062                 endinpos = s-starts;
2063                 if (unicode_decode_call_errorhandler(
2064                     errors, &errorHandler,
2065                     "rawunicodeescape", "truncated \\uXXXX",
2066                     starts, size, &startinpos, &endinpos, &exc, &s,
2067                     (PyObject **)&v, &outpos, &p))
2068                     goto onError;
2069                 goto nextByte;
2070             }
2071             x = (x<<4) & ~0xF;
2072             if (c >= '0' && c <= '9')
2073                 x += c - '0';
2074             else if (c >= 'a' && c <= 'f')
2075                 x += 10 + c - 'a';
2076             else
2077                 x += 10 + c - 'A';
2078         }
2079         *p++ = x;
2080         nextByte:
2081         ;
2082     }
         /* shrink (or finalise) the result to the units actually written */
2083     if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2084         goto onError;
2085     Py_XDECREF(errorHandler);
2086     Py_XDECREF(exc);
2087     return (PyObject *)v;
2088
2089  onError:
2090     Py_XDECREF(v);
2091     Py_XDECREF(errorHandler);
2092     Py_XDECREF(exc);
2093     return NULL;
2094 }
2095
     /* Encode the size code units at s as a Raw-Unicode-Escape string.

        Units below 256 are copied as single bytes; every other unit is
        written as '\uxxxx' with four lowercase hex digits.  Returns a new
        string object, or NULL with an exception set if 6 * size does not
        fit in an int or the buffer cannot be allocated.

        NOTE(review): only the low 16 bits of a unit are written, so on a
        build where Py_UNICODE is wider than 16 bits a value >= 0x10000
        would lose its high bits -- confirm whether such input can occur. */
2096 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2097                                            int size)
2098 {
2099     PyObject *repr;
2100     char *p;
2101     char *q;
2102
2103     static const char *hexdigit = "0123456789abcdef";
2104
         /* Every unit may expand to the 6 bytes of '\uxxxx'; refuse sizes
            for which the buffer length would overflow an int. */
         if (size > INT_MAX / 6)
             return PyErr_NoMemory();

2105     repr = PyString_FromStringAndSize(NULL, 6 * size);
2106     if (repr == NULL)
2107         return NULL;
2108     if (size == 0)
2109         return repr;
2110
         /* q remembers the start of the buffer, p is the write cursor */
2111     p = q = PyString_AS_STRING(repr);
2112     while (size-- > 0) {
2113         Py_UNICODE ch = *s++;
2114         /* Map 16-bit characters to '\uxxxx' */
2115         if (ch >= 256) {
2116             *p++ = '\\';
2117             *p++ = 'u';
2118             *p++ = hexdigit[(ch >> 12) & 0xf];
2119             *p++ = hexdigit[(ch >> 8) & 0xf];
2120             *p++ = hexdigit[(ch >> 4) & 0xf];
2121             *p++ = hexdigit[ch & 15];
2122         }
2123         /* Copy everything else as-is */
2124         else
2125             *p++ = (char) ch;
2126     }
2127     *p = '\0';
         /* Trim to the bytes written; the status is not checked here.
            NOTE(review): presumably _PyString_Resize leaves repr NULL on
            failure -- confirm. */
2128     _PyString_Resize(&repr, p - q);
2129     return repr;
2130 }
2131
2132 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2133 {
2134     if (!PyUnicode_Check(unicode)) {
2135         PyErr_BadArgument();
2136         return NULL;
2137     }
2138     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2139                                             PyUnicode_GET_SIZE(unicode));
2140 }
2141
2142 /* --- Latin-1 Codec ------------------------------------------------------ */
2143
     /* Decode the size bytes at s as Latin-1 into a new Unicode object.

        Each byte becomes the code unit with the same ordinal.  A one-byte
        input is built through PyUnicode_FromUnicode(); anything else is
        copied into a freshly created object of length size.  The errors
        argument is not used by this function.  Returns NULL if the object
        cannot be created. */
2144 PyObject *PyUnicode_DecodeLatin1(const char *s,
2145                                  int size,
2146                                  const char *errors)
2147 {
2148     PyUnicodeObject *v;
2149     Py_UNICODE *p;
2150
2151     /* Latin-1 is equivalent to the first 256 ordinals in Unicode.
            (The "< 256" test is always true for an unsigned char, so this
            branch is taken for every one-byte input.) */
2152     if (size == 1 && *(unsigned char*)s < 256) {
2153         Py_UNICODE r = *(unsigned char*)s;
2154         return PyUnicode_FromUnicode(&r, 1);
2155     }
2156
2157     v = _PyUnicode_New(size);
2158     if (v == NULL)
2159         goto onError;
2160     if (size == 0)
2161         return (PyObject *)v;
2162     p = PyUnicode_AS_UNICODE(v);
         /* widen byte by byte; the cast keeps bytes >= 0x80 positive */
2163     while (size-- > 0)
2164         *p++ = (unsigned char)*s++;
2165     return (PyObject *)v;
2166
         /* only reached with v == NULL, so the XDECREF is a no-op */
2167  onError:
2168     Py_XDECREF(v);
2169     return NULL;
2170 }
2171
2172 /* Create or adjust a UnicodeEncodeError.

        If *exceptionObject is NULL a new exception is created from the
        given encoding, input buffer, position range and reason.  Otherwise
        the existing object has its start, end and reason overwritten; if
        any of those updates fails the object is released and
        *exceptionObject is reset to NULL.

        Callers detect failure by checking *exceptionObject for NULL
        afterwards. */
2173 static void make_encode_exception(PyObject **exceptionObject,
2174     const char *encoding,
2175     const Py_UNICODE *unicode, int size,
2176     int startpos, int endpos,
2177     const char *reason)
2178 {
2179     if (*exceptionObject == NULL) {
2180         *exceptionObject = PyUnicodeEncodeError_Create(
2181             encoding, unicode, size, startpos, endpos, reason);
2182     }
2183     else {
2184         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2185             goto onError;
2186         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2187             goto onError;
2188         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2189             goto onError;
2190         return;
             /* a setter failed: drop the half-updated exception object */
2191         onError:
2192         Py_DECREF(*exceptionObject);
2193         *exceptionObject = NULL;
2194     }
2195 }
2196
2197 /* Raises a UnicodeEncodeError.

        Builds or updates *exceptionObject through make_encode_exception()
        and, if that left a non-NULL object, passes it to
        PyCodec_StrictErrors().  Nothing further is done when
        *exceptionObject is NULL afterwards. */
2198 static void raise_encode_exception(PyObject **exceptionObject,
2199     const char *encoding,
2200     const Py_UNICODE *unicode, int size,
2201     int startpos, int endpos,
2202     const char *reason)
2203 {
2204     make_encode_exception(exceptionObject,
2205         encoding, unicode, size, startpos, endpos, reason);
2206     if (*exceptionObject != NULL)
2207         PyCodec_StrictErrors(*exceptionObject);
2208 }
2209
2210 /* Error handling callback helper:
2211    build arguments, call the callback and check the arguments,
2212    put the result into newpos and return the replacement string, which
2213    has to be freed by the caller.

        *errorHandler caches the callback: when it is NULL it is looked up
        from errors with PyCodec_LookupError() and stored.  *exceptionObject
        is created or updated to describe the range startpos..endpos of the
        size-unit input at unicode.

        The callback must return a (unicode, int) tuple.  A negative
        position is taken relative to size; the adjusted position must lie
        in 0..size, otherwise IndexError is set.

        Returns a new reference to the replacement unicode object, or NULL
        with an exception set. */
2214 static PyObject *unicode_encode_call_errorhandler(const char *errors,
2215     PyObject **errorHandler,
2216     const char *encoding, const char *reason,
2217     const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2218     int startpos, int endpos,
2219     int *newpos)
2220 {
         /* PyArg_ParseTuple format; the text after the ';' doubles as the
            error message used below */
2221     static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2222
2223     PyObject *restuple;
2224     PyObject *resunicode;
2225
2226     if (*errorHandler == NULL) {
2227         *errorHandler = PyCodec_LookupError(errors);
2228         if (*errorHandler == NULL)
2229             return NULL;
2230     }
2231
2232     make_encode_exception(exceptionObject,
2233         encoding, unicode, size, startpos, endpos, reason);
2234     if (*exceptionObject == NULL)
2235         return NULL;
2236
2237     restuple = PyObject_CallFunctionObjArgs(
2238         *errorHandler, *exceptionObject, NULL);
2239     if (restuple == NULL)
2240         return NULL;
2241     if (!PyTuple_Check(restuple)) {
             /* &argparse[4] skips the 4-character "O!i;" prefix, leaving
                just the message text */
2242         PyErr_Format(PyExc_TypeError, &argparse[4]);
2243         Py_DECREF(restuple);
2244         return NULL;
2245     }
2246     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2247         &resunicode, newpos)) {
2248         Py_DECREF(restuple);
2249         return NULL;
2250     }
         /* negative positions count from the end of the input */
2251     if (*newpos<0)
2252         *newpos = size+*newpos;
2253     if (*newpos<0 || *newpos>size) {
2254         PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2255         Py_DECREF(restuple);
2256         return NULL;
2257     }
         /* take our own reference to the replacement before the tuple that
            holds it is released */
2258     Py_INCREF(resunicode);
2259     Py_DECREF(restuple);
2260     return resunicode;
2261 }
2262
     /* Shared encoder for the Latin-1 and ASCII codecs.

        Encodes the size code units at p into a new string object, writing
        every unit below limit as one byte.  limit == 256 selects the
        "latin-1" encoding name and reason text; any other value selects
        the "ascii" ones.

        Each maximal run of units >= limit is handled according to errors:
        NULL or "strict" raise a UnicodeEncodeError, "replace" writes one
        '?' per unit, "ignore" drops the run, "xmlcharrefreplace" writes
        "&#N;" per unit, and any other name is dispatched to the error
        callback through unicode_encode_call_errorhandler().

        Returns the new string, or NULL with an exception set. */
2263 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2264                                  int size,
2265                                  const char *errors,
2266                                  int limit)
2267 {
2268     /* output object */
2269     PyObject *res;
2270     /* pointers to the beginning and end+1 of input */
2271     const Py_UNICODE *startp = p;
2272     const Py_UNICODE *endp = p + size;
2273     /* pointer to the beginning of the unencodable characters */
2274     /* const Py_UNICODE *badp = NULL; */
2275     /* pointer into the output */
2276     char *str;
2277     /* current output position */
2278     int respos = 0;
         /* current allocated size of the output */
2279     int ressize;
2280     char *encoding = (limit == 256) ? "latin-1" : "ascii";
2281     char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2282     PyObject *errorHandler = NULL;
2283     PyObject *exc = NULL;
2284     /* the following variable is used for caching string comparisons
2285      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2286     int known_errorHandler = -1;
2287
2288     /* allocate enough for a simple encoding without
2289        replacements, if we need more, we'll resize */
2290     res = PyString_FromStringAndSize(NULL, size);
2291     if (res == NULL)
2292         goto onError;
2293     if (size == 0)
2294         return res;
2295     str = PyString_AS_STRING(res);
2296     ressize = size;
2297
2298     while (p<endp) {
2299         Py_UNICODE c = *p;
2300
2301         /* can we encode this? */
2302         if (c<limit) {
2303             /* no overflow check, because we know that the space is enough */
2304             *str++ = (char)c;
2305             ++p;
2306         }
2307         else {
                 /* input index of the first unencodable unit of this run */
2308             int unicodepos = p-startp;
2309             int requiredsize;
2310             PyObject *repunicode;
2311             int repsize;
2312             int newpos;
                 /* NOTE: shadows the function-level respos inside this
                    else-block */
2313             int respos;
2314             Py_UNICODE *uni2;
2315             /* startpos for collecting unencodable chars */
2316             const Py_UNICODE *collstart = p;
2317             const Py_UNICODE *collend = p;
2318             /* find all unencodable characters */
2319             while ((collend < endp) && ((*collend)>=limit))
2320                 ++collend;
2321             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2322             if (known_errorHandler==-1) {
2323                 if ((errors==NULL) || (!strcmp(errors, "strict")))
2324                     known_errorHandler = 1;
2325                 else if (!strcmp(errors, "replace"))
2326                     known_errorHandler = 2;
2327                 else if (!strcmp(errors, "ignore"))
2328                     known_errorHandler = 3;
2329                 else if (!strcmp(errors, "xmlcharrefreplace"))
2330                     known_errorHandler = 4;
2331                 else
2332                     known_errorHandler = 0;
2333             }
2334             switch (known_errorHandler) {
2335                 case 1: /* strict */
2336                     raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2337                     goto onError;
2338                 case 2: /* replace */
                         /* one '?' per unencodable unit; this fits because
                            each input unit already has one output byte
                            reserved */
2339                     while (collstart++<collend)
2340                         *str++ = '?'; /* fall through */
2341                 case 3: /* ignore */
2342                     p = collend;
2343                     break;
2344                 case 4: /* xmlcharrefreplace */
2345                     respos = str-PyString_AS_STRING(res);
2346                     /* determine replacement size (temporarily (mis)uses p):
                            2 bytes for "&#", the decimal digits, 1 for ";" */
2347                     for (p = collstart, repsize = 0; p < collend; ++p) {
2348                         if (*p<10)
2349                             repsize += 2+1+1;
2350                         else if (*p<100)
2351                             repsize += 2+2+1;
2352                         else if (*p<1000)
2353                             repsize += 2+3+1;
2354                         else if (*p<10000)
2355                             repsize += 2+4+1;
2356                         else if (*p<100000)
2357                             repsize += 2+5+1;
2358                         else if (*p<1000000)
2359                             repsize += 2+6+1;
2360                         else
2361                             repsize += 2+7+1;
2362                     }
                         /* bytes written so far + replacement + one byte
                            for every unit after the run */
2363                     requiredsize = respos+repsize+(endp-collend);
2364                     if (requiredsize > ressize) {
                             /* grow at least geometrically */
2365                         if (requiredsize<2*ressize)
2366                             requiredsize = 2*ressize;
2367                         if (_PyString_Resize(&res, requiredsize))
2368                             goto onError;
                             /* the buffer may have moved: re-derive str */
2369                         str = PyString_AS_STRING(res) + respos;
2370                         ressize = requiredsize;
2371                     }
2372                     /* generate replacement (temporarily (mis)uses p) */
2373                     for (p = collstart; p < collend; ++p) {
2374                         str += sprintf(str, "&#%d;", (int)*p);
2375                     }
2376                     p = collend;
2377                     break;
2378                 default:
                         /* any other handler name: ask the error callback
                            for a replacement and a resume position */
2379                     repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2380                         encoding, reason, startp, size, &exc,
2381                         collstart-startp, collend-startp, &newpos);
2382                     if (repunicode == NULL)
2383                         goto onError;
2384                     /* need more space? (at least enough for what we
2385                        have+the replacement+the rest of the string, so
2386                        we won't have to check space for encodable characters) */
2387                     respos = str-PyString_AS_STRING(res);
2388                     repsize = PyUnicode_GET_SIZE(repunicode);
2389                     requiredsize = respos+repsize+(endp-collend);
2390                     if (requiredsize > ressize) {
2391                         if (requiredsize<2*ressize)
2392                             requiredsize = 2*ressize;
2393                         if (_PyString_Resize(&res, requiredsize)) {
2394                             Py_DECREF(repunicode);
2395                             goto onError;
2396                         }
2397                         str = PyString_AS_STRING(res) + respos;
2398                         ressize = requiredsize;
2399                     }
2400                     /* check if there is anything unencodable in the replacement
2401                        and copy it to the output */
2402                     for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2403                         c = *uni2;
2404                         if (c >= limit) {
                                 /* the replacement itself is unencodable:
                                    report the first unit of the run */
2405                             raise_encode_exception(&exc, encoding, startp, size,
2406                                 unicodepos, unicodepos+1, reason);
2407                             Py_DECREF(repunicode);
2408                             goto onError;
2409                         }
2410                         *str = (char)c;
2411                     }
                         /* resume where the callback asked us to */
2412                     p = startp + newpos;
2413                     Py_DECREF(repunicode);
2414             }
2415         }
2416     }
2417     /* Resize if we allocated too much */
2418     respos = str-PyString_AS_STRING(res);
2419     if (respos<ressize)
2420        /* If this fails res will be NULL */
2421         _PyString_Resize(&res, respos);
2422     Py_XDECREF(errorHandler);
2423     Py_XDECREF(exc);
2424     return res;
2425
2426     onError:
2427     Py_XDECREF(res);
2428     Py_XDECREF(errorHandler);
2429     Py_XDECREF(exc);
2430     return NULL;
2431 }
2432
     /* Encode the size code units at p as Latin-1.

        Thin wrapper around unicode_encode_ucs1() with limit 256; errors is
        passed through unchanged and the helper's result is returned. */
2433 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2434                                  int size,
2435                                  const char *errors)
2436 {
2437     return unicode_encode_ucs1(p, size, errors, 256);
2438 }
2439
     /* Latin-1 encode a Unicode object.

        If unicode does not pass PyUnicode_Check(), PyErr_BadArgument() is
        called and NULL is returned.  Otherwise the object's buffer and
        length are encoded with PyUnicode_EncodeLatin1(), passing NULL as
        the errors argument. */
2440 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2441 {
2442     if (!PyUnicode_Check(unicode)) {
2443         PyErr_BadArgument();
2444         return NULL;
2445     }
2446     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2447                                   PyUnicode_GET_SIZE(unicode),
2448                                   NULL);
2449 }
2450
2451 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2452
     /* Decode the size bytes at s as 7-bit ASCII into a new Unicode object.

        Bytes below 128 become the code unit with the same ordinal.  Every
        other byte is reported, one byte at a time, to the error handler
        selected by errors with the reason "ordinal not in range(128)"; the
        handler decides what is written and where decoding resumes.

        Returns the new object, or NULL with an exception set. */
2453 PyObject *PyUnicode_DecodeASCII(const char *s,
2454                                 int size,
2455                                 const char *errors)
2456 {
2457     const char *starts = s;
2458     PyUnicodeObject *v;
2459     Py_UNICODE *p;
2460     int startinpos;
2461     int endinpos;
2462     int outpos;
2463     const char *e;
2464     PyObject *errorHandler = NULL;
2465     PyObject *exc = NULL;
2466
2467     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2468     if (size == 1 && *(unsigned char*)s < 128) {
2469         Py_UNICODE r = *(unsigned char*)s;
2470         return PyUnicode_FromUnicode(&r, 1);
2471     }
2472
2473     v = _PyUnicode_New(size);
2474     if (v == NULL)
2475         goto onError;
2476     if (size == 0)
2477         return (PyObject *)v;
2478     p = PyUnicode_AS_UNICODE(v);
2479     e = s + size;
2480     while (s < e) {
2481         register unsigned char c = (unsigned char)*s;
2482         if (c < 128) {
2483             *p++ = c;
2484             ++s;
2485         }
2486         else {
                 /* hand exactly this one byte to the error handler, which
                    updates s and p (and may reallocate v) */
2487             startinpos = s-starts;
2488             endinpos = startinpos + 1;
2489             outpos = p-PyUnicode_AS_UNICODE(v);
2490             if (unicode_decode_call_errorhandler(
2491                  errors, &errorHandler,
2492                  "ascii", "ordinal not in range(128)",
2493                  starts, size, &startinpos, &endinpos, &exc, &s,
2494                  (PyObject **)&v, &outpos, &p))
2495                 goto onError;
2496         }
2497     }
         /* v is a Unicode object, so its length must be read with the
            Unicode accessor; shrink only if fewer units were written */
2498     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2499         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2500             goto onError;
2501     Py_XDECREF(errorHandler);
2502     Py_XDECREF(exc);
2503     return (PyObject *)v;
2504
2505  onError:
2506     Py_XDECREF(v);
2507     Py_XDECREF(errorHandler);
2508     Py_XDECREF(exc);
2509     return NULL;
2510 }
2511
     /* Encode the size code units at p as 7-bit ASCII.

        Thin wrapper around unicode_encode_ucs1() with limit 128; errors is
        passed through unchanged and the helper's result is returned. */
2512 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2513                                 int size,
2514                                 const char *errors)
2515 {
2516     return unicode_encode_ucs1(p, size, errors, 128);
2517 }
2518
     /* ASCII encode a Unicode object.

        If unicode does not pass PyUnicode_Check(), PyErr_BadArgument() is
        called and NULL is returned.  Otherwise the object's buffer and
        length are encoded with PyUnicode_EncodeASCII(), passing NULL as
        the errors argument. */
2519 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2520 {
2521     if (!PyUnicode_Check(unicode)) {
2522         PyErr_BadArgument();
2523         return NULL;
2524     }
2525     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2526                                  PyUnicode_GET_SIZE(unicode),
2527                                  NULL);
2528 }
2529
2530 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
2531
2532 /* --- MBCS codecs for Windows -------------------------------------------- */
2533
     /* Decode the size bytes at s into a new Unicode object using
        MultiByteToWideChar() with the CP_ACP code page.

        The conversion is called twice: once without an output buffer to
        learn the result length, then again to fill the new object.  The
        errors argument is not used by this function.  On a conversion
        failure the result of PyErr_SetFromWindowsErrWithFilename(0, NULL)
        is returned. */
2534 PyObject *PyUnicode_DecodeMBCS(const char *s,
2535                                 int size,
2536                                 const char *errors)
2537 {
2538     PyUnicodeObject *v;
2539     Py_UNICODE *p;
2540
2541     /* First get the size of the result */
2542     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
         /* a zero length is only an error when there was input to convert */
2543     if (size > 0 && usize==0)
2544         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2545
2546     v = _PyUnicode_New(usize);
2547     if (v == NULL)
2548         return NULL;
2549     if (usize == 0)
2550         return (PyObject *)v;
2551     p = PyUnicode_AS_UNICODE(v);
         /* second pass: convert into the object's buffer */
2552     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2553         Py_DECREF(v);
2554         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2555     }
2556
2557     return (PyObject *)v;
2558 }
2559
     /* Encode the size code units at p into a new string object using
        WideCharToMultiByte() with the CP_ACP code page.

        Empty input yields an empty string.  Otherwise the conversion is
        called twice: once without an output buffer to learn the result
        length, then again to fill the new string.  The errors argument is
        not used by this function.  On a conversion failure the result of
        PyErr_SetFromWindowsErrWithFilename(0, NULL) is returned. */
2560 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2561                                 int size,
2562                                 const char *errors)
2563 {
2564     PyObject *repr;
2565     char *s;
2566     DWORD mbcssize;
2567
2568     /* If there are no characters, bail now! */
2569     if (size==0)
2570             return PyString_FromString("");
2571
2572     /* First get the size of the result */
2573     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2574     if (mbcssize==0)
2575         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2576
2577     repr = PyString_FromStringAndSize(NULL, mbcssize);
2578     if (repr == NULL)
2579         return NULL;
         /* cannot trigger: mbcssize == 0 already returned above */
2580     if (mbcssize == 0)
2581         return repr;
2582
2583     /* Do the conversion */
2584     s = PyString_AS_STRING(repr);
2585     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2586         Py_DECREF(repr);
2587         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2588     }
2589     return repr;
2590 }
2591
2592 #endif /* MS_WINDOWS */
2593
2594 /* --- Character Mapping Codec -------------------------------------------- */
2595
     /* Decode the size bytes at s into a new Unicode object by looking
        each byte's ordinal up in mapping.

        With mapping == NULL the input is decoded as Latin-1.  Otherwise
        mapping[ordinal] may be:
          - an integer in range(65536): written as that code unit;
          - None, or a lookup that raises LookupError: the byte is
            undefined and is reported to the error handler selected by
            errors with the reason "character maps to <undefined>";
          - a unicode string: its units are written in place of the byte
            (an empty string drops the byte).
        Any other result raises TypeError.

        Returns the new object, or NULL with an exception set. */
2596 PyObject *PyUnicode_DecodeCharmap(const char *s,
2597                                   int size,
2598                                   PyObject *mapping,
2599                                   const char *errors)
2600 {
2601     const char *starts = s;
2602     int startinpos;
2603     int endinpos;
2604     int outpos;
2605     const char *e;
2606     PyUnicodeObject *v;
2607     Py_UNICODE *p;
         /* spare output units left over from the last 1-n resize */
2608     int extrachars = 0;
2609     PyObject *errorHandler = NULL;
2610     PyObject *exc = NULL;
2611
2612     /* Default to Latin-1 */
2613     if (mapping == NULL)
2614         return PyUnicode_DecodeLatin1(s, size, errors);
2615
2616     v = _PyUnicode_New(size);
2617     if (v == NULL)
2618         goto onError;
2619     if (size == 0)
2620         return (PyObject *)v;
2621     p = PyUnicode_AS_UNICODE(v);
2622     e = s + size;
2623     while (s < e) {
2624         unsigned char ch = *s;
2625         PyObject *w, *x;
2626
2627         /* Get mapping (char ordinal -> integer, Unicode char or None) */
2628         w = PyInt_FromLong((long)ch);
2629         if (w == NULL)
2630             goto onError;
2631         x = PyObject_GetItem(mapping, w);
2632         Py_DECREF(w);
2633         if (x == NULL) {
2634             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2635                 /* No mapping found means: mapping is undefined. */
2636                 PyErr_Clear();
2637                 x = Py_None;
2638                 Py_INCREF(x);
2639             } else
2640                 goto onError;
2641         }
2642
2643         /* Apply mapping */
2644         if (PyInt_Check(x)) {
2645             long value = PyInt_AS_LONG(x);
2646             if (value < 0 || value > 65535) {
2647                 PyErr_SetString(PyExc_TypeError,
2648                                 "character mapping must be in range(65536)");
2649                 Py_DECREF(x);
2650                 goto onError;
2651             }
2652             *p++ = (Py_UNICODE)value;
2653         }
2654         else if (x == Py_None) {
2655             /* undefined mapping */
2656             outpos = p-PyUnicode_AS_UNICODE(v);
2657             startinpos = s-starts;
2658             endinpos = startinpos+1;
2659             if (unicode_decode_call_errorhandler(
2660                  errors, &errorHandler,
2661                  "charmap", "character maps to <undefined>",
2662                  starts, size, &startinpos, &endinpos, &exc, &s,
2663                  (PyObject **)&v, &outpos, &p)) {
2664                 Py_DECREF(x);
2665                 goto onError;
2666             }
                 /* The handler has already advanced s, so the shared
                    "++s" at the bottom of the loop is skipped; the
                    reference to x still has to be released here. */
                 Py_DECREF(x);
2667             continue;
2668         }
2669         else if (PyUnicode_Check(x)) {
2670             int targetsize = PyUnicode_GET_SIZE(x);
2671
2672             if (targetsize == 1)
2673                 /* 1-1 mapping */
2674                 *p++ = *PyUnicode_AS_UNICODE(x);
2675
2676             else if (targetsize > 1) {
2677                 /* 1-n mapping */
2678                 if (targetsize > extrachars) {
2679                     /* resize first */
2680                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
                         /* grow by the shortfall plus 4x the replacement
                            length, so later 1-n mappings can reuse the
                            slack */
2681                     int needed = (targetsize - extrachars) + \
2682                                  (targetsize << 2);
2683                     extrachars += needed;
2684                     if (_PyUnicode_Resize(&v,
2685                                          PyUnicode_GET_SIZE(v) + needed)) {
2686                         Py_DECREF(x);
2687                         goto onError;
2688                     }
                         /* the buffer may have moved: re-derive p */
2689                     p = PyUnicode_AS_UNICODE(v) + oldpos;
2690                 }
2691                 Py_UNICODE_COPY(p,
2692                                 PyUnicode_AS_UNICODE(x),
2693                                 targetsize);
2694                 p += targetsize;
2695                 extrachars -= targetsize;
2696             }
2697             /* 1-0 mapping: skip the character */
2698         }
2699         else {
2700             /* wrong return value */
2701             PyErr_SetString(PyExc_TypeError,
2702                   "character mapping must return integer, None or unicode");
2703             Py_DECREF(x);
2704             goto onError;
2705         }
2706         Py_DECREF(x);
2707         ++s;
2708     }
         /* shrink only if fewer units were written than allocated */
2709     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2710         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2711             goto onError;
2712     Py_XDECREF(errorHandler);
2713     Py_XDECREF(exc);
2714     return (PyObject *)v;
2715
2716  onError:
2717     Py_XDECREF(errorHandler);
2718     Py_XDECREF(exc);
2719     Py_XDECREF(v);
2720     return NULL;
2721 }
2722
2723 /* Lookup the character ch in the mapping. If the character
2724    can't be found, Py_None is returned (or NULL, if another
2725    error occured). */
2726 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
2727 {
2728     PyObject *w = PyInt_FromLong((long)c);
2729     PyObject *x;
2730
2731     if (w == NULL)
2732          return NULL;
2733     x = PyObject_GetItem(mapping, w);
2734     Py_DECREF(w);
2735     if (x == NULL) {
2736         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2737             /* No mapping found means: mapping is undefined. */
2738             PyErr_Clear();
2739             x = Py_None;
2740             Py_INCREF(x);
2741             return x;
2742         } else
2743             return NULL;
2744     }
2745     else if (x == Py_None)
2746         return x;
2747     else if (PyInt_Check(x)) {
2748         long value = PyInt_AS_LONG(x);
2749         if (value < 0 || value > 255) {
2750             PyErr_SetString(PyExc_TypeError,
2751                              "character mapping must be in range(256)");
2752             Py_DECREF(x);
2753             return NULL;
2754         }
2755         return x;
2756     }
2757     else if (PyString_Check(x))
2758         return x;
2759     else {
2760         /* wrong return value */
2761         PyErr_SetString(PyExc_TypeError,
2762               "character mapping must return integer, None or str");
2763         Py_DECREF(x);
2764         return NULL;
2765     }
2766 }
2767
2768 /* lookup the character, put the result in the output string and adjust
2769    various state variables. Reallocate the output string if not enough
2770    space is available. Return a new reference to the object that
2771    was put in the output buffer, or Py_None, if the mapping was undefined
2772    (in which case no character was written) or NULL, if a
2773    reallocation error ocurred. The called must decref the result */
2774 static
2775 PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2776     PyObject **outobj, int *outpos)
2777 {
2778     PyObject *rep = charmapencode_lookup(c, mapping);
2779
2780     if (rep==NULL)
2781         return NULL;
2782     else if (rep==Py_None)
2783         return rep;
2784     else {
2785         char *outstart = PyString_AS_STRING(*outobj);
2786         int outsize = PyString_GET_SIZE(*outobj);
2787         if (PyInt_Check(rep)) {
2788             int requiredsize = *outpos+1;
2789             if (outsize<requiredsize) {
2790                 /* exponentially overallocate to minimize reallocations */
2791                 if (requiredsize < 2*outsize)
2792                     requiredsize = 2*outsize;
2793                 if (_PyString_Resize(outobj, requiredsize)) {
2794                     Py_DECREF(rep);
2795                     return NULL;
2796                 }
2797                 outstart = PyString_AS_STRING(*outobj);
2798             }
2799             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2800         }
2801         else {
2802             const char *repchars = PyString_AS_STRING(rep);
2803             int repsize = PyString_GET_SIZE(rep);
2804             int requiredsize = *outpos+repsize;
2805             if (outsize<requiredsize) {
2806                 /* exponentially overallocate to minimize reallocations */
2807                 if (requiredsize < 2*outsize)
2808                     requiredsize = 2*outsize;
2809                 if (_PyString_Resize(outobj, requiredsize)) {
2810                     Py_DECREF(rep);
2811                     return NULL;
2812                 }
2813                 outstart = PyString_AS_STRING(*outobj);
2814             }
2815             memcpy(outstart + *outpos, repchars, repsize);
2816             *outpos += repsize;
2817         }
2818     }
2819     return rep;
2820 }
2821
2822 /* handle an error in PyUnicode_EncodeCharmap
2823    Return 0 on success, -1 on error */
2824 static
2825 int charmap_encoding_error(
2826     const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2827     PyObject **exceptionObject,
2828     int *known_errorHandler, PyObject *errorHandler, const char *errors,
2829     PyObject **res, int *respos)
2830 {
2831     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2832     int repsize;
2833     int newpos;
2834     Py_UNICODE *uni2;
2835     /* startpos for collecting unencodable chars */
2836     int collstartpos = *inpos;
2837     int collendpos = *inpos+1;
2838     int collpos;
2839     char *encoding = "charmap";
2840     char *reason = "character maps to <undefined>";
2841
2842     PyObject *x;
2843     /* find all unencodable characters */
2844     while (collendpos < size) {
2845         x = charmapencode_lookup(p[collendpos], mapping);
2846         if (x==NULL)
2847             return -1;
2848         else if (x!=Py_None) {
2849             Py_DECREF(x);
2850             break;
2851         }
2852         Py_DECREF(x);
2853         ++collendpos;
2854     }
2855     /* cache callback name lookup
2856      * (if not done yet, i.e. it's the first error) */
2857     if (*known_errorHandler==-1) {
2858         if ((errors==NULL) || (!strcmp(errors, "strict")))
2859             *known_errorHandler = 1;
2860         else if (!strcmp(errors, "replace"))
2861             *known_errorHandler = 2;
2862         else if (!strcmp(errors, "ignore"))
2863             *known_errorHandler = 3;
2864         else if (!strcmp(errors, "xmlcharrefreplace"))
2865             *known_errorHandler = 4;
2866         else
2867             *known_errorHandler = 0;
2868     }
2869     switch (*known_errorHandler) {
2870         case 1: /* strict */
2871             raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2872             return -1;
2873         case 2: /* replace */
2874             for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2875                 x = charmapencode_output('?', mapping, res, respos);
2876                 if (x==NULL) {
2877                     return -1;
2878                 }
2879                 else if (x==Py_None) {
2880                     Py_DECREF(x);
2881                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2882                     return -1;
2883                 }
2884                 Py_DECREF(x);
2885             }
2886             /* fall through */
2887         case 3: /* ignore */
2888             *inpos = collendpos;
2889             break;
2890         case 4: /* xmlcharrefreplace */
2891             /* generate replacement (temporarily (mis)uses p) */
2892             for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2893                 char buffer[2+29+1+1];
2894                 char *cp;
2895                 sprintf(buffer, "&#%d;", (int)p[collpos]);
2896                 for (cp = buffer; *cp; ++cp) {
2897                     x = charmapencode_output(*cp, mapping, res, respos);
2898                     if (x==NULL)
2899                         return -1;
2900                     else if (x==Py_None) {
2901                         Py_DECREF(x);
2902                         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2903                         return -1;
2904                     }
2905                     Py_DECREF(x);
2906                 }
2907             }
2908             *inpos = collendpos;
2909             break;
2910         default:
2911             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2912                 encoding, reason, p, size, exceptionObject,
2913                 collstartpos, collendpos, &newpos);
2914             if (repunicode == NULL)
2915                 return -1;
2916             /* generate replacement  */
2917             repsize = PyUnicode_GET_SIZE(repunicode);
2918             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2919                 x = charmapencode_output(*uni2, mapping, res, respos);
2920                 if (x==NULL) {
2921                     Py_DECREF(repunicode);
2922                     return -1;
2923                 }
2924                 else if (x==Py_None) {
2925                     Py_DECREF(repunicode);
2926                     Py_DECREF(x);
2927                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2928                     return -1;
2929                 }
2930                 Py_DECREF(x);
2931             }
2932             *inpos = newpos;
2933             Py_DECREF(repunicode);
2934     }
2935     return 0;
2936 }
2937
2938 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2939                                   int size,
2940                                   PyObject *mapping,
2941                                   const char *errors)
2942 {
2943     /* output object */
2944     PyObject *res = NULL;
2945     /* current input position */
2946     int inpos = 0;
2947     /* current output position */
2948     int respos = 0;
2949     PyObject *errorHandler = NULL;
2950     PyObject *exc = NULL;
2951     /* the following variable is used for caching string comparisons
2952      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2953      * 3=ignore, 4=xmlcharrefreplace */
2954     int known_errorHandler = -1;
2955
2956     /* Default to Latin-1 */
2957     if (mapping == NULL)
2958         return PyUnicode_EncodeLatin1(p, size, errors);
2959
2960     /* allocate enough for a simple encoding without
2961        replacements, if we need more, we'll resize */
2962     res = PyString_FromStringAndSize(NULL, size);
2963     if (res == NULL)
2964         goto onError;
2965     if (size == 0)
2966         return res;
2967
2968     while (inpos<size) {
2969         /* try to encode it */
2970         PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2971         if (x==NULL) /* error */
2972             goto onError;
2973         if (x==Py_None) { /* unencodable character */
2974             if (charmap_encoding_error(p, size, &inpos, mapping,
2975                 &exc,
2976                 &known_errorHandler, errorHandler, errors,
2977                 &res, &respos))
2978                 goto onError;
2979         }
2980         else
2981             /* done with this character => adjust input position */
2982             ++inpos;
2983         Py_DECREF(x);
2984     }
2985
2986     /* Resize if we allocated to much */
2987     if (respos<PyString_GET_SIZE(res)) {
2988         if (_PyString_Resize(&res, respos))
2989             goto onError;
2990     }
2991     Py_XDECREF(exc);
2992     Py_XDECREF(errorHandler);
2993     return res;
2994
2995     onError:
2996     Py_XDECREF(res);
2997     Py_XDECREF(exc);
2998     Py_XDECREF(errorHandler);
2999     return NULL;
3000 }
3001
3002 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3003                                     PyObject *mapping)
3004 {
3005     if (!PyUnicode_Check(unicode) || mapping == NULL) {
3006         PyErr_BadArgument();
3007         return NULL;
3008     }
3009     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3010                                    PyUnicode_GET_SIZE(unicode),
3011                                    mapping,
3012                                    NULL);
3013 }
3014
3015 /* create or adjust a UnicodeTranslateError */
3016 static void make_translate_exception(PyObject **exceptionObject,
3017     const Py_UNICODE *unicode, int size,
3018     int startpos, int endpos,
3019     const char *reason)
3020 {
3021     if (*exceptionObject == NULL) {
3022         *exceptionObject = PyUnicodeTranslateError_Create(
3023             unicode, size, startpos, endpos, reason);
3024     }
3025     else {
3026         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3027             goto onError;
3028         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3029             goto onError;
3030         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3031             goto onError;
3032         return;
3033         onError:
3034         Py_DECREF(*exceptionObject);
3035         *exceptionObject = NULL;
3036     }
3037 }
3038
3039 /* raises a UnicodeTranslateError */
3040 static void raise_translate_exception(PyObject **exceptionObject,
3041     const Py_UNICODE *unicode, int size,
3042     int startpos, int endpos,
3043     const char *reason)
3044 {
3045     make_translate_exception(exceptionObject,
3046         unicode, size, startpos, endpos, reason);
3047     if (*exceptionObject != NULL)
3048         PyCodec_StrictErrors(*exceptionObject);
3049 }
3050
3051 /* error handling callback helper:
3052    build arguments, call the callback and check the arguments,
3053    put the result into newpos and return the replacement string, which
3054    has to be freed by the caller */
3055 static PyObject *unicode_translate_call_errorhandler(const char *errors,
3056     PyObject **errorHandler,
3057     const char *reason,
3058     const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3059     int startpos, int endpos,
3060     int *newpos)
3061 {
3062     static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3063
3064     PyObject *restuple;
3065     PyObject *resunicode;
3066
3067     if (*errorHandler == NULL) {
3068         *errorHandler = PyCodec_LookupError(errors);
3069         if (*errorHandler == NULL)
3070             return NULL;
3071     }
3072
3073     make_translate_exception(exceptionObject,
3074         unicode, size, startpos, endpos, reason);
3075     if (*exceptionObject == NULL)
3076         return NULL;
3077
3078     restuple = PyObject_CallFunctionObjArgs(
3079         *errorHandler, *exceptionObject, NULL);
3080     if (restuple == NULL)
3081         return NULL;
3082     if (!PyTuple_Check(restuple)) {
3083         PyErr_Format(PyExc_TypeError, &argparse[4]);
3084         Py_DECREF(restuple);
3085         return NULL;
3086     }
3087     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3088         &resunicode, newpos)) {
3089         Py_DECREF(restuple);
3090         return NULL;
3091     }
3092     if (*newpos<0)
3093         *newpos = size+*newpos;
3094     if (*newpos<0 || *newpos>size) {
3095         PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3096         Py_DECREF(restuple);
3097         return NULL;
3098     }
3099     Py_INCREF(resunicode);
3100     Py_DECREF(restuple);
3101     return resunicode;
3102 }
3103
3104 /* Lookup the character ch in the mapping and put the result in result,
3105    which must be decrefed by the caller.
3106    Return 0 on success, -1 on error */
3107 static
3108 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3109 {
3110     PyObject *w = PyInt_FromLong((long)c);
3111     PyObject *x;
3112
3113     if (w == NULL)
3114          return -1;
3115     x = PyObject_GetItem(mapping, w);
3116     Py_DECREF(w);
3117     if (x == NULL) {
3118         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3119             /* No mapping found means: use 1:1 mapping. */
3120             PyErr_Clear();
3121             *result = NULL;
3122             return 0;
3123         } else
3124             return -1;
3125     }
3126     else if (x == Py_None) {
3127         *result = x;
3128         return 0;
3129     }
3130     else if (PyInt_Check(x)) {
3131         long value = PyInt_AS_LONG(x);
3132         long max = PyUnicode_GetMax();
3133         if (value < 0 || value > max) {
3134             PyErr_Format(PyExc_TypeError,
3135                              "character mapping must be in range(0x%lx)", max+1);
3136             Py_DECREF(x);
3137             return -1;
3138         }
3139         *result = x;
3140         return 0;
3141     }
3142     else if (PyUnicode_Check(x)) {
3143         *result = x;
3144         return 0;
3145     }
3146     else {
3147         /* wrong return value */
3148         PyErr_SetString(PyExc_TypeError,
3149               "character mapping must return integer, None or unicode");
3150         return -1;
3151     }
3152 }
3153 /* ensure that *outobj is at least requiredsize characters long,
3154 if not reallocate and adjust various state variables.
3155 Return 0 on success, -1 on error */
3156 static
3157 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3158     int requiredsize)
3159 {
3160     if (requiredsize > *outsize) {
3161         /* remember old output position */
3162         int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3163         /* exponentially overallocate to minimize reallocations */
3164         if (requiredsize < 2 * *outsize)
3165             requiredsize = 2 * *outsize;
3166         if (_PyUnicode_Resize(outobj, requiredsize))
3167             return -1;
3168         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3169         *outsize = requiredsize;
3170     }
3171     return 0;
3172 }
3173 /* lookup the character, put the result in the output string and adjust
3174    various state variables. Return a new reference to the object that
3175    was put in the output buffer in *result, or Py_None, if the mapping was
3176    undefined (in which case no character was written).
3177    The called must decref result.
3178    Return 0 on success, -1 on error. */
3179 static
3180 int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3181     PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3182 {
3183     if (charmaptranslate_lookup(c, mapping, res))
3184         return -1;
3185     if (*res==NULL) {
3186         /* not found => default to 1:1 mapping */
3187         *(*outp)++ = (Py_UNICODE)c;
3188     }
3189     else if (*res==Py_None)
3190         ;
3191     else if (PyInt_Check(*res)) {
3192         /* no overflow check, because we know that the space is enough */
3193         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3194     }
3195     else if (PyUnicode_Check(*res)) {
3196         int repsize = PyUnicode_GET_SIZE(*res);
3197         if (repsize==1) {
3198             /* no overflow check, because we know that the space is enough */
3199             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3200         }
3201         else if (repsize!=0) {
3202             /* more than one character */
3203             int requiredsize = *outsize + repsize - 1;
3204             if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3205                 return -1;
3206             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3207             *outp += repsize;
3208         }
3209     }
3210     else
3211         return -1;
3212     return 0;
3213 }
3214
3215 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
3216                                      int size,
3217                                      PyObject *mapping,
3218                                      const char *errors)
3219 {
3220     /* output object */
3221     PyObject *res = NULL;
3222     /* pointers to the beginning and end+1 of input */
3223     const Py_UNICODE *startp = p;
3224     const Py_UNICODE *endp = p + size;
3225     /* pointer into the output */
3226     Py_UNICODE *str;
3227     /* current output position */
3228     int respos = 0;
3229     int ressize;
3230     char *reason = "character maps to <undefined>";
3231     PyObject *errorHandler = NULL;
3232     PyObject *exc = NULL;
3233     /* the following variable is used for caching string comparisons
3234      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3235      * 3=ignore, 4=xmlcharrefreplace */
3236     int known_errorHandler = -1;
3237
3238     if (mapping == NULL) {
3239         PyErr_BadArgument();
3240         return NULL;
3241     }
3242
3243     /* allocate enough for a simple 1:1 translation without
3244        replacements, if we need more, we'll resize */
3245     res = PyUnicode_FromUnicode(NULL, size);
3246     if (res == NULL)
3247         goto onError;
3248     if (size == 0)
3249         return res;
3250     str = PyUnicode_AS_UNICODE(res);
3251     ressize = size;
3252
3253     while (p<endp) {
3254         /* try to encode it */
3255         PyObject *x = NULL;
3256         if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3257             Py_XDECREF(x);
3258             goto onError;
3259         }
3260         Py_XDECREF(x);
3261         if (x!=Py_None) /* it worked => adjust input pointer */
3262             ++p;
3263         else { /* untranslatable character */
3264             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3265             int repsize;
3266             int newpos;
3267             Py_UNICODE *uni2;
3268             /* startpos for collecting untranslatable chars */
3269             const Py_UNICODE *collstart = p;
3270             const Py_UNICODE *collend = p+1;
3271             const Py_UNICODE *coll;
3272
3273             /* find all untranslatable characters */
3274             while (collend < endp) {
3275                 if (charmaptranslate_lookup(*collend, mapping, &x))
3276                     goto onError;
3277                 Py_XDECREF(x);
3278                 if (x!=Py_None)
3279                     break;
3280                 ++collend;
3281             }
3282             /* cache callback name lookup
3283              * (if not done yet, i.e. it's the first error) */
3284             if (known_errorHandler==-1) {
3285                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3286                     known_errorHandler = 1;
3287                 else if (!strcmp(errors, "replace"))
3288                     known_errorHandler = 2;
3289                 else if (!strcmp(errors, "ignore"))
3290                     known_errorHandler = 3;
3291                 else if (!strcmp(errors, "xmlcharrefreplace"))
3292                     known_errorHandler = 4;
3293                 else
3294                     known_errorHandler = 0;
3295             }
3296             switch (known_errorHandler) {
3297                 case 1: /* strict */
3298                     raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3299                     goto onError;
3300                 case 2: /* replace */
3301                     /* No need to check for space, this is a 1:1 replacement */
3302                     for (coll = collstart; coll<collend; ++coll)
3303                         *str++ = '?';
3304                     /* fall through */
3305                 case 3: /* ignore */
3306                     p = collend;
3307                     break;
3308                 case 4: /* xmlcharrefreplace */
3309                     /* generate replacement (temporarily (mis)uses p) */
3310                     for (p = collstart; p < collend; ++p) {
3311                         char buffer[2+29+1+1];
3312                         char *cp;
3313                         sprintf(buffer, "&#%d;", (int)*p);
3314                         if (charmaptranslate_makespace(&res, &str, &ressize,
3315                             (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3316                             goto onError;
3317                         for (cp = buffer; *cp; ++cp)
3318                             *str++ = *cp;
3319                     }
3320                     p = collend;
3321                     break;
3322                 default:
3323                     repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3324                         reason, startp, size, &exc,
3325                         collstart-startp, collend-startp, &newpos);
3326                     if (repunicode == NULL)
3327                         goto onError;
3328                     /* generate replacement  */
3329                     repsize = PyUnicode_GET_SIZE(repunicode);
3330                     if (charmaptranslate_makespace(&res, &str, &ressize,
3331                         (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3332                         Py_DECREF(repunicode);
3333                         goto onError;
3334                     }
3335                     for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3336                         *str++ = *uni2;
3337                     p = startp + newpos;
3338                     Py_DECREF(repunicode);
3339             }
3340         }
3341     }
3342     /* Resize if we allocated to much */
3343     respos = str-PyUnicode_AS_UNICODE(res);
3344     if (respos<ressize) {
3345         if (_PyUnicode_Resize(&res, respos))
3346             goto onError;
3347     }
3348     Py_XDECREF(exc);
3349     Py_XDECREF(errorHandler);
3350     return res;
3351
3352     onError:
3353     Py_XDECREF(res);
3354     Py_XDECREF(exc);
3355     Py_XDECREF(errorHandler);
3356     return NULL;
3357 }
3358
3359 PyObject *PyUnicode_Translate(PyObject *str,
3360                               PyObject *mapping,
3361                               const char *errors)
3362 {
3363     PyObject *result;
3364
3365     str = PyUnicode_FromObject(str);
3366     if (str == NULL)
3367         goto onError;
3368     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3369                                         PyUnicode_GET_SIZE(str),
3370                                         mapping,
3371                                         errors);
3372     Py_DECREF(str);
3373     return result;
3374
3375  onError:
3376     Py_XDECREF(str);
3377     return NULL;
3378 }
3379
3380 /* --- Decimal Encoder ---------------------------------------------------- */
3381
3382 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3383                             int length,
3384                             char *output,
3385                             const char *errors)
3386 {
3387     Py_UNICODE *p, *end;
3388     PyObject *errorHandler = NULL;
3389     PyObject *exc = NULL;
3390     const char *encoding = "decimal";
3391     const char *reason = "invalid decimal Unicode string";
3392     /* the following variable is used for caching string comparisons
3393      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3394     int known_errorHandler = -1;
3395
3396     if (output == NULL) {
3397         PyErr_BadArgument();
3398         return -1;
3399     }
3400
3401     p = s;
3402     end = s + length;
3403     while (p < end) {
3404         register Py_UNICODE ch = *p;
3405         int decimal;
3406         PyObject *repunicode;
3407         int repsize;
3408         int newpos;
3409         Py_UNICODE *uni2;
3410         Py_UNICODE *collstart;
3411         Py_UNICODE *collend;
3412
3413         if (Py_UNICODE_ISSPACE(ch)) {
3414             *output++ = ' ';
3415             ++p;
3416             continue;
3417         }
3418         decimal = Py_UNICODE_TODECIMAL(ch);
3419         if (decimal >= 0) {
3420             *output++ = '0' + decimal;
3421             ++p;
3422             continue;
3423         }
3424         if (0 < ch && ch < 256) {
3425             *output++ = (char)ch;
3426             ++p;
3427             continue;
3428         }
3429         /* All other characters are considered unencodable */
3430         collstart = p;
3431         collend = p+1;
3432         while (collend < end) {
3433             if ((0 < *collend && *collend < 256) ||
3434                 !Py_UNICODE_ISSPACE(*collend) ||
3435                 Py_UNICODE_TODECIMAL(*collend))
3436                 break;
3437         }
3438         /* cache callback name lookup
3439          * (if not done yet, i.e. it's the first error) */
3440         if (known_errorHandler==-1) {
3441             if ((errors==NULL) || (!strcmp(errors, "strict")))
3442                 known_errorHandler = 1;
3443             else if (!strcmp(errors, "replace"))
3444                 known_errorHandler = 2;
3445             else if (!strcmp(errors, "ignore"))
3446                 known_errorHandler = 3;
3447             else if (!strcmp(errors, "xmlcharrefreplace"))
3448                 known_errorHandler = 4;
3449             else
3450                 known_errorHandler = 0;
3451         }
3452         switch (known_errorHandler) {
3453             case 1: /* strict */
3454                 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3455                 goto onError;
3456             case 2: /* replace */
3457                 for (p = collstart; p < collend; ++p)
3458                     *output++ = '?';
3459                 /* fall through */
3460             case 3: /* ignore */
3461                 p = collend;
3462                 break;
3463             case 4: /* xmlcharrefreplace */
3464                 /* generate replacement (temporarily (mis)uses p) */
3465                 for (p = collstart; p < collend; ++p)
3466                     output += sprintf(output, "&#%d;", (int)*p);
3467                 p = collend;
3468                 break;
3469             default:
3470                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3471                     encoding, reason, s, length, &exc,
3472                     collstart-s, collend-s, &newpos);
3473                 if (repunicode == NULL)
3474                     goto onError;
3475                 /* generate replacement  */
3476                 repsize = PyUnicode_GET_SIZE(repunicode);
3477                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3478                     Py_UNICODE ch = *uni2;
3479                     if (Py_UNICODE_ISSPACE(ch))
3480                         *output++ = ' ';
3481                     else {
3482                         decimal = Py_UNICODE_TODECIMAL(ch);
3483                         if (decimal >= 0)
3484                             *output++ = '0' + decimal;
3485                         else if (0 < ch && ch < 256)
3486                             *output++ = (char)ch;
3487                         else {
3488                             Py_DECREF(repunicode);
3489                             raise_encode_exception(&exc, encoding,
3490                                 s, length, collstart-s, collend-s, reason);
3491                             goto onError;
3492                         }
3493                     }
3494                 }
3495                 p = s + newpos;
3496                 Py_DECREF(repunicode);
3497         }
3498     }
3499     /* 0-terminate the output string */
3500     *output++ = '\0';
3501     Py_XDECREF(exc);
3502     Py_XDECREF(errorHandler);
3503     return 0;
3504
3505  onError:
3506     Py_XDECREF(exc);
3507     Py_XDECREF(errorHandler);
3508     return -1;
3509 }
3510
3511 /* --- Helpers ------------------------------------------------------------ */
3512
3513 static
3514 int count(PyUnicodeObject *self,
3515           int start,
3516           int end,
3517           PyUnicodeObject *substring)
3518 {
3519     int count = 0;
3520
3521     if (start < 0)
3522         start += self->length;
3523     if (start < 0)
3524         start = 0;
3525     if (end > self->length)
3526         end = self->length;
3527     if (end < 0)
3528         end += self->length;
3529     if (end < 0)
3530         end = 0;
3531
3532     if (substring->length == 0)
3533         return (end - start + 1);
3534
3535     end -= substring->length;
3536
3537     while (start <= end)
3538         if (Py_UNICODE_MATCH(self, start, substring)) {
3539             count++;
3540             start += substring->length;
3541         } else
3542             start++;
3543
3544     return count;
3545 }
3546
3547 int PyUnicode_Count(PyObject *str,
3548                     PyObject *substr,
3549                     int start,
3550                     int end)
3551 {
3552     int result;
3553
3554     str = PyUnicode_FromObject(str);
3555     if (str == NULL)
3556         return -1;
3557     substr = PyUnicode_FromObject(substr);
3558     if (substr == NULL) {
3559         Py_DECREF(str);
3560         return -1;
3561     }
3562
3563     result = count((PyUnicodeObject *)str,
3564                    start, end,
3565                    (PyUnicodeObject *)substr);
3566
3567     Py_DECREF(str);
3568     Py_DECREF(substr);
3569     return result;
3570 }
3571
3572 static
3573 int findstring(PyUnicodeObject *self,
3574                PyUnicodeObject *substring,
3575                int start,
3576                int end,
3577                int direction)
3578 {
3579     if (start < 0)
3580         start += self->length;
3581     if (start < 0)
3582         start = 0;
3583
3584     if (end > self->length)
3585         end = self->length;
3586     if (end < 0)
3587         end += self->length;
3588     if (end < 0)
3589         end = 0;
3590
3591     if (substring->length == 0)
3592         return (direction > 0) ? start : end;
3593
3594     end -= substring->length;
3595
3596     if (direction < 0) {
3597         for (; end >= start; end--)
3598             if (Py_UNICODE_MATCH(self, end, substring))
3599                 return end;
3600     } else {
3601         for (; start <= end; start++)
3602             if (Py_UNICODE_MATCH(self, start, substring))
3603                 return start;
3604     }
3605
3606     return -1;
3607 }
3608
3609 int PyUnicode_Find(PyObject *str,
3610                    PyObject *substr,
3611                    int start,
3612                    int end,
3613                    int direction)
3614 {
3615     int result;
3616
3617     str = PyUnicode_FromObject(str);
3618     if (str == NULL)
3619         return -2;
3620     substr = PyUnicode_FromObject(substr);
3621     if (substr == NULL) {
3622         Py_DECREF(str);
3623         return -2;
3624     }
3625
3626     result = findstring((PyUnicodeObject *)str,
3627                         (PyUnicodeObject *)substr,
3628                         start, end, direction);
3629     Py_DECREF(str);
3630     Py_DECREF(substr);
3631     return result;
3632 }
3633
3634 static
3635 int tailmatch(PyUnicodeObject *self,
3636               PyUnicodeObject *substring,
3637               int start,
3638               int end,
3639               int direction)
3640 {
3641     if (start < 0)
3642         start += self->length;
3643     if (start < 0)
3644         start = 0;
3645
3646     if (substring->length == 0)
3647         return 1;
3648
3649     if (end > self->length)
3650         end = self->length;
3651     if (end < 0)
3652         end += self->length;
3653     if (end < 0)
3654         end = 0;
3655
3656     end -= substring->length;
3657     if (end < start)
3658         return 0;
3659
3660     if (direction > 0) {
3661         if (Py_UNICODE_MATCH(self, end, substring))
3662             return 1;
3663     } else {
3664         if (Py_UNICODE_MATCH(self, start, substring))
3665             return 1;
3666     }
3667
3668     return 0;
3669 }
3670
3671 int PyUnicode_Tailmatch(PyObject *str,
3672                         PyObject *substr,
3673                         int start,
3674                         int end,
3675                         int direction)
3676 {
3677     int result;
3678
3679     str = PyUnicode_FromObject(str);
3680     if (str == NULL)
3681         return -1;
3682     substr = PyUnicode_FromObject(substr);
3683     if (substr == NULL) {
3684         Py_DECREF(substr);
3685         return -1;
3686     }
3687
3688     result = tailmatch((PyUnicodeObject *)str,
3689                        (PyUnicodeObject *)substr,
3690                        start, end, direction);
3691     Py_DECREF(str);
3692     Py_DECREF(substr);
3693     return result;
3694 }
3695
3696 static
3697 const Py_UNICODE *findchar(const Py_UNICODE *s,
3698                      int size,
3699                      Py_UNICODE ch)
3700 {
3701     /* like wcschr, but doesn't stop at NULL characters */
3702
3703     while (size-- > 0) {
3704         if (*s == ch)
3705             return s;
3706         s++;
3707     }
3708
3709     return NULL;
3710 }
3711
3712 /* Apply fixfct filter to the Unicode object self and return a
3713    reference to the modified object */
3714
3715 static
3716 PyObject *fixup(PyUnicodeObject *self,
3717                 int (*fixfct)(PyUnicodeObject *s))
3718 {
3719
3720     PyUnicodeObject *u;
3721
3722     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3723     if (u == NULL)
3724         return NULL;
3725
3726     Py_UNICODE_COPY(u->str, self->str, self->length);
3727
3728     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3729         /* fixfct should return TRUE if it modified the buffer. If
3730            FALSE, return a reference to the original buffer instead
3731            (to save space, not time) */
3732         Py_INCREF(self);
3733         Py_DECREF(u);
3734         return (PyObject*) self;
3735     }
3736     return (PyObject*) u;
3737 }
3738
3739 static
3740 int fixupper(PyUnicodeObject *self)
3741 {
3742     int len = self->length;
3743     Py_UNICODE *s = self->str;
3744     int status = 0;
3745
3746     while (len-- > 0) {
3747         register Py_UNICODE ch;
3748
3749         ch = Py_UNICODE_TOUPPER(*s);
3750         if (ch != *s) {
3751             status = 1;
3752             *s = ch;
3753         }
3754         s++;
3755     }
3756
3757     return status;
3758 }
3759
3760 static
3761 int fixlower(PyUnicodeObject *self)
3762 {
3763     int len = self->length;
3764     Py_UNICODE *s = self->str;
3765     int status = 0;
3766
3767     while (len-- > 0) {
3768         register Py_UNICODE ch;
3769
3770         ch = Py_UNICODE_TOLOWER(*s);
3771         if (ch != *s) {
3772             status = 1;
3773             *s = ch;
3774         }
3775         s++;
3776     }
3777
3778     return status;
3779 }
3780
3781 static
3782 int fixswapcase(PyUnicodeObject *self)
3783 {
3784     int len = self->length;
3785     Py_UNICODE *s = self->str;
3786     int status = 0;
3787
3788     while (len-- > 0) {
3789         if (Py_UNICODE_ISUPPER(*s)) {
3790             *s = Py_UNICODE_TOLOWER(*s);
3791             status = 1;
3792         } else if (Py_UNICODE_ISLOWER(*s)) {
3793             *s = Py_UNICODE_TOUPPER(*s);
3794             status = 1;
3795         }
3796         s++;
3797     }
3798
3799     return status;
3800 }
3801
3802 static
3803 int fixcapitalize(PyUnicodeObject *self)
3804 {
3805     int len = self->length;
3806     Py_UNICODE *s = self->str;
3807     int status = 0;
3808
3809     if (len == 0)
3810         return 0;
3811     if (Py_UNICODE_ISLOWER(*s)) {
3812         *s = Py_UNICODE_TOUPPER(*s);
3813         status = 1;
3814     }
3815     s++;
3816     while (--len > 0) {
3817         if (Py_UNICODE_ISUPPER(*s)) {
3818             *s = Py_UNICODE_TOLOWER(*s);
3819             status = 1;
3820         }
3821         s++;
3822     }
3823     return status;
3824 }
3825
3826 static
3827 int fixtitle(PyUnicodeObject *self)
3828 {
3829     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3830     register Py_UNICODE *e;
3831     int previous_is_cased;
3832
3833     /* Shortcut for single character strings */
3834     if (PyUnicode_GET_SIZE(self) == 1) {
3835         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3836         if (*p != ch) {
3837             *p = ch;
3838             return 1;
3839         }
3840         else
3841             return 0;
3842     }
3843
3844     e = p + PyUnicode_GET_SIZE(self);
3845     previous_is_cased = 0;
3846     for (; p < e; p++) {
3847         register const Py_UNICODE ch = *p;
3848
3849         if (previous_is_cased)
3850             *p = Py_UNICODE_TOLOWER(ch);
3851         else
3852             *p = Py_UNICODE_TOTITLE(ch);
3853
3854         if (Py_UNICODE_ISLOWER(ch) ||
3855             Py_UNICODE_ISUPPER(ch) ||
3856             Py_UNICODE_ISTITLE(ch))
3857             previous_is_cased = 1;
3858         else
3859             previous_is_cased = 0;
3860     }
3861     return 1;
3862 }
3863
3864 PyObject *PyUnicode_Join(PyObject *separator,
3865                          PyObject *seq)
3866 {
3867     Py_UNICODE *sep;
3868     int seplen;
3869     PyUnicodeObject *res = NULL;
3870     int reslen = 0;
3871     Py_UNICODE *p;
3872     int sz = 100;
3873     int i;
3874     PyObject *it;
3875
3876     it = PyObject_GetIter(seq);
3877     if (it == NULL)
3878         return NULL;
3879
3880     if (separator == NULL) {
3881         Py_UNICODE blank = ' ';
3882         sep = &blank;
3883         seplen = 1;
3884     }
3885     else {
3886         separator = PyUnicode_FromObject(separator);
3887         if (separator == NULL)
3888             goto onError;
3889         sep = PyUnicode_AS_UNICODE(separator);
3890         seplen = PyUnicode_GET_SIZE(separator);
3891     }
3892
3893     res = _PyUnicode_New(sz);
3894     if (res == NULL)
3895         goto onError;
3896     p = PyUnicode_AS_UNICODE(res);
3897     reslen = 0;
3898
3899     for (i = 0; ; ++i) {
3900         int itemlen;
3901         PyObject *item = PyIter_Next(it);
3902         if (item == NULL) {
3903             if (PyErr_Occurred())
3904                 goto onError;
3905             break;
3906         }
3907         if (!PyUnicode_Check(item)) {
3908             PyObject *v;
3909             if (!PyString_Check(item)) {
3910                 PyErr_Format(PyExc_TypeError,
3911                              "sequence item %i: expected string or Unicode,"
3912                              " %.80s found",
3913                              i, item->ob_type->tp_name);
3914                 Py_DECREF(item);
3915                 goto onError;
3916             }
3917             v = PyUnicode_FromObject(item);
3918             Py_DECREF(item);
3919             item = v;
3920             if (item == NULL)
3921                 goto onError;
3922         }
3923         itemlen = PyUnicode_GET_SIZE(item);
3924         while (reslen + itemlen + seplen >= sz) {
3925             if (_PyUnicode_Resize(&res, sz*2)) {
3926                 Py_DECREF(item);
3927                 goto onError;
3928             }
3929             sz *= 2;
3930             p = PyUnicode_AS_UNICODE(res) + reslen;
3931         }
3932         if (i > 0) {
3933             Py_UNICODE_COPY(p, sep, seplen);
3934             p += seplen;
3935             reslen += seplen;
3936         }
3937         Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3938         p += itemlen;
3939         reslen += itemlen;
3940         Py_DECREF(item);
3941     }
3942     if (_PyUnicode_Resize(&res, reslen))
3943         goto onError;
3944
3945     Py_XDECREF(separator);
3946     Py_DECREF(it);
3947     return (PyObject *)res;
3948
3949  onError:
3950     Py_XDECREF(separator);
3951     Py_XDECREF(res);
3952     Py_DECREF(it);
3953     return NULL;
3954 }
3955
3956 static
3957 PyUnicodeObject *pad(PyUnicodeObject *self,
3958                      int left,
3959                      int right,
3960                      Py_UNICODE fill)
3961 {
3962     PyUnicodeObject *u;
3963
3964     if (left < 0)
3965         left = 0;
3966     if (right < 0)
3967         right = 0;
3968
3969     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3970         Py_INCREF(self);
3971         return self;
3972     }
3973
3974     u = _PyUnicode_New(left + self->length + right);
3975     if (u) {
3976         if (left)
3977             Py_UNICODE_FILL(u->str, fill, left);
3978         Py_UNICODE_COPY(u->str + left, self->str, self->length);
3979         if (right)
3980             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3981     }
3982
3983     return u;
3984 }
3985
/* Append the slice data[left:right] as a new Unicode object to list.

   Relies on the expanding function to provide a PyObject *str scratch
   variable, a PyObject *list and an onError label, which is jumped to
   if creating or appending the slice fails.  The body is wrapped in
   do { } while (0) so that the macro acts as a single statement, and
   the arguments are parenthesized so that expressions can be passed
   safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3996
3997 static
3998 PyObject *split_whitespace(PyUnicodeObject *self,
3999                            PyObject *list,
4000                            int maxcount)
4001 {
4002     register int i;
4003     register int j;
4004     int len = self->length;
4005     PyObject *str;
4006
4007     for (i = j = 0; i < len; ) {
4008         /* find a token */
4009         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4010             i++;
4011         j = i;
4012         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4013             i++;
4014         if (j < i) {
4015             if (maxcount-- <= 0)
4016                 break;
4017             SPLIT_APPEND(self->str, j, i);
4018             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4019                 i++;
4020             j = i;
4021         }
4022     }
4023     if (j < len) {
4024         SPLIT_APPEND(self->str, j, len);
4025     }
4026     return list;
4027
4028  onError:
4029     Py_DECREF(list);
4030     return NULL;
4031 }
4032
4033 PyObject *PyUnicode_Splitlines(PyObject *string,
4034                                int keepends)
4035 {
4036     register int i;
4037     register int j;
4038     int len;
4039     PyObject *list;
4040     PyObject *str;
4041     Py_UNICODE *data;
4042
4043     string = PyUnicode_FromObject(string);
4044     if (string == NULL)
4045         return NULL;
4046     data = PyUnicode_AS_UNICODE(string);
4047     len = PyUnicode_GET_SIZE(string);
4048
4049     list = PyList_New(0);
4050     if (!list)
4051         goto onError;
4052
4053     for (i = j = 0; i < len; ) {
4054         int eol;
4055
4056         /* Find a line and append it */
4057         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4058             i++;
4059
4060         /* Skip the line break reading CRLF as one line break */
4061         eol = i;
4062         if (i < len) {
4063             if (data[i] == '\r' && i + 1 < len &&
4064                 data[i+1] == '\n')
4065                 i += 2;
4066             else
4067                 i++;
4068             if (keepends)
4069                 eol = i;
4070         }
4071         SPLIT_APPEND(data, j, eol);
4072         j = i;
4073     }
4074     if (j < len) {
4075         SPLIT_APPEND(data, j, len);
4076     }
4077
4078     Py_DECREF(string);
4079     return list;
4080
4081  onError:
4082     Py_DECREF(list);
4083     Py_DECREF(string);
4084     return NULL;
4085 }
4086
4087 static
4088 PyObject *split_char(PyUnicodeObject *self,
4089                      PyObject *list,
4090                      Py_UNICODE ch,
4091                      int maxcount)
4092 {
4093     register int i;
4094     register int j;
4095     int len = self->length;
4096     PyObject *str;
4097
4098     for (i = j = 0; i < len; ) {
4099         if (self->str[i] == ch) {
4100             if (maxcount-- <= 0)
4101                 break;
4102             SPLIT_APPEND(self->str, j, i);
4103             i = j = i + 1;
4104         } else
4105             i++;
4106     }
4107     if (j <= len) {
4108         SPLIT_APPEND(self->str, j, len);
4109     }
4110     return list;
4111
4112  onError:
4113     Py_DECREF(list);
4114     return NULL;
4115 }
4116
4117 static
4118 PyObject *split_substring(PyUnicodeObject *self,
4119                           PyObject *list,
4120                           PyUnicodeObject *substring,
4121                           int maxcount)
4122 {
4123     register int i;
4124     register int j;
4125     int len = self->length;
4126     int sublen = substring->length;
4127     PyObject *str;
4128
4129     for (i = j = 0; i <= len - sublen; ) {
4130         if (Py_UNICODE_MATCH(self, i, substring)) {
4131             if (maxcount-- <= 0)
4132                 break;
4133             SPLIT_APPEND(self->str, j, i);
4134             i = j = i + sublen;
4135         } else
4136             i++;
4137     }
4138     if (j <= len) {
4139         SPLIT_APPEND(self->str, j, len);
4140     }
4141     return list;
4142
4143  onError:
4144     Py_DECREF(list);
4145     return NULL;
4146 }
4147
4148 #undef SPLIT_APPEND
4149
4150 static
4151 PyObject *split(PyUnicodeObject *self,
4152                 PyUnicodeObject *substring,
4153                 int maxcount)
4154 {
4155     PyObject *list;
4156
4157     if (maxcount < 0)
4158         maxcount = INT_MAX;
4159
4160     list = PyList_New(0);
4161     if (!list)
4162         return NULL;
4163
4164     if (substring == NULL)
4165         return split_whitespace(self,list,maxcount);
4166
4167     else if (substring->length == 1)
4168         return split_char(self,list,substring->str[0],maxcount);
4169
4170     else if (substring->length == 0) {
4171         Py_DECREF(list);
4172         PyErr_SetString(PyExc_ValueError, "empty separator");
4173         return NULL;
4174     }
4175     else
4176         return split_substring(self,list,substring,maxcount);
4177 }
4178
4179 static
4180 PyObject *replace(PyUnicodeObject *self,
4181                   PyUnicodeObject *str1,
4182                   PyUnicodeObject *str2,
4183                   int maxcount)
4184 {
4185     PyUnicodeObject *u;
4186
4187     if (maxcount < 0)
4188         maxcount = INT_MAX;
4189
4190     if (str1->length == 1 && str2->length == 1) {
4191         int i;
4192
4193         /* replace characters */
4194         if (!findchar(self->str, self->length, str1->str[0]) &&
4195             PyUnicode_CheckExact(self)) {
4196             /* nothing to replace, return original string */
4197             Py_INCREF(self);
4198             u = self;
4199         } else {
4200             Py_UNICODE u1 = str1->str[0];
4201             Py_UNICODE u2 = str2->str[0];
4202
4203             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
4204                 NULL,
4205                 self->length
4206                 );
4207             if (u != NULL) {
4208                 Py_UNICODE_COPY(u->str, self->str,
4209                                 self->length);
4210                 for (i = 0; i < u->length; i++)
4211                     if (u->str[i] == u1) {
4212                         if (--maxcount < 0)
4213                             break;
4214                         u->str[i] = u2;
4215                     }
4216         }
4217         }
4218
4219     } else {
4220         int n, i;
4221         Py_UNICODE *p;
4222
4223         /* replace strings */
4224         n = count(self, 0, self->length, str1);
4225         if (n > maxcount)
4226             n = maxcount;
4227         if (n == 0) {
4228             /* nothing to replace, return original string */
4229             if (PyUnicode_CheckExact(self)) {
4230                 Py_INCREF(self);
4231                 u = self;
4232             }
4233             else {
4234                 u = (PyUnicodeObject *)
4235                     PyUnicode_FromUnicode(self->str, self->length);
4236             }
4237         } else {
4238             u = _PyUnicode_New(
4239                 self->length + n * (str2->length - str1->length));
4240             if (u) {
4241                 i = 0;
4242                 p = u->str;
4243                 if (str1->length > 0) {
4244                     while (i <= self->length - str1->length)
4245                         if (Py_UNICODE_MATCH(self, i, str1)) {
4246                             /* replace string segment */
4247                             Py_UNICODE_COPY(p, str2->str, str2->length);
4248                             p += str2->length;
4249                             i += str1->length;
4250                             if (--n <= 0) {
4251                                 /* copy remaining part */
4252                                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4253                                 break;
4254                             }
4255                         } else
4256                             *p++ = self->str[i++];
4257                 } else {
4258                     while (n > 0) {
4259                         Py_UNICODE_COPY(p, str2->str, str2->length);
4260                         p += str2->length;
4261                         if (--n <= 0)
4262                             break;
4263                         *p++ = self->str[i++];
4264                     }
4265                     Py_UNICODE_COPY(p, self->str+i, self->length-i);
4266                 }
4267             }
4268         }
4269     }
4270
4271     return (PyObject *) u;
4272 }
4273
4274 /* --- Unicode Object Methods --------------------------------------------- */
4275
4276 PyDoc_STRVAR(title__doc__,
4277 "S.title() -> unicode\n\
4278 \n\
4279 Return a titlecased version of S, i.e. words start with title case\n\
4280 characters, all remaining cased characters have lower case.");
4281
4282 static PyObject*
4283 unicode_title(PyUnicodeObject *self)
4284 {
4285     return fixup(self, fixtitle);
4286 }
4287
4288 PyDoc_STRVAR(capitalize__doc__,
4289 "S.capitalize() -> unicode\n\
4290 \n\
4291 Return a capitalized version of S, i.e. make the first character\n\
4292 have upper case.");
4293
4294 static PyObject*
4295 unicode_capitalize(PyUnicodeObject *self)
4296 {
4297     return fixup(self, fixcapitalize);
4298 }
4299
4300 #if 0
4301 PyDoc_STRVAR(capwords__doc__,
4302 "S.capwords() -> unicode\n\
4303 \n\
4304 Apply .capitalize() to all words in S and return the result with\n\
4305 normalized whitespace (all whitespace strings are replaced by ' ').");
4306
4307 static PyObject*
4308 unicode_capwords(PyUnicodeObject *self)
4309 {
4310     PyObject *list;
4311     PyObject *item;
4312     int i;
4313
4314     /* Split into words */
4315     list = split(self, NULL, -1);
4316     if (!list)
4317         return NULL;
4318
4319     /* Capitalize each word */
4320     for (i = 0; i < PyList_GET_SIZE(list); i++) {
4321         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
4322                      fixcapitalize);
4323         if (item == NULL)
4324             goto onError;
4325         Py_DECREF(PyList_GET_ITEM(list, i));
4326         PyList_SET_ITEM(list, i, item);
4327     }
4328
4329     /* Join the words to form a new string */
4330     item = PyUnicode_Join(NULL, list);
4331
4332 onError:
4333     Py_DECREF(list);
4334     return (PyObject *)item;
4335 }
4336 #endif
4337
4338 PyDoc_STRVAR(center__doc__,
4339 "S.center(width) -> unicode\n\
4340 \n\
4341 Return S centered in a Unicode string of length width. Padding is done\n\
4342 using spaces.");
4343
4344 static PyObject *
4345 unicode_center(PyUnicodeObject *self, PyObject *args)
4346 {
4347     int marg, left;
4348     int width;
4349
4350     if (!PyArg_ParseTuple(args, "i:center", &width))
4351         return NULL;
4352
4353     if (self->length >= width && PyUnicode_CheckExact(self)) {
4354         Py_INCREF(self);
4355         return (PyObject*) self;
4356     }
4357
4358     marg = width - self->length;
4359     left = marg / 2 + (marg & width & 1);
4360
4361     return (PyObject*) pad(self, left, marg - left, ' ');
4362 }
4363
4364 #if 0
4365
4366 /* This code should go into some future Unicode collation support
4367    module. The basic comparison should compare ordinals on a naive
4368    basis (this is what Java does and thus JPython too). */
4369
4370 /* speedy UTF-16 code point order comparison */
4371 /* gleaned from: */
4372 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4373
4374 static short utf16Fixup[32] =
4375 {
4376     0, 0, 0, 0, 0, 0, 0, 0,
4377     0, 0, 0, 0, 0, 0, 0, 0,
4378     0, 0, 0, 0, 0, 0, 0, 0,
4379     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
4380 };
4381
4382 static int
4383 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4384 {
4385     int len1, len2;
4386
4387     Py_UNICODE *s1 = str1->str;
4388     Py_UNICODE *s2 = str2->str;
4389
4390     len1 = str1->length;
4391     len2 = str2->length;
4392
4393     while (len1 > 0 && len2 > 0) {
4394         Py_UNICODE c1, c2;
4395
4396         c1 = *s1++;
4397         c2 = *s2++;
4398
4399         if (c1 > (1<<11) * 26)
4400             c1 += utf16Fixup[c1>>11];
4401         if (c2 > (1<<11) * 26)
4402             c2 += utf16Fixup[c2>>11];
4403         /* now c1 and c2 are in UTF-32-compatible order */
4404
4405         if (c1 != c2)
4406             return (c1 < c2) ? -1 : 1;
4407
4408         len1--; len2--;
4409     }
4410
4411     return (len1 < len2) ? -1 : (len1 != len2);
4412 }
4413
4414 #else
4415
4416 static int
4417 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4418 {
4419     register int len1, len2;
4420
4421     Py_UNICODE *s1 = str1->str;
4422     Py_UNICODE *s2 = str2->str;
4423
4424     len1 = str1->length;
4425     len2 = str2->length;
4426
4427     while (len1 > 0 && len2 > 0) {
4428         Py_UNICODE c1, c2;
4429
4430         c1 = *s1++;
4431         c2 = *s2++;
4432
4433         if (c1 != c2)
4434             return (c1 < c2) ? -1 : 1;
4435
4436         len1--; len2--;
4437     }
4438
4439     return (len1 < len2) ? -1 : (len1 != len2);
4440 }
4441
4442 #endif
4443
4444 int PyUnicode_Compare(PyObject *left,
4445                       PyObject *right)
4446 {
4447     PyUnicodeObject *u = NULL, *v = NULL;
4448     int result;
4449
4450     /* Coerce the two arguments */
4451     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4452     if (u == NULL)
4453         goto onError;
4454     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4455     if (v == NULL)
4456         goto onError;
4457
4458     /* Shortcut for empty or interned objects */
4459     if (v == u) {
4460         Py_DECREF(u);
4461         Py_DECREF(v);
4462         return 0;
4463     }
4464
4465     result = unicode_compare(u, v);
4466
4467     Py_DECREF(u);
4468     Py_DECREF(v);
4469     return result;
4470
4471 onError:
4472     Py_XDECREF(u);
4473     Py_XDECREF(v);
4474     return -1;
4475 }
4476
4477 int PyUnicode_Contains(PyObject *container,
4478                        PyObject *element)
4479 {
4480     PyUnicodeObject *u = NULL, *v = NULL;
4481     int result, size;
4482     register const Py_UNICODE *lhs, *end, *rhs;
4483
4484     /* Coerce the two arguments */
4485     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
4486     if (v == NULL) {
4487         PyErr_SetString(PyExc_TypeError,
4488             "'in <string>' requires string as left operand");
4489         goto onError;
4490     }
4491     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
4492     if (u == NULL)
4493         goto onError;
4494
4495     size = PyUnicode_GET_SIZE(v);
4496     rhs = PyUnicode_AS_UNICODE(v);
4497     lhs = PyUnicode_AS_UNICODE(u);
4498
4499     result = 0;
4500     if (size == 1) {
4501         end = lhs + PyUnicode_GET_SIZE(u);
4502         while (lhs < end) {
4503             if (*lhs++ == *rhs) {
4504                 result = 1;
4505                 break;
4506             }
4507         }
4508     }
4509     else {
4510         end = lhs + (PyUnicode_GET_SIZE(u) - size);
4511         while (lhs <= end) {
4512             if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
4513                 result = 1;
4514                 break;
4515             }
4516         }
4517     }
4518
4519     Py_DECREF(u);
4520     Py_DECREF(v);
4521     return result;
4522
4523 onError:
4524     Py_XDECREF(u);
4525     Py_XDECREF(v);
4526     return -1;
4527 }
4528
4529 /* Concat to string or Unicode object giving a new Unicode object. */
4530
4531 PyObject *PyUnicode_Concat(PyObject *left,
4532                            PyObject *right)
4533 {
4534     PyUnicodeObject *u = NULL, *v = NULL, *w;
4535
4536     /* Coerce the two arguments */
4537     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4538     if (u == NULL)
4539         goto onError;
4540     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4541     if (v == NULL)
4542         goto onError;
4543
4544     /* Shortcuts */
4545     if (v == unicode_empty) {
4546         Py_DECREF(v);
4547         return (PyObject *)u;
4548     }
4549     if (u == unicode_empty) {
4550         Py_DECREF(u);
4551         return (PyObject *)v;
4552     }
4553
4554     /* Concat the two Unicode strings */
4555     w = _PyUnicode_New(u->length + v->length);
4556     if (w == NULL)
4557         goto onError;
4558     Py_UNICODE_COPY(w->str, u->str, u->length);
4559     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4560
4561     Py_DECREF(u);
4562     Py_DECREF(v);
4563     return (PyObject *)w;
4564
4565 onError:
4566     Py_XDECREF(u);
4567     Py_XDECREF(v);
4568     return NULL;
4569 }
4570
4571 PyDoc_STRVAR(count__doc__,
4572 "S.count(sub[, start[, end]]) -> int\n\
4573 \n\
4574 Return the number of occurrences of substring sub in Unicode string\n\
4575 S[start:end].  Optional arguments start and end are\n\
4576 interpreted as in slice notation.");
4577
4578 static PyObject *
4579 unicode_count(PyUnicodeObject *self, PyObject *args)
4580 {
4581     PyUnicodeObject *substring;
4582     int start = 0;
4583     int end = INT_MAX;
4584     PyObject *result;
4585
4586     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4587                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4588         return NULL;
4589
4590     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4591                                                 (PyObject *)substring);
4592     if (substring == NULL)
4593         return NULL;
4594
4595     if (start < 0)
4596         start += self->length;
4597     if (start < 0)
4598         start = 0;
4599     if (end > self->length)
4600         end = self->length;
4601     if (end < 0)
4602         end += self->length;
4603     if (end < 0)
4604         end = 0;
4605
4606     result = PyInt_FromLong((long) count(self, start, end, substring));
4607
4608     Py_DECREF(substring);
4609     return result;
4610 }
4611
4612 PyDoc_STRVAR(encode__doc__,
4613 "S.encode([encoding[,errors]]) -> string\n\
4614 \n\
4615 Return an encoded string version of S. Default encoding is the current\n\
4616 default string encoding. errors may be given to set a different error\n\
4617 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4618 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4619 'xmlcharrefreplace' as well as any other name registered with\n\
4620 codecs.register_error that can handle UnicodeEncodeErrors.");
4621
4622 static PyObject *
4623 unicode_encode(PyUnicodeObject *self, PyObject *args)
4624 {
4625     char *encoding = NULL;
4626     char *errors = NULL;
4627     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4628         return NULL;
4629     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4630 }
4631
4632 PyDoc_STRVAR(expandtabs__doc__,
4633 "S.expandtabs([tabsize]) -> unicode\n\
4634 \n\
4635 Return a copy of S where all tab characters are expanded using spaces.\n\
4636 If tabsize is not given, a tab size of 8 characters is assumed.");
4637
4638 static PyObject*
4639 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4640 {
4641     Py_UNICODE *e;
4642     Py_UNICODE *p;
4643     Py_UNICODE *q;
4644     int i, j;
4645     PyUnicodeObject *u;
4646     int tabsize = 8;
4647
4648     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4649         return NULL;
4650
4651     /* First pass: determine size of output string */
4652     i = j = 0;
4653     e = self->str + self->length;
4654     for (p = self->str; p < e; p++)
4655         if (*p == '\t') {
4656             if (tabsize > 0)
4657                 j += tabsize - (j % tabsize);
4658         }
4659         else {
4660             j++;
4661             if (*p == '\n' || *p == '\r') {
4662                 i += j;
4663                 j = 0;
4664             }
4665         }
4666
4667     /* Second pass: create output string and fill it */
4668     u = _PyUnicode_New(i + j);
4669     if (!u)
4670         return NULL;
4671
4672     j = 0;
4673     q = u->str;
4674
4675     for (p = self->str; p < e; p++)
4676         if (*p == '\t') {
4677             if (tabsize > 0) {
4678                 i = tabsize - (j % tabsize);
4679                 j += i;
4680                 while (i--)
4681                     *q++ = ' ';
4682             }
4683         }
4684         else {
4685             j++;
4686             *q++ = *p;
4687             if (*p == '\n' || *p == '\r')
4688                 j = 0;
4689         }
4690
4691     return (PyObject*) u;
4692 }
4693
4694 PyDoc_STRVAR(find__doc__,
4695 "S.find(sub [,start [,end]]) -> int\n\
4696 \n\
4697 Return the lowest index in S where substring sub is found,\n\
4698 such that sub is contained within s[start,end].  Optional\n\
4699 arguments start and end are interpreted as in slice notation.\n\
4700 \n\
4701 Return -1 on failure.");
4702
4703 static PyObject *
4704 unicode_find(PyUnicodeObject *self, PyObject *args)
4705 {
4706     PyUnicodeObject *substring;
4707     int start = 0;
4708     int end = INT_MAX;
4709     PyObject *result;
4710
4711     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4712                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4713         return NULL;
4714     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4715                                                 (PyObject *)substring);
4716     if (substring == NULL)
4717         return NULL;
4718
4719     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4720
4721     Py_DECREF(substring);
4722     return result;
4723 }
4724
4725 static PyObject *
4726 unicode_getitem(PyUnicodeObject *self, int index)
4727 {
4728     if (index < 0 || index >= self->length) {
4729         PyErr_SetString(PyExc_IndexError, "string index out of range");
4730         return NULL;
4731     }
4732
4733     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4734 }
4735
4736 static long
4737 unicode_hash(PyUnicodeObject *self)
4738 {
4739     /* Since Unicode objects compare equal to their ASCII string
4740        counterparts, they should use the individual character values
4741        as basis for their hash value.  This is needed to assure that
4742        strings and Unicode objects behave in the same way as
4743        dictionary keys. */
4744
4745     register int len;
4746     register Py_UNICODE *p;
4747     register long x;
4748
4749     if (self->hash != -1)
4750         return self->hash;
4751     len = PyUnicode_GET_SIZE(self);
4752     p = PyUnicode_AS_UNICODE(self);
4753     x = *p << 7;
4754     while (--len >= 0)
4755         x = (1000003*x) ^ *p++;
4756     x ^= PyUnicode_GET_SIZE(self);
4757     if (x == -1)
4758         x = -2;
4759     self->hash = x;
4760     return x;
4761 }
4762
4763 PyDoc_STRVAR(index__doc__,
4764 "S.index(sub [,start [,end]]) -> int\n\
4765 \n\
4766 Like S.find() but raise ValueError when the substring is not found.");
4767
4768 static PyObject *
4769 unicode_index(PyUnicodeObject *self, PyObject *args)
4770 {
4771     int result;
4772     PyUnicodeObject *substring;
4773     int start = 0;
4774     int end = INT_MAX;
4775
4776     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4777                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4778         return NULL;
4779
4780     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4781                                                 (PyObject *)substring);
4782     if (substring == NULL)
4783         return NULL;
4784
4785     result = findstring(self, substring, start, end, 1);
4786
4787     Py_DECREF(substring);
4788     if (result < 0) {
4789         PyErr_SetString(PyExc_ValueError, "substring not found");
4790         return NULL;
4791     }
4792     return PyInt_FromLong(result);
4793 }
4794
4795 PyDoc_STRVAR(islower__doc__,
4796 "S.islower() -> bool\n\
4797 \n\
4798 Return True if all cased characters in S are lowercase and there is\n\
4799 at least one cased character in S, False otherwise.");
4800
4801 static PyObject*
4802 unicode_islower(PyUnicodeObject *self)
4803 {
4804     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4805     register const Py_UNICODE *e;
4806     int cased;
4807
4808     /* Shortcut for single character strings */
4809     if (PyUnicode_GET_SIZE(self) == 1)
4810         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
4811
4812     /* Special case for empty strings */
4813     if (PyString_GET_SIZE(self) == 0)
4814         return PyBool_FromLong(0);
4815
4816     e = p + PyUnicode_GET_SIZE(self);
4817     cased = 0;
4818     for (; p < e; p++) {
4819         register const Py_UNICODE ch = *p;
4820
4821         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4822             return PyBool_FromLong(0);
4823         else if (!cased && Py_UNICODE_ISLOWER(ch))
4824             cased = 1;
4825     }
4826     return PyBool_FromLong(cased);
4827 }
4828
4829 PyDoc_STRVAR(isupper__doc__,
4830 "S.isupper() -> bool\n\
4831 \n\
4832 Return True if  all cased characters in S are uppercase and there is\n\
4833 at least one cased character in S, False otherwise.");
4834
4835 static PyObject*
4836 unicode_isupper(PyUnicodeObject *self)
4837 {
4838     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4839     register const Py_UNICODE *e;
4840     int cased;
4841
4842     /* Shortcut for single character strings */
4843     if (PyUnicode_GET_SIZE(self) == 1)
4844         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4845
4846     /* Special case for empty strings */
4847     if (PyString_GET_SIZE(self) == 0)
4848         return PyBool_FromLong(0);
4849
4850     e = p + PyUnicode_GET_SIZE(self);
4851     cased = 0;
4852     for (; p < e; p++) {
4853         register const Py_UNICODE ch = *p;
4854
4855         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4856             return PyBool_FromLong(0);
4857         else if (!cased && Py_UNICODE_ISUPPER(ch))
4858             cased = 1;
4859     }
4860     return PyBool_FromLong(cased);
4861 }
4862
4863 PyDoc_STRVAR(istitle__doc__,
4864 "S.istitle() -> bool\n\
4865 \n\
4866 Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4867 characters may only follow uncased characters and lowercase characters\n\
4868 only cased ones. Return False otherwise.");
4869
4870 static PyObject*
4871 unicode_istitle(PyUnicodeObject *self)
4872 {
4873     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4874     register const Py_UNICODE *e;
4875     int cased, previous_is_cased;
4876
4877     /* Shortcut for single character strings */
4878     if (PyUnicode_GET_SIZE(self) == 1)
4879         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4880                                (Py_UNICODE_ISUPPER(*p) != 0));
4881
4882     /* Special case for empty strings */
4883     if (PyString_GET_SIZE(self) == 0)
4884         return PyBool_FromLong(0);
4885
4886     e = p + PyUnicode_GET_SIZE(self);
4887     cased = 0;
4888     previous_is_cased = 0;
4889     for (; p < e; p++) {
4890         register const Py_UNICODE ch = *p;
4891
4892         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4893             if (previous_is_cased)
4894                 return PyBool_FromLong(0);
4895             previous_is_cased = 1;
4896             cased = 1;
4897         }
4898         else if (Py_UNICODE_ISLOWER(ch)) {
4899             if (!previous_is_cased)
4900                 return PyBool_FromLong(0);
4901             previous_is_cased = 1;
4902             cased = 1;
4903         }
4904         else
4905             previous_is_cased = 0;
4906     }
4907     return PyBool_FromLong(cased);
4908 }
4909
4910 PyDoc_STRVAR(isspace__doc__,
4911 "S.isspace() -> bool\n\
4912 \n\
4913 Return True if there are only whitespace characters in S,\n\
4914 False otherwise.");
4915
4916 static PyObject*
4917 unicode_isspace(PyUnicodeObject *self)
4918 {
4919     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4920     register const Py_UNICODE *e;
4921
4922     /* Shortcut for single character strings */
4923     if (PyUnicode_GET_SIZE(self) == 1 &&
4924         Py_UNICODE_ISSPACE(*p))
4925         return PyBool_FromLong(1);
4926
4927     /* Special case for empty strings */
4928     if (PyString_GET_SIZE(self) == 0)
4929         return PyBool_FromLong(0);
4930
4931     e = p + PyUnicode_GET_SIZE(self);
4932     for (; p < e; p++) {
4933         if (!Py_UNICODE_ISSPACE(*p))
4934             return PyBool_FromLong(0);
4935     }
4936     return PyBool_FromLong(1);
4937 }
4938
4939 PyDoc_STRVAR(isalpha__doc__,
4940 "S.isalpha() -> bool\n\
4941 \n\
4942 Return True if  all characters in S are alphabetic\n\
4943 and there is at least one character in S, False otherwise.");
4944
4945 static PyObject*
4946 unicode_isalpha(PyUnicodeObject *self)
4947 {
4948     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4949     register const Py_UNICODE *e;
4950
4951     /* Shortcut for single character strings */
4952     if (PyUnicode_GET_SIZE(self) == 1 &&
4953         Py_UNICODE_ISALPHA(*p))
4954         return PyBool_FromLong(1);
4955
4956     /* Special case for empty strings */
4957     if (PyString_GET_SIZE(self) == 0)
4958         return PyBool_FromLong(0);
4959
4960     e = p + PyUnicode_GET_SIZE(self);
4961     for (; p < e; p++) {
4962         if (!Py_UNICODE_ISALPHA(*p))
4963             return PyBool_FromLong(0);
4964     }
4965     return PyBool_FromLong(1);
4966 }
4967
4968 PyDoc_STRVAR(isalnum__doc__,
4969 "S.isalnum() -> bool\n\
4970 \n\
4971 Return True if  all characters in S are alphanumeric\n\
4972 and there is at least one character in S, False otherwise.");
4973
4974 static PyObject*
4975 unicode_isalnum(PyUnicodeObject *self)
4976 {
4977     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4978     register const Py_UNICODE *e;
4979
4980     /* Shortcut for single character strings */
4981     if (PyUnicode_GET_SIZE(self) == 1 &&
4982         Py_UNICODE_ISALNUM(*p))
4983         return PyBool_FromLong(1);
4984
4985     /* Special case for empty strings */
4986     if (PyString_GET_SIZE(self) == 0)
4987         return PyBool_FromLong(0);
4988
4989     e = p + PyUnicode_GET_SIZE(self);
4990     for (; p < e; p++) {
4991         if (!Py_UNICODE_ISALNUM(*p))
4992             return PyBool_FromLong(0);
4993     }
4994     return PyBool_FromLong(1);
4995 }
4996
4997 PyDoc_STRVAR(isdecimal__doc__,
4998 "S.isdecimal() -> bool\n\
4999 \n\
5000 Return True if there are only decimal characters in S,\n\
5001 False otherwise.");
5002
5003 static PyObject*
5004 unicode_isdecimal(PyUnicodeObject *self)
5005 {
5006     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5007     register const Py_UNICODE *e;
5008
5009     /* Shortcut for single character strings */
5010     if (PyUnicode_GET_SIZE(self) == 1 &&
5011         Py_UNICODE_ISDECIMAL(*p))
5012         return PyBool_FromLong(1);
5013
5014     /* Special case for empty strings */
5015     if (PyString_GET_SIZE(self) == 0)
5016         return PyBool_FromLong(0);
5017
5018     e = p + PyUnicode_GET_SIZE(self);
5019     for (; p < e; p++) {
5020         if (!Py_UNICODE_ISDECIMAL(*p))
5021             return PyBool_FromLong(0);
5022     }
5023     return PyBool_FromLong(1);
5024 }
5025
5026 PyDoc_STRVAR(isdigit__doc__,
5027 "S.isdigit() -> bool\n\
5028 \n\
5029 Return True if there are only digit characters in S,\n\
5030 False otherwise.");
5031
5032 static PyObject*
5033 unicode_isdigit(PyUnicodeObject *self)
5034 {
5035     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5036     register const Py_UNICODE *e;
5037
5038     /* Shortcut for single character strings */
5039     if (PyUnicode_GET_SIZE(self) == 1 &&
5040         Py_UNICODE_ISDIGIT(*p))
5041         return PyBool_FromLong(1);
5042
5043     /* Special case for empty strings */
5044     if (PyString_GET_SIZE(self) == 0)
5045         return PyBool_FromLong(0);
5046
5047     e = p + PyUnicode_GET_SIZE(self);
5048     for (; p < e; p++) {
5049         if (!Py_UNICODE_ISDIGIT(*p))
5050             return PyBool_FromLong(0);
5051     }
5052     return PyBool_FromLong(1);
5053 }
5054
5055 PyDoc_STRVAR(isnumeric__doc__,
5056 "S.isnumeric() -> bool\n\
5057 \n\
5058 Return True if there are only numeric characters in S,\n\
5059 False otherwise.");
5060
5061 static PyObject*
5062 unicode_isnumeric(PyUnicodeObject *self)
5063 {
5064     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5065     register const Py_UNICODE *e;
5066
5067     /* Shortcut for single character strings */
5068     if (PyUnicode_GET_SIZE(self) == 1 &&
5069         Py_UNICODE_ISNUMERIC(*p))
5070         return PyBool_FromLong(1);
5071
5072     /* Special case for empty strings */
5073     if (PyString_GET_SIZE(self) == 0)
5074         return PyBool_FromLong(0);
5075
5076     e = p + PyUnicode_GET_SIZE(self);
5077     for (; p < e; p++) {
5078         if (!Py_UNICODE_ISNUMERIC(*p))
5079             return PyBool_FromLong(0);
5080     }
5081     return PyBool_FromLong(1);
5082 }
5083
5084 PyDoc_STRVAR(join__doc__,
5085 "S.join(sequence) -> unicode\n\
5086 \n\
5087 Return a string which is the concatenation of the strings in the\n\
5088 sequence.  The separator between elements is S.");
5089
5090 static PyObject*
5091 unicode_join(PyObject *self, PyObject *data)
5092 {
5093     return PyUnicode_Join(self, data);
5094 }
5095
5096 static int
5097 unicode_length(PyUnicodeObject *self)
5098 {
5099     return self->length;
5100 }
5101
5102 PyDoc_STRVAR(ljust__doc__,
5103 "S.ljust(width) -> unicode\n\
5104 \n\
5105 Return S left justified in a Unicode string of length width. Padding is\n\
5106 done using spaces.");
5107
5108 static PyObject *
5109 unicode_ljust(PyUnicodeObject *self, PyObject *args)
5110 {
5111     int width;
5112     if (!PyArg_ParseTuple(args, "i:ljust", &width))
5113         return NULL;
5114
5115     if (self->length >= width && PyUnicode_CheckExact(self)) {
5116         Py_INCREF(self);
5117         return (PyObject*) self;
5118     }
5119
5120     return (PyObject*) pad(self, 0, width - self->length, ' ');
5121 }
5122
5123 PyDoc_STRVAR(lower__doc__,
5124 "S.lower() -> unicode\n\
5125 \n\
5126 Return a copy of the string S converted to lowercase.");
5127
5128 static PyObject*
5129 unicode_lower(PyUnicodeObject *self)
5130 {
5131     return fixup(self, fixlower);
5132 }
5133
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for mode i: the format string without its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
5142
5143 static const Py_UNICODE *
5144 unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5145 {
5146         size_t i;
5147         for (i = 0; i < n; ++i)
5148                 if (s[i] == c)
5149                         return s+i;
5150         return NULL;
5151 }
5152
5153 /* externally visible for str.strip(unicode) */
5154 PyObject *
5155 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5156 {
5157         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5158         int len = PyUnicode_GET_SIZE(self);
5159         Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5160         int seplen = PyUnicode_GET_SIZE(sepobj);
5161         int i, j;
5162
5163         i = 0;
5164         if (striptype != RIGHTSTRIP) {
5165                 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5166                         i++;
5167                 }
5168         }
5169
5170         j = len;
5171         if (striptype != LEFTSTRIP) {
5172                 do {
5173                         j--;
5174                 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5175                 j++;
5176         }
5177
5178         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5179                 Py_INCREF(self);
5180                 return (PyObject*)self;
5181         }
5182         else
5183                 return PyUnicode_FromUnicode(s+i, j-i);
5184 }
5185
5186
5187 static PyObject *
5188 do_strip(PyUnicodeObject *self, int striptype)
5189 {
5190         Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5191         int len = PyUnicode_GET_SIZE(self), i, j;
5192
5193         i = 0;
5194         if (striptype != RIGHTSTRIP) {
5195                 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5196                         i++;
5197                 }
5198         }
5199
5200         j = len;
5201         if (striptype != LEFTSTRIP) {
5202                 do {
5203                         j--;
5204                 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5205                 j++;
5206         }
5207
5208         if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5209                 Py_INCREF(self);
5210                 return (PyObject*)self;
5211         }
5212         else
5213                 return PyUnicode_FromUnicode(s+i, j-i);
5214 }
5215
5216
5217 static PyObject *
5218 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5219 {
5220         PyObject *sep = NULL;
5221
5222         if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5223                 return NULL;
5224
5225         if (sep != NULL && sep != Py_None) {
5226                 if (PyUnicode_Check(sep))
5227                         return _PyUnicode_XStrip(self, striptype, sep);
5228                 else if (PyString_Check(sep)) {
5229                         PyObject *res;
5230                         sep = PyUnicode_FromObject(sep);
5231                         if (sep==NULL)
5232                                 return NULL;
5233                         res = _PyUnicode_XStrip(self, striptype, sep);
5234                         Py_DECREF(sep);
5235                         return res;
5236                 }
5237                 else {
5238                         PyErr_Format(PyExc_TypeError,
5239                                      "%s arg must be None, unicode or str",
5240                                      STRIPNAME(striptype));
5241                         return NULL;
5242                 }
5243         }
5244
5245         return do_strip(self, striptype);
5246 }
5247
5248
5249 PyDoc_STRVAR(strip__doc__,
5250 "S.strip([chars]) -> unicode\n\
5251 \n\
5252 Return a copy of the string S with leading and trailing\n\
5253 whitespace removed.\n\
5254 If chars is given and not None, remove characters in chars instead.\n\
5255 If chars is a str, it will be converted to unicode before stripping");
5256
5257 static PyObject *
5258 unicode_strip(PyUnicodeObject *self, PyObject *args)
5259 {
5260         if (PyTuple_GET_SIZE(args) == 0)
5261                 return do_strip(self, BOTHSTRIP); /* Common case */
5262         else
5263                 return do_argstrip(self, BOTHSTRIP, args);
5264 }
5265
5266
5267 PyDoc_STRVAR(lstrip__doc__,
5268 "S.lstrip([chars]) -> unicode\n\
5269 \n\
5270 Return a copy of the string S with leading whitespace removed.\n\
5271 If chars is given and not None, remove characters in chars instead.\n\
5272 If chars is a str, it will be converted to unicode before stripping");
5273
5274 static PyObject *
5275 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5276 {
5277         if (PyTuple_GET_SIZE(args) == 0)
5278                 return do_strip(self, LEFTSTRIP); /* Common case */
5279         else
5280                 return do_argstrip(self, LEFTSTRIP, args);
5281 }
5282
5283
5284 PyDoc_STRVAR(rstrip__doc__,
5285 "S.rstrip([chars]) -> unicode\n\
5286 \n\
5287 Return a copy of the string S with trailing whitespace removed.\n\
5288 If chars is given and not None, remove characters in chars instead.\n\
5289 If chars is a str, it will be converted to unicode before stripping");
5290
5291 static PyObject *
5292 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5293 {
5294         if (PyTuple_GET_SIZE(args) == 0)
5295                 return do_strip(self, RIGHTSTRIP); /* Common case */
5296         else
5297                 return do_argstrip(self, RIGHTSTRIP, args);
5298 }
5299
5300
5301 static PyObject*
5302 unicode_repeat(PyUnicodeObject *str, int len)
5303 {
5304     PyUnicodeObject *u;
5305     Py_UNICODE *p;
5306     int nchars;
5307     size_t nbytes;
5308
5309     if (len < 0)
5310         len = 0;
5311
5312     if (len == 1 && PyUnicode_CheckExact(str)) {
5313         /* no repeat, return original string */
5314         Py_INCREF(str);
5315         return (PyObject*) str;
5316     }
5317
5318     /* ensure # of chars needed doesn't overflow int and # of bytes
5319      * needed doesn't overflow size_t
5320      */
5321     nchars = len * str->length;
5322     if (len && nchars / len != str->length) {
5323         PyErr_SetString(PyExc_OverflowError,
5324                         "repeated string is too long");
5325         return NULL;
5326     }
5327     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5328     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5329         PyErr_SetString(PyExc_OverflowError,
5330                         "repeated string is too long");
5331         return NULL;
5332     }
5333     u = _PyUnicode_New(nchars);
5334     if (!u)
5335         return NULL;
5336
5337     p = u->str;
5338
5339     while (len-- > 0) {
5340         Py_UNICODE_COPY(p, str->str, str->length);
5341         p += str->length;
5342     }
5343
5344     return (PyObject*) u;
5345 }
5346
5347 PyObject *PyUnicode_Replace(PyObject *obj,
5348                             PyObject *subobj,
5349                             PyObject *replobj,
5350                             int maxcount)
5351 {
5352     PyObject *self;
5353     PyObject *str1;
5354     PyObject *str2;
5355     PyObject *result;
5356
5357     self = PyUnicode_FromObject(obj);
5358     if (self == NULL)
5359         return NULL;
5360     str1 = PyUnicode_FromObject(subobj);
5361     if (str1 == NULL) {
5362         Py_DECREF(self);
5363         return NULL;
5364     }
5365     str2 = PyUnicode_FromObject(replobj);
5366     if (str2 == NULL) {
5367         Py_DECREF(self);
5368         Py_DECREF(str1);
5369         return NULL;
5370     }
5371     result = replace((PyUnicodeObject *)self,
5372                      (PyUnicodeObject *)str1,
5373                      (PyUnicodeObject *)str2,
5374                      maxcount);
5375     Py_DECREF(self);
5376     Py_DECREF(str1);
5377     Py_DECREF(str2);
5378     return result;
5379 }
5380
5381 PyDoc_STRVAR(replace__doc__,
5382 "S.replace (old, new[, maxsplit]) -> unicode\n\
5383 \n\
5384 Return a copy of S with all occurrences of substring\n\
5385 old replaced by new.  If the optional argument maxsplit is\n\
5386 given, only the first maxsplit occurrences are replaced.");
5387
5388 static PyObject*
5389 unicode_replace(PyUnicodeObject *self, PyObject *args)
5390 {
5391     PyUnicodeObject *str1;
5392     PyUnicodeObject *str2;
5393     int maxcount = -1;
5394     PyObject *result;
5395
5396     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5397         return NULL;
5398     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5399     if (str1 == NULL)
5400         return NULL;
5401     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
5402     if (str2 == NULL) {
5403         Py_DECREF(str1);
5404         return NULL;
5405     }
5406
5407     result = replace(self, str1, str2, maxcount);
5408
5409     Py_DECREF(str1);
5410     Py_DECREF(str2);
5411     return result;
5412 }
5413
5414 static
5415 PyObject *unicode_repr(PyObject *unicode)
5416 {
5417     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5418                                 PyUnicode_GET_SIZE(unicode),
5419                                 1);
5420 }
5421
5422 PyDoc_STRVAR(rfind__doc__,
5423 "S.rfind(sub [,start [,end]]) -> int\n\
5424 \n\
5425 Return the highest index in S where substring sub is found,\n\
5426 such that sub is contained within s[start,end].  Optional\n\
5427 arguments start and end are interpreted as in slice notation.\n\
5428 \n\
5429 Return -1 on failure.");
5430
5431 static PyObject *
5432 unicode_rfind(PyUnicodeObject *self, PyObject *args)
5433 {
5434     PyUnicodeObject *substring;
5435     int start = 0;
5436     int end = INT_MAX;
5437     PyObject *result;
5438
5439     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5440                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5441         return NULL;
5442     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5443                                                 (PyObject *)substring);
5444     if (substring == NULL)
5445         return NULL;
5446
5447     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5448
5449     Py_DECREF(substring);
5450     return result;
5451 }
5452
5453 PyDoc_STRVAR(rindex__doc__,
5454 "S.rindex(sub [,start [,end]]) -> int\n\
5455 \n\
5456 Like S.rfind() but raise ValueError when the substring is not found.");
5457
5458 static PyObject *
5459 unicode_rindex(PyUnicodeObject *self, PyObject *args)
5460 {
5461     int result;
5462     PyUnicodeObject *substring;
5463     int start = 0;
5464     int end = INT_MAX;
5465
5466     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5467                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5468         return NULL;
5469     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5470                                                 (PyObject *)substring);
5471     if (substring == NULL)
5472         return NULL;
5473
5474     result = findstring(self, substring, start, end, -1);
5475
5476     Py_DECREF(substring);
5477     if (result < 0) {
5478         PyErr_SetString(PyExc_ValueError, "substring not found");
5479         return NULL;
5480     }
5481     return PyInt_FromLong(result);
5482 }
5483
5484 PyDoc_STRVAR(rjust__doc__,
5485 "S.rjust(width) -> unicode\n\
5486 \n\
5487 Return S right justified in a Unicode string of length width. Padding is\n\
5488 done using spaces.");
5489
5490 static PyObject *
5491 unicode_rjust(PyUnicodeObject *self, PyObject *args)
5492 {
5493     int width;
5494     if (!PyArg_ParseTuple(args, "i:rjust", &width))
5495         return NULL;
5496
5497     if (self->length >= width && PyUnicode_CheckExact(self)) {
5498         Py_INCREF(self);
5499         return (PyObject*) self;
5500     }
5501
5502     return (PyObject*) pad(self, width - self->length, 0, ' ');
5503 }
5504
5505 static PyObject*
5506 unicode_slice(PyUnicodeObject *self, int start, int end)
5507 {
5508     /* standard clamping */
5509     if (start < 0)
5510         start = 0;
5511     if (end < 0)
5512         end = 0;
5513     if (end > self->length)
5514         end = self->length;
5515     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
5516         /* full slice, return original string */
5517         Py_INCREF(self);
5518         return (PyObject*) self;
5519     }
5520     if (start > end)
5521         start = end;
5522     /* copy slice */
5523     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5524                                              end - start);
5525 }
5526
5527 PyObject *PyUnicode_Split(PyObject *s,
5528                           PyObject *sep,
5529                           int maxsplit)
5530 {
5531     PyObject *result;
5532
5533     s = PyUnicode_FromObject(s);
5534     if (s == NULL)
5535         return NULL;
5536     if (sep != NULL) {
5537         sep = PyUnicode_FromObject(sep);
5538         if (sep == NULL) {
5539             Py_DECREF(s);
5540             return NULL;
5541         }
5542     }
5543
5544     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5545
5546     Py_DECREF(s);
5547     Py_XDECREF(sep);
5548     return result;
5549 }
5550
5551 PyDoc_STRVAR(split__doc__,
5552 "S.split([sep [,maxsplit]]) -> list of strings\n\
5553 \n\
5554 Return a list of the words in S, using sep as the\n\
5555 delimiter string.  If maxsplit is given, at most maxsplit\n\
5556 splits are done. If sep is not specified, any whitespace string\n\
5557 is a separator.");
5558
5559 static PyObject*
5560 unicode_split(PyUnicodeObject *self, PyObject *args)
5561 {
5562     PyObject *substring = Py_None;
5563     int maxcount = -1;
5564
5565     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5566         return NULL;
5567
5568     if (substring == Py_None)
5569         return split(self, NULL, maxcount);
5570     else if (PyUnicode_Check(substring))
5571         return split(self, (PyUnicodeObject *)substring, maxcount);
5572     else
5573         return PyUnicode_Split((PyObject *)self, substring, maxcount);
5574 }
5575
5576 PyDoc_STRVAR(splitlines__doc__,
5577 "S.splitlines([keepends]]) -> list of strings\n\
5578 \n\
5579 Return a list of the lines in S, breaking at line boundaries.\n\
5580 Line breaks are not included in the resulting list unless keepends\n\
5581 is given and true.");
5582
5583 static PyObject*
5584 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5585 {
5586     int keepends = 0;
5587
5588     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
5589         return NULL;
5590
5591     return PyUnicode_Splitlines((PyObject *)self, keepends);
5592 }
5593
5594 static
5595 PyObject *unicode_str(PyUnicodeObject *self)
5596 {
5597     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
5598 }
5599
5600 PyDoc_STRVAR(swapcase__doc__,
5601 "S.swapcase() -> unicode\n\
5602 \n\
5603 Return a copy of S with uppercase characters converted to lowercase\n\
5604 and vice versa.");
5605
5606 static PyObject*
5607 unicode_swapcase(PyUnicodeObject *self)
5608 {
5609     return fixup(self, fixswapcase);
5610 }
5611
5612 PyDoc_STRVAR(translate__doc__,
5613 "S.translate(table) -> unicode\n\
5614 \n\
5615 Return a copy of the string S, where all characters have been mapped\n\
5616 through the given translation table, which must be a mapping of\n\
5617 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5618 Unmapped characters are left untouched. Characters mapped to None\n\
5619 are deleted.");
5620
5621 static PyObject*
5622 unicode_translate(PyUnicodeObject *self, PyObject *table)
5623 {
5624     return PyUnicode_TranslateCharmap(self->str,
5625                                       self->length,
5626                                       table,
5627                                       "ignore");
5628 }
5629
5630 PyDoc_STRVAR(upper__doc__,
5631 "S.upper() -> unicode\n\
5632 \n\
5633 Return a copy of S converted to uppercase.");
5634
5635 static PyObject*
5636 unicode_upper(PyUnicodeObject *self)
5637 {
5638     return fixup(self, fixupper);
5639 }
5640
5641 PyDoc_STRVAR(zfill__doc__,
5642 "S.zfill(width) -> unicode\n\
5643 \n\
5644 Pad a numeric string x with zeros on the left, to fill a field\n\
5645 of the specified width. The string x is never truncated.");
5646
5647 static PyObject *
5648 unicode_zfill(PyUnicodeObject *self, PyObject *args)
5649 {
5650     int fill;
5651     PyUnicodeObject *u;
5652
5653     int width;
5654     if (!PyArg_ParseTuple(args, "i:zfill", &width))
5655         return NULL;
5656
5657     if (self->length >= width) {
5658         if (PyUnicode_CheckExact(self)) {
5659             Py_INCREF(self);
5660             return (PyObject*) self;
5661         }
5662         else
5663             return PyUnicode_FromUnicode(
5664                 PyUnicode_AS_UNICODE(self),
5665                 PyUnicode_GET_SIZE(self)
5666             );
5667     }
5668
5669     fill = width - self->length;
5670
5671     u = pad(self, fill, 0, '0');
5672
5673     if (u == NULL)
5674         return NULL;
5675
5676     if (u->str[fill] == '+' || u->str[fill] == '-') {
5677         /* move sign to beginning of string */
5678         u->str[0] = u->str[fill];
5679         u->str[fill] = '0';
5680     }
5681
5682     return (PyObject*) u;
5683 }
5684
#if 0
/* Compiled out: wraps the current value of unicode_freelist_size in a
   Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5692
5693 PyDoc_STRVAR(startswith__doc__,
5694 "S.startswith(prefix[, start[, end]]) -> bool\n\
5695 \n\
5696 Return True if S starts with the specified prefix, False otherwise.\n\
5697 With optional start, test S beginning at that position.\n\
5698 With optional end, stop comparing S at that position.");
5699
5700 static PyObject *
5701 unicode_startswith(PyUnicodeObject *self,
5702                    PyObject *args)
5703 {
5704     PyUnicodeObject *substring;
5705     int start = 0;
5706     int end = INT_MAX;
5707     PyObject *result;
5708
5709     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5710                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5711         return NULL;
5712     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5713                                                 (PyObject *)substring);
5714     if (substring == NULL)
5715         return NULL;
5716
5717     result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
5718
5719     Py_DECREF(substring);
5720     return result;
5721 }
5722
5723
5724 PyDoc_STRVAR(endswith__doc__,
5725 "S.endswith(suffix[, start[, end]]) -> bool\n\
5726 \n\
5727 Return True if S ends with the specified suffix, False otherwise.\n\
5728 With optional start, test S beginning at that position.\n\
5729 With optional end, stop comparing S at that position.");
5730
5731 static PyObject *
5732 unicode_endswith(PyUnicodeObject *self,
5733                  PyObject *args)
5734 {
5735     PyUnicodeObject *substring;
5736     int start = 0;
5737     int end = INT_MAX;
5738     PyObject *result;
5739
5740     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5741                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
5742         return NULL;
5743     substring = (PyUnicodeObject *)PyUnicode_FromObject(
5744                                                 (PyObject *)substring);
5745     if (substring == NULL)
5746         return NULL;
5747
5748     result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
5749
5750     Py_DECREF(substring);
5751     return result;
5752 }
5753
5754
5755
5756 static PyObject *
5757 unicode_getnewargs(PyUnicodeObject *v)
5758 {
5759         return Py_BuildValue("(u#)", v->str, v->length);
5760 }
5761
5762
/* Method table for the unicode type.  Each entry gives the Python-level
   method name, the C function implementing it, the calling-convention
   flag (METH_VARARGS, METH_O or METH_NOARGS) and the docstring object.
   Entries wrapped in "#if 0" or commented out are not compiled in. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* No docstring is supplied for this entry. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* All-NULL entry marking the end of the table. */
    {NULL, NULL}
};
5816
5817 static PyObject *
5818 unicode_mod(PyObject *v, PyObject *w)
5819 {
5820        if (!PyUnicode_Check(v)) {
5821                Py_INCREF(Py_NotImplemented);
5822                return Py_NotImplemented;
5823        }
5824        return PyUnicode_Format(v, w);
5825 }
5826
/* Number-protocol table for the unicode type.  Only nb_remainder is
   populated (with unicode_mod); the four slots before it are explicitly
   zero and the slots after it are omitted from the initializer, which
   in C leaves them zero-initialized as well. */
static PyNumberMethods unicode_as_number = {
        0,                              /*nb_add*/
        0,                              /*nb_subtract*/
        0,                              /*nb_multiply*/
        0,                              /*nb_divide*/
        unicode_mod,                    /*nb_remainder*/
};
5834
/* Sequence-protocol table for the unicode type.  The item/slice
   assignment slots are zero; every other listed slot is filled with
   the function named on its line, cast to the slot's function type. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
5845
5846 static PyObject*
5847 unicode_subscript(PyUnicodeObject* self, PyObject* item)
5848 {
5849     if (PyInt_Check(item)) {
5850         long i = PyInt_AS_LONG(item);
5851         if (i < 0)
5852             i += PyString_GET_SIZE(self);
5853         return unicode_getitem(self, i);
5854     } else if (PyLong_Check(item)) {
5855         long i = PyLong_AsLong(item);
5856         if (i == -1 && PyErr_Occurred())
5857             return NULL;
5858         if (i < 0)
5859             i += PyString_GET_SIZE(self);
5860         return unicode_getitem(self, i);
5861     } else if (PySlice_Check(item)) {
5862         int start, stop, step, slicelength, cur, i;
5863         Py_UNICODE* source_buf;
5864         Py_UNICODE* result_buf;
5865         PyObject* result;
5866
5867         if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5868                                  &start, &stop, &step, &slicelength) < 0) {
5869             return NULL;
5870         }
5871
5872         if (slicelength <= 0) {
5873             return PyUnicode_FromUnicode(NULL, 0);
5874         } else {
5875             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5876             result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5877
5878             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5879                 result_buf[i] = source_buf[cur];
5880             }
5881
5882             result = PyUnicode_FromUnicode(result_buf, slicelength);
5883             PyMem_FREE(result_buf);
5884             return result;
5885         }
5886     } else {
5887         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5888         return NULL;
5889     }
5890 }
5891
/* Mapping-protocol table for the unicode type: length and subscript
   lookups are provided, subscript assignment is left as a null slot. */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
5897
5898 static int
5899 unicode_buffer_getreadbuf(PyUnicodeObject *self,
5900                           int index,
5901                           const void **ptr)
5902 {
5903     if (index != 0) {
5904         PyErr_SetString(PyExc_SystemError,
5905                         "accessing non-existent unicode segment");
5906         return -1;
5907     }
5908     *ptr = (void *) self->str;
5909     return PyUnicode_GET_DATA_SIZE(self);
5910 }
5911
5912 static int
5913 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5914                            const void **ptr)
5915 {
5916     PyErr_SetString(PyExc_TypeError,
5917                     "cannot use unicode as modifiable buffer");
5918     return -1;
5919 }
5920
5921 static int
5922 unicode_buffer_getsegcount(PyUnicodeObject *self,
5923                            int *lenp)
5924 {
5925     if (lenp)
5926         *lenp = PyUnicode_GET_DATA_SIZE(self);
5927     return 1;
5928 }
5929
5930 static int
5931 unicode_buffer_getcharbuf(PyUnicodeObject *self,
5932                           int index,
5933                           const void **ptr)
5934 {
5935     PyObject *str;
5936
5937     if (index != 0) {
5938         PyErr_SetString(PyExc_SystemError,
5939                         "accessing non-existent unicode segment");
5940         return -1;
5941     }
5942     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5943     if (str == NULL)
5944         return -1;
5945     *ptr = (void *) PyString_AS_STRING(str);
5946     return PyString_GET_SIZE(str);
5947 }
5948
5949 /* Helpers for PyUnicode_Format() */
5950
5951 static PyObject *
5952 getnextarg(PyObject *args, int arglen, int *p_argidx)
5953 {
5954     int argidx = *p_argidx;
5955     if (argidx < arglen) {
5956         (*p_argidx)++;
5957         if (arglen < 0)
5958             return args;
5959         else
5960             return PyTuple_GetItem(args, argidx);
5961     }
5962     PyErr_SetString(PyExc_TypeError,
5963                     "not enough arguments for format string");
5964     return NULL;
5965 }
5966
/* Bit flags collected by PyUnicode_Format() while it scans the flag
   characters of a conversion specifier; one bit per flag character. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
5972
5973 static
5974 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5975 {
5976     register int i;
5977     int len;
5978     va_list va;
5979     char *charbuffer;
5980     va_start(va, format);
5981
5982     /* First, format the string as char array, then expand to Py_UNICODE
5983        array. */
5984     charbuffer = (char *)buffer;
5985     len = vsprintf(charbuffer, format, va);
5986     for (i = len - 1; i >= 0; i--)
5987         buffer[i] = (Py_UNICODE) charbuffer[i];
5988
5989     va_end(va);
5990     return len;
5991 }
5992
5993 /* XXX To save some code duplication, formatfloat/long/int could have been
5994    shared with stringobject.c, converting from 8-bit to Unicode after the
5995    formatting is done. */
5996
5997 static int
5998 formatfloat(Py_UNICODE *buf,
5999             size_t buflen,
6000             int flags,
6001             int prec,
6002             int type,
6003             PyObject *v)
6004 {
6005     /* fmt = '%#.' + `prec` + `type`
6006        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
6007     char fmt[20];
6008     double x;
6009
6010     x = PyFloat_AsDouble(v);
6011     if (x == -1.0 && PyErr_Occurred())
6012         return -1;
6013     if (prec < 0)
6014         prec = 6;
6015     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6016         type = 'g';
6017     /* Worst case length calc to ensure no buffer overrun:
6018
6019        'g' formats:
6020          fmt = %#.<prec>g
6021          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6022             for any double rep.)
6023          len = 1 + prec + 1 + 2 + 5 = 9 + prec
6024
6025        'f' formats:
6026          buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6027          len = 1 + 50 + 1 + prec = 52 + prec
6028
6029        If prec=0 the effective precision is 1 (the leading digit is
6030        always given), therefore increase the length by one.
6031
6032     */
6033     if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6034         (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
6035         PyErr_SetString(PyExc_OverflowError,
6036                         "formatted float is too long (precision too large?)");
6037         return -1;
6038     }
6039     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6040                   (flags&F_ALT) ? "#" : "",
6041                   prec, type);
6042     return usprintf(buf, fmt, x);
6043 }
6044
6045 static PyObject*
6046 formatlong(PyObject *val, int flags, int prec, int type)
6047 {
6048         char *buf;
6049         int i, len;
6050         PyObject *str; /* temporary string object. */
6051         PyUnicodeObject *result;
6052
6053         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6054         if (!str)
6055                 return NULL;
6056         result = _PyUnicode_New(len);
6057         for (i = 0; i < len; i++)
6058                 result->str[i] = buf[i];
6059         result->str[len] = 0;
6060         Py_DECREF(str);
6061         return (PyObject*)result;
6062 }
6063
6064 static int
6065 formatint(Py_UNICODE *buf,
6066           size_t buflen,
6067           int flags,
6068           int prec,
6069           int type,
6070           PyObject *v)
6071 {
6072     /* fmt = '%#.' + `prec` + 'l' + `type`
6073      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6074      *                     + 1 + 1
6075      *                   = 24
6076      */
6077     char fmt[64]; /* plenty big enough! */
6078     long x;
6079
6080     x = PyInt_AsLong(v);
6081     if (x == -1 && PyErr_Occurred())
6082         return -1;
6083     if (x < 0 && type != 'd' && type != 'i') {
6084         if (PyErr_Warn(PyExc_FutureWarning,
6085                        "%u/%o/%x/%X of negative int will return "
6086                        "a signed string in Python 2.4 and up") < 0)
6087             return -1;
6088     }
6089     if (prec < 0)
6090         prec = 1;
6091
6092     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
6093      * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6094      */
6095     if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
6096         PyErr_SetString(PyExc_OverflowError,
6097                 "formatted integer is too long (precision too large?)");
6098         return -1;
6099     }
6100
6101     if ((flags & F_ALT) &&
6102         (type == 'x' || type == 'X')) {
6103         /* When converting under %#x or %#X, there are a number
6104          * of issues that cause pain:
6105          * - when 0 is being converted, the C standard leaves off
6106          *   the '0x' or '0X', which is inconsistent with other
6107          *   %#x/%#X conversions and inconsistent with Python's
6108          *   hex() function
6109          * - there are platforms that violate the standard and
6110          *   convert 0 with the '0x' or '0X'
6111          *   (Metrowerks, Compaq Tru64)
6112          * - there are platforms that give '0x' when converting
6113          *   under %#X, but convert 0 in accordance with the
6114          *   standard (OS/2 EMX)
6115          *
6116          * We can achieve the desired consistency by inserting our
6117          * own '0x' or '0X' prefix, and substituting %x/%X in place
6118          * of %#x/%#X.
6119          *
6120          * Note that this is the same approach as used in
6121          * formatint() in stringobject.c
6122          */
6123         PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6124                       type, prec, type);
6125     }
6126     else {
6127         PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6128                       (flags&F_ALT) ? "#" : "",
6129                       prec, type);
6130     }
6131     return usprintf(buf, fmt, x);
6132 }
6133
6134 static int
6135 formatchar(Py_UNICODE *buf,
6136            size_t buflen,
6137            PyObject *v)
6138 {
6139     /* presume that the buffer is at least 2 characters long */
6140     if (PyUnicode_Check(v)) {
6141         if (PyUnicode_GET_SIZE(v) != 1)
6142             goto onError;
6143         buf[0] = PyUnicode_AS_UNICODE(v)[0];
6144     }
6145
6146     else if (PyString_Check(v)) {
6147         if (PyString_GET_SIZE(v) != 1)
6148             goto onError;
6149         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6150     }
6151
6152     else {
6153         /* Integer input truncated to a character */
6154         long x;
6155         x = PyInt_AsLong(v);
6156         if (x == -1 && PyErr_Occurred())
6157             goto onError;
6158 #ifdef Py_UNICODE_WIDE
6159         if (x < 0 || x > 0x10ffff) {
6160             PyErr_SetString(PyExc_OverflowError,
6161                             "%c arg not in range(0x110000) "
6162                             "(wide Python build)");
6163             return -1;
6164         }
6165 #else
6166         if (x < 0 || x > 0xffff) {
6167             PyErr_SetString(PyExc_OverflowError,
6168                             "%c arg not in range(0x10000) "
6169                             "(narrow Python build)");
6170             return -1;
6171         }
6172 #endif
6173         buf[0] = (Py_UNICODE) x;
6174     }
6175     buf[1] = '\0';
6176     return 1;
6177
6178  onError:
6179     PyErr_SetString(PyExc_TypeError,
6180                     "%c requires int or char");
6181     return -1;
6182 }
6183
6184 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6185
6186    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6187    chars are formatted. XXX This is a magic number. Each formatting
6188    routine does bounds checking to ensure no overflow, but a better
6189    solution may be to malloc a buffer of appropriate size for each
6190    format. For now, the current solution is sufficient.
6191 */
/* Parenthesized so the cast and its operand expand as a single
   expression wherever the macro is used. */
#define FORMATBUFLEN ((size_t)120)
6193
6194 PyObject *PyUnicode_Format(PyObject *format,
6195                            PyObject *args)
6196 {
6197     Py_UNICODE *fmt, *res;
6198     int fmtcnt, rescnt, reslen, arglen, argidx;
6199     int args_owned = 0;
6200     PyUnicodeObject *result = NULL;
6201     PyObject *dict = NULL;
6202     PyObject *uformat;
6203
6204     if (format == NULL || args == NULL) {
6205         PyErr_BadInternalCall();
6206         return NULL;
6207     }
6208     uformat = PyUnicode_FromObject(format);
6209     if (uformat == NULL)
6210         return NULL;
6211     fmt = PyUnicode_AS_UNICODE(uformat);
6212     fmtcnt = PyUnicode_GET_SIZE(uformat);
6213
6214     reslen = rescnt = fmtcnt + 100;
6215     result = _PyUnicode_New(reslen);
6216     if (result == NULL)
6217         goto onError;
6218     res = PyUnicode_AS_UNICODE(result);
6219
6220     if (PyTuple_Check(args)) {
6221         arglen = PyTuple_Size(args);
6222         argidx = 0;
6223     }
6224     else {
6225         arglen = -1;
6226         argidx = -2;
6227     }
6228     if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6229         !PyObject_TypeCheck(args, &PyBaseString_Type))
6230         dict = args;
6231
6232     while (--fmtcnt >= 0) {
6233         if (*fmt != '%') {
6234             if (--rescnt < 0) {
6235                 rescnt = fmtcnt + 100;
6236                 reslen += rescnt;
6237                 if (_PyUnicode_Resize(&result, reslen) < 0)
6238                     return NULL;
6239                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6240                 --rescnt;
6241             }
6242             *res++ = *fmt++;
6243         }
6244         else {
6245             /* Got a format specifier */
6246             int flags = 0;
6247             int width = -1;
6248             int prec = -1;
6249             Py_UNICODE c = '\0';
6250             Py_UNICODE fill;
6251             PyObject *v = NULL;
6252             PyObject *temp = NULL;
6253             Py_UNICODE *pbuf;
6254             Py_UNICODE sign;
6255             int len;
6256             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
6257
6258             fmt++;
6259             if (*fmt == '(') {
6260                 Py_UNICODE *keystart;
6261                 int keylen;
6262                 PyObject *key;
6263                 int pcount = 1;
6264
6265                 if (dict == NULL) {
6266                     PyErr_SetString(PyExc_TypeError,
6267                                     "format requires a mapping");
6268                     goto onError;
6269                 }
6270                 ++fmt;
6271                 --fmtcnt;
6272                 keystart = fmt;
6273                 /* Skip over balanced parentheses */
6274                 while (pcount > 0 && --fmtcnt >= 0) {
6275                     if (*fmt == ')')
6276                         --pcount;
6277                     else if (*fmt == '(')
6278                         ++pcount;
6279                     fmt++;
6280                 }
6281                 keylen = fmt - keystart - 1;
6282                 if (fmtcnt < 0 || pcount > 0) {
6283                     PyErr_SetString(PyExc_ValueError,
6284                                     "incomplete format key");
6285                     goto onError;
6286                 }
6287 #if 0
6288                 /* keys are converted to strings using UTF-8 and
6289                    then looked up since Python uses strings to hold
6290                    variables names etc. in its namespaces and we
6291                    wouldn't want to break common idioms. */
6292                 key = PyUnicode_EncodeUTF8(keystart,
6293                                            keylen,
6294                                            NULL);
6295 #else
6296                 key = PyUnicode_FromUnicode(keystart, keylen);
6297 #endif
6298                 if (key == NULL)
6299                     goto onError;
6300                 if (args_owned) {
6301                     Py_DECREF(args);
6302                     args_owned = 0;
6303                 }
6304                 args = PyObject_GetItem(dict, key);
6305                 Py_DECREF(key);
6306                 if (args == NULL) {
6307                     goto onError;
6308                 }
6309                 args_owned = 1;
6310                 arglen = -1;
6311                 argidx = -2;
6312             }
6313             while (--fmtcnt >= 0) {
6314                 switch (c = *fmt++) {
6315                 case '-': flags |= F_LJUST; continue;
6316                 case '+': flags |= F_SIGN; continue;
6317                 case ' ': flags |= F_BLANK; continue;
6318                 case '#': flags |= F_ALT; continue;
6319                 case '0': flags |= F_ZERO; continue;
6320                 }
6321                 break;
6322             }
6323             if (c == '*') {
6324                 v = getnextarg(args, arglen, &argidx);
6325                 if (v == NULL)
6326                     goto onError;
6327                 if (!PyInt_Check(v)) {
6328                     PyErr_SetString(PyExc_TypeError,
6329                                     "* wants int");
6330                     goto onError;
6331                 }
6332                 width = PyInt_AsLong(v);
6333                 if (width < 0) {
6334                     flags |= F_LJUST;
6335                     width = -width;
6336                 }
6337                 if (--fmtcnt >= 0)
6338                     c = *fmt++;
6339             }
6340             else if (c >= '0' && c <= '9') {
6341                 width = c - '0';
6342                 while (--fmtcnt >= 0) {
6343                     c = *fmt++;
6344                     if (c < '0' || c > '9')
6345                         break;
6346                     if ((width*10) / 10 != width) {
6347                         PyErr_SetString(PyExc_ValueError,
6348                                         "width too big");
6349                         goto onError;
6350                     }
6351                     width = width*10 + (c - '0');
6352                 }
6353             }
6354             if (c == '.') {
6355                 prec = 0;
6356                 if (--fmtcnt >= 0)
6357                     c = *fmt++;
6358                 if (c == '*') {
6359                     v = getnextarg(args, arglen, &argidx);
6360                     if (v == NULL)
6361                         goto onError;
6362                     if (!PyInt_Check(v)) {
6363                         PyErr_SetString(PyExc_TypeError,
6364                                         "* wants int");
6365                         goto onError;
6366                     }
6367                     prec = PyInt_AsLong(v);
6368                     if (prec < 0)
6369                         prec = 0;
6370                     if (--fmtcnt >= 0)
6371                         c = *fmt++;
6372                 }
6373                 else if (c >= '0' && c <= '9') {
6374                     prec = c - '0';
6375                     while (--fmtcnt >= 0) {
6376                         c = Py_CHARMASK(*fmt++);
6377                         if (c < '0' || c > '9')
6378                             break;
6379                         if ((prec*10) / 10 != prec) {
6380                             PyErr_SetString(PyExc_ValueError,
6381                                             "prec too big");
6382                             goto onError;
6383                         }
6384                         prec = prec*10 + (c - '0');
6385                     }
6386                 }
6387             } /* prec */
6388             if (fmtcnt >= 0) {
6389                 if (c == 'h' || c == 'l' || c == 'L') {
6390                     if (--fmtcnt >= 0)
6391                         c = *fmt++;
6392                 }
6393             }
6394             if (fmtcnt < 0) {
6395                 PyErr_SetString(PyExc_ValueError,
6396                                 "incomplete format");
6397                 goto onError;
6398             }
6399             if (c != '%') {
6400                 v = getnextarg(args, arglen, &argidx);
6401                 if (v == NULL)
6402                     goto onError;
6403             }
6404             sign = 0;
6405             fill = ' ';
6406             switch (c) {
6407
6408             case '%':
6409                 pbuf = formatbuf;
6410                 /* presume that buffer length is at least 1 */
6411                 pbuf[0] = '%';
6412                 len = 1;
6413                 break;
6414
6415             case 's':
6416             case 'r':
6417                 if (PyUnicode_Check(v) && c == 's') {
6418                     temp = v;
6419                     Py_INCREF(temp);
6420                 }
6421                 else {
6422                     PyObject *unicode;
6423                     if (c == 's')
6424                         temp = PyObject_Str(v);
6425                     else
6426                         temp = PyObject_Repr(v);
6427                     if (temp == NULL)
6428                         goto onError;
6429                     if (!PyString_Check(temp)) {
6430                         /* XXX Note: this should never happen, since
6431                                PyObject_Repr() and PyObject_Str() assure
6432                                this */
6433                         Py_DECREF(temp);
6434                         PyErr_SetString(PyExc_TypeError,
6435                                         "%s argument has non-string str()");
6436                         goto onError;
6437                     }
6438                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
6439                                                    PyString_GET_SIZE(temp),
6440                                                NULL,
6441                                                    "strict");
6442                     Py_DECREF(temp);
6443                     temp = unicode;
6444                     if (temp == NULL)
6445                         goto onError;
6446                 }
6447                 pbuf = PyUnicode_AS_UNICODE(temp);
6448                 len = PyUnicode_GET_SIZE(temp);
6449                 if (prec >= 0 && len > prec)
6450                     len = prec;
6451                 break;
6452
6453             case 'i':
6454             case 'd':
6455             case 'u':
6456             case 'o':
6457             case 'x':
6458             case 'X':
6459                 if (c == 'i')
6460                     c = 'd';
6461                 if (PyLong_Check(v)) {
6462                     temp = formatlong(v, flags, prec, c);
6463                     if (!temp)
6464                         goto onError;
6465                     pbuf = PyUnicode_AS_UNICODE(temp);
6466                     len = PyUnicode_GET_SIZE(temp);
6467                     /* unbounded ints can always produce
6468                        a sign character! */
6469                     sign = 1;
6470                 }
6471                 else {
6472                     pbuf = formatbuf;
6473                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6474                                     flags, prec, c, v);
6475                     if (len < 0)
6476                         goto onError;
6477                     /* only d conversion is signed */
6478                     sign = c == 'd';
6479                 }
6480                 if (flags & F_ZERO)
6481                     fill = '0';
6482                 break;
6483
6484             case 'e':
6485             case 'E':
6486             case 'f':
6487             case 'g':
6488             case 'G':
6489                 pbuf = formatbuf;
6490                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6491                         flags, prec, c, v);
6492                 if (len < 0)
6493                     goto onError;
6494                 sign = 1;
6495                 if (flags & F_ZERO)
6496                     fill = '0';
6497                 break;
6498
6499             case 'c':
6500                 pbuf = formatbuf;
6501                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
6502                 if (len < 0)
6503                     goto onError;
6504                 break;
6505
6506             default:
6507                 PyErr_Format(PyExc_ValueError,
6508                              "unsupported format character '%c' (0x%x) "
6509                              "at index %i",
6510                              (31<=c && c<=126) ? (char)c : '?',
6511                              (int)c,
6512                              (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
6513                 goto onError;
6514             }
6515             if (sign) {
6516                 if (*pbuf == '-' || *pbuf == '+') {
6517                     sign = *pbuf++;
6518                     len--;
6519                 }
6520                 else if (flags & F_SIGN)
6521                     sign = '+';
6522                 else if (flags & F_BLANK)
6523                     sign = ' ';
6524                 else
6525                     sign = 0;
6526             }
6527             if (width < len)
6528                 width = len;
6529             if (rescnt - (sign != 0) < width) {
6530                 reslen -= rescnt;
6531                 rescnt = width + fmtcnt + 100;
6532                 reslen += rescnt;
6533                 if (reslen < 0) {
6534                     Py_DECREF(result);
6535                     return PyErr_NoMemory();
6536                 }
6537                 if (_PyUnicode_Resize(&result, reslen) < 0)
6538                     return NULL;
6539                 res = PyUnicode_AS_UNICODE(result)
6540                     + reslen - rescnt;
6541             }
6542             if (sign) {
6543                 if (fill != ' ')
6544                     *res++ = sign;
6545                 rescnt--;
6546                 if (width > len)
6547                     width--;
6548             }
6549             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6550                 assert(pbuf[0] == '0');
6551                 assert(pbuf[1] == c);
6552                 if (fill != ' ') {
6553                     *res++ = *pbuf++;
6554                     *res++ = *pbuf++;
6555                 }
6556                 rescnt -= 2;
6557                 width -= 2;
6558                 if (width < 0)
6559                     width = 0;
6560                 len -= 2;
6561             }
6562             if (width > len && !(flags & F_LJUST)) {
6563                 do {
6564                     --rescnt;
6565                     *res++ = fill;
6566                 } while (--width > len);
6567             }
6568             if (fill == ' ') {
6569                 if (sign)
6570                     *res++ = sign;
6571                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6572                     assert(pbuf[0] == '0');
6573                     assert(pbuf[1] == c);
6574                     *res++ = *pbuf++;
6575                     *res++ = *pbuf++;
6576                 }
6577             }
6578             Py_UNICODE_COPY(res, pbuf, len);
6579             res += len;
6580             rescnt -= len;
6581             while (--width >= len) {
6582                 --rescnt;
6583                 *res++ = ' ';
6584             }
6585             if (dict && (argidx < arglen) && c != '%') {
6586                 PyErr_SetString(PyExc_TypeError,
6587                                 "not all arguments converted during string formatting");
6588                 goto onError;
6589             }
6590             Py_XDECREF(temp);
6591         } /* '%' */
6592     } /* until end */
6593     if (argidx < arglen && !dict) {
6594         PyErr_SetString(PyExc_TypeError,
6595                         "not all arguments converted during string formatting");
6596         goto onError;
6597     }
6598
6599     if (args_owned) {
6600         Py_DECREF(args);
6601     }
6602     Py_DECREF(uformat);
6603     if (_PyUnicode_Resize(&result, reslen - rescnt))
6604         goto onError;
6605     return (PyObject *)result;
6606
6607  onError:
6608     Py_XDECREF(result);
6609     Py_DECREF(uformat);
6610     if (args_owned) {
6611         Py_DECREF(args);
6612     }
6613     return NULL;
6614 }
6615
/* Buffer interface slot table for unicode objects.  It is installed in
   the tp_as_buffer slot of PyUnicode_Type.  The entries are positional;
   each unicode_buffer_* function is cast to the slot type it fills. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
6622
/* Forward declaration: unicode_new() hands requests for subtype
   instances to this function, which is defined after it. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6625
6626 static PyObject *
6627 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6628 {
6629         PyObject *x = NULL;
6630         static char *kwlist[] = {"string", "encoding", "errors", 0};
6631         char *encoding = NULL;
6632         char *errors = NULL;
6633
6634         if (type != &PyUnicode_Type)
6635                 return unicode_subtype_new(type, args, kwds);
6636         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6637                                           kwlist, &x, &encoding, &errors))
6638             return NULL;
6639         if (x == NULL)
6640                 return (PyObject *)_PyUnicode_New(0);
6641         if (encoding == NULL && errors == NULL)
6642             return PyObject_Unicode(x);
6643         else
6644         return PyUnicode_FromEncodedObject(x, encoding, errors);
6645 }
6646
6647 static PyObject *
6648 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6649 {
6650         PyUnicodeObject *tmp, *pnew;
6651         int n;
6652
6653         assert(PyType_IsSubtype(type, &PyUnicode_Type));
6654         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6655         if (tmp == NULL)
6656                 return NULL;
6657         assert(PyUnicode_Check(tmp));
6658         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6659         if (pnew == NULL)
6660                 return NULL;
6661         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6662         if (pnew->str == NULL) {
6663                 _Py_ForgetReference((PyObject *)pnew);
6664                 PyObject_Del(pnew);
6665                 return PyErr_NoMemory();
6666         }
6667         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6668         pnew->length = n;
6669         pnew->hash = tmp->hash;
6670         Py_DECREF(tmp);
6671         return (PyObject *)pnew;
6672 }
6673
/* Docstring text for the unicode type; it is installed in the tp_doc
   slot of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
6680
/* Type object for "unicode".

   The initializer is positional, so the order of the entries must
   follow the layout of PyTypeObject; the trailing comment on each line
   names the slot being filled.  A 0 entry leaves the slot unset here.
   The type is readied by the PyType_Ready() call in _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
            Py_TPFLAGS_BASETYPE,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,               /* tp_free */
};
6725
6726 /* Initialize the Unicode implementation */
6727
6728 void _PyUnicode_Init(void)
6729 {
6730     int i;
6731
6732     /* Init the implementation */
6733     unicode_freelist = NULL;
6734     unicode_freelist_size = 0;
6735     unicode_empty = _PyUnicode_New(0);
6736     strcpy(unicode_default_encoding, "ascii");
6737     for (i = 0; i < 256; i++)
6738         unicode_latin1[i] = NULL;
6739     if (PyType_Ready(&PyUnicode_Type) < 0)
6740         Py_FatalError("Can't initialize 'unicode'");
6741 }
6742
6743 /* Finalize the Unicode implementation */
6744
6745 void
6746 _PyUnicode_Fini(void)
6747 {
6748     PyUnicodeObject *u;
6749     int i;
6750
6751     Py_XDECREF(unicode_empty);
6752     unicode_empty = NULL;
6753
6754     for (i = 0; i < 256; i++) {
6755         if (unicode_latin1[i]) {
6756             Py_DECREF(unicode_latin1[i]);
6757             unicode_latin1[i] = NULL;
6758         }
6759     }
6760
6761     for (u = unicode_freelist; u != NULL;) {
6762         PyUnicodeObject *v = u;
6763         u = *(PyUnicodeObject **)u;
6764         if (v->str)
6765             PyMem_DEL(v->str);
6766         Py_XDECREF(v->defenc);
6767         PyObject_Del(v);
6768     }
6769     unicode_freelist = NULL;
6770     unicode_freelist_size = 0;
6771 }