Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WIN32
  45 #include <windows.h>
  46 #endif
  47
  48 /* Limit for the Unicode object free list */
  49
  50 #define MAX_UNICODE_FREELIST_SIZE       1024
  51
  52 /* Limit for the Unicode object free list stay alive optimization.
  53
  54    The implementation will keep allocated Unicode memory intact for
  55    all objects on the free list having a size less than this
  56    limit. This reduces malloc() overhead for small Unicode objects.
  57
  58    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  59    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  60    malloc()-overhead) bytes of unused garbage.
  61
  62    Setting the limit to 0 effectively turns the feature off.
  63
  64    Note: This is an experimental feature ! If you get core dumps when
  65    using Unicode objects, turn this feature off.
  66
  67 */
  68
  69 #define KEEPALIVE_SIZE_LIMIT       9
  70
  71 /* Endianness switches; defaults to little endian */
  72
  73 #ifdef WORDS_BIGENDIAN
  74 # define BYTEORDER_IS_BIG_ENDIAN
  75 #else
  76 # define BYTEORDER_IS_LITTLE_ENDIAN
  77 #endif
  78
  79 /* --- Globals ------------------------------------------------------------
  80
  81    The globals are initialized by the _PyUnicode_Init() API and should
  82    not be used before calling that API.
  83
  84 */
  85
  86 /* Free list for Unicode objects */
  87 static PyUnicodeObject *unicode_freelist;
  88 static int unicode_freelist_size;
  89
  90 /* The empty Unicode object is shared to improve performance. */
  91 static PyUnicodeObject *unicode_empty;
  92
  93 /* Single character Unicode strings in the Latin-1 range are being
  94    shared as well. */
  95 static PyUnicodeObject *unicode_latin1[256];
  96
  97 /* Default encoding to use and assume when NULL is passed as encoding
  98    parameter; it is initialized by _PyUnicode_Init().
  99
 100    Always use the PyUnicode_SetDefaultEncoding() and
 101    PyUnicode_GetDefaultEncoding() APIs to access this global.
 102
 103 */
 104 static char unicode_default_encoding[100];
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax(void)
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          unicode->str[0] < 256 &&
 136          unicode_latin1[unicode->str[0]] == unicode)) {
 137         PyErr_SetString(PyExc_SystemError,
 138                         "can't resize shared unicode objects");
 139         return -1;
 140     }
 141
 142     /* We allocate one more byte to make sure the string is
 143        Ux0000 terminated -- XXX is this needed ? */
 144     oldstr = unicode->str;
 145     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 146     if (!unicode->str) {
 147         unicode->str = oldstr;
 148         PyErr_NoMemory();
 149         return -1;
 150     }
 151     unicode->str[length] = 0;
 152     unicode->length = length;
 153
 154  reset:
 155     /* Reset the object caches */
 156     if (unicode->defenc) {
 157         Py_DECREF(unicode->defenc);
 158         unicode->defenc = NULL;
 159     }
 160     unicode->hash = -1;
 161
 162     return 0;
 163 }
 164
/* We allocate one extra Py_UNICODE element (not just one byte) so that
   the string is always U+0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
       free list never shrinks below a size of 1.

*/
 172
 173 static
 174 PyUnicodeObject *_PyUnicode_New(int length)
 175 {
 176     register PyUnicodeObject *unicode;
 177
 178     /* Optimization for empty strings */
 179     if (length == 0 && unicode_empty != NULL) {
 180         Py_INCREF(unicode_empty);
 181         return unicode_empty;
 182     }
 183
 184     /* Unicode freelist & memory allocation */
 185     if (unicode_freelist) {
 186         unicode = unicode_freelist;
 187         unicode_freelist = *(PyUnicodeObject **)unicode;
 188         unicode_freelist_size--;
 189         if (unicode->str) {
 190             /* Keep-Alive optimization: we only upsize the buffer,
 191                never downsize it. */
 192             if ((unicode->length < length) &&
 193                 unicode_resize(unicode, length)) {
 194                 PyMem_DEL(unicode->str);
 195                 goto onError;
 196             }
 197         }
 198         else {
 199             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 200         }
 201         PyObject_INIT(unicode, &PyUnicode_Type);
 202     }
 203     else {
 204         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 205         if (unicode == NULL)
 206             return NULL;
 207         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 208     }
 209
 210     if (!unicode->str) {
 211         PyErr_NoMemory();
 212         goto onError;
 213     }
 214     unicode->str[length] = 0;
 215     unicode->length = length;
 216     unicode->hash = -1;
 217     unicode->defenc = NULL;
 218     return unicode;
 219
 220  onError:
 221     _Py_ForgetReference((PyObject *)unicode);
 222     PyObject_DEL(unicode);
 223     return NULL;
 224 }
 225
 226 static
 227 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 228 {
 229     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 230         /* Keep-Alive optimization */
 231         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 232             PyMem_DEL(unicode->str);
 233             unicode->str = NULL;
 234             unicode->length = 0;
 235         }
 236         if (unicode->defenc) {
 237             Py_DECREF(unicode->defenc);
 238             unicode->defenc = NULL;
 239         }
 240         /* Add to free list */
 241         *(PyUnicodeObject **)unicode = unicode_freelist;
 242         unicode_freelist = unicode;
 243         unicode_freelist_size++;
 244     }
 245     else {
 246         PyMem_DEL(unicode->str);
 247         Py_XDECREF(unicode->defenc);
 248         PyObject_DEL(unicode);
 249     }
 250 }
 251
 252 int PyUnicode_Resize(PyObject **unicode,
 253                      int length)
 254 {
 255     register PyUnicodeObject *v;
 256
 257     /* Argument checks */
 258     if (unicode == NULL) {
 259         PyErr_BadInternalCall();
 260         return -1;
 261     }
 262     v = (PyUnicodeObject *)*unicode;
 263     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 264         PyErr_BadInternalCall();
 265         return -1;
 266     }
 267
 268     /* Resizing unicode_empty and single character objects is not
 269        possible since these are being shared. We simply return a fresh
 270        copy with the same Unicode content. */
 271     if (v->length != length &&
 272         (v == unicode_empty || v->length == 1)) {
 273         PyUnicodeObject *w = _PyUnicode_New(length);
 274         if (w == NULL)
 275             return -1;
 276         Py_UNICODE_COPY(w->str, v->str,
 277                         length < v->length ? length : v->length);
 278         *unicode = (PyObject *)w;
 279         return 0;
 280     }
 281
 282     /* Note that we don't have to modify *unicode for unshared Unicode
 283        objects, since we can modify them in-place. */
 284     return unicode_resize(v, length);
 285 }
 286
 287 /* Internal API for use in unicodeobject.c only ! */
 288 #define _PyUnicode_Resize(unicodevar, length) \
 289         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 290
 291 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 292                                 int size)
 293 {
 294     PyUnicodeObject *unicode;
 295
 296     /* If the Unicode data is known at construction time, we can apply
 297        some optimizations which share commonly used objects. */
 298     if (u != NULL) {
 299
 300         /* Optimization for empty strings */
 301         if (size == 0 && unicode_empty != NULL) {
 302             Py_INCREF(unicode_empty);
 303             return (PyObject *)unicode_empty;
 304         }
 305
 306         /* Single character Unicode objects in the Latin-1 range are
 307            shared when using this constructor */
 308         if (size == 1 && *u < 256) {
 309             unicode = unicode_latin1[*u];
 310             if (!unicode) {
 311                 unicode = _PyUnicode_New(1);
 312                 if (!unicode)
 313                     return NULL;
 314                 unicode->str[0] = *u;
 315                 unicode_latin1[*u] = unicode;
 316             }
 317             Py_INCREF(unicode);
 318             return (PyObject *)unicode;
 319         }
 320     }
 321
 322     unicode = _PyUnicode_New(size);
 323     if (!unicode)
 324         return NULL;
 325
 326     /* Copy the Unicode data into the new object */
 327     if (u != NULL)
 328         Py_UNICODE_COPY(unicode->str, u, size);
 329
 330     return (PyObject *)unicode;
 331 }
 332
 333 #ifdef HAVE_WCHAR_H
 334
 335 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 336                                  int size)
 337 {
 338     PyUnicodeObject *unicode;
 339
 340     if (w == NULL) {
 341         PyErr_BadInternalCall();
 342         return NULL;
 343     }
 344
 345     unicode = _PyUnicode_New(size);
 346     if (!unicode)
 347         return NULL;
 348
 349     /* Copy the wchar_t data into the new object */
 350 #ifdef HAVE_USABLE_WCHAR_T
 351     memcpy(unicode->str, w, size * sizeof(wchar_t));
 352 #else
 353     {
 354         register Py_UNICODE *u;
 355         register int i;
 356         u = PyUnicode_AS_UNICODE(unicode);
 357         for (i = size; i >= 0; i--)
 358             *u++ = *w++;
 359     }
 360 #endif
 361
 362     return (PyObject *)unicode;
 363 }
 364
 365 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 366                          register wchar_t *w,
 367                          int size)
 368 {
 369     if (unicode == NULL) {
 370         PyErr_BadInternalCall();
 371         return -1;
 372     }
 373     if (size > PyUnicode_GET_SIZE(unicode))
 374         size = PyUnicode_GET_SIZE(unicode);
 375 #ifdef HAVE_USABLE_WCHAR_T
 376     memcpy(w, unicode->str, size * sizeof(wchar_t));
 377 #else
 378     {
 379         register Py_UNICODE *u;
 380         register int i;
 381         u = PyUnicode_AS_UNICODE(unicode);
 382         for (i = size; i >= 0; i--)
 383             *w++ = *u++;
 384     }
 385 #endif
 386
 387     return size;
 388 }
 389
 390 #endif
 391
 392 PyObject *PyUnicode_FromObject(register PyObject *obj)
 393 {
 394     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 395 }
 396
 397 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 398                                       const char *encoding,
 399                                       const char *errors)
 400 {
 401     const char *s;
 402     int len;
 403     int owned = 0;
 404     PyObject *v;
 405
 406     if (obj == NULL) {
 407         PyErr_BadInternalCall();
 408         return NULL;
 409     }
 410
 411     /* Coerce object */
 412     if (PyInstance_Check(obj)) {
 413         PyObject *func;
 414         func = PyObject_GetAttrString(obj, "__str__");
 415         if (func == NULL) {
 416             PyErr_SetString(PyExc_TypeError,
 417                   "coercing to Unicode: instance doesn't define __str__");
 418             return NULL;
 419         }
 420         obj = PyEval_CallObject(func, NULL);
 421         Py_DECREF(func);
 422         if (obj == NULL)
 423             return NULL;
 424         owned = 1;
 425     }
 426     if (PyUnicode_Check(obj)) {
 427         Py_INCREF(obj);
 428         v = obj;
 429         if (encoding) {
 430             PyErr_SetString(PyExc_TypeError,
 431                             "decoding Unicode is not supported");
 432             return NULL;
 433         }
 434         goto done;
 435     }
 436     else if (PyString_Check(obj)) {
 437         s = PyString_AS_STRING(obj);
 438         len = PyString_GET_SIZE(obj);
 439     }
 440     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 441         /* Overwrite the error message with something more useful in
 442            case of a TypeError. */
 443         if (PyErr_ExceptionMatches(PyExc_TypeError))
 444             PyErr_Format(PyExc_TypeError,
 445                          "coercing to Unicode: need string or buffer, "
 446                          "%.80s found",
 447                          obj->ob_type->tp_name);
 448         goto onError;
 449     }
 450
 451     /* Convert to Unicode */
 452     if (len == 0) {
 453         Py_INCREF(unicode_empty);
 454         v = (PyObject *)unicode_empty;
 455     }
 456     else
 457         v = PyUnicode_Decode(s, len, encoding, errors);
 458
 459  done:
 460     if (owned) {
 461         Py_DECREF(obj);
 462     }
 463     return v;
 464
 465  onError:
 466     if (owned) {
 467         Py_DECREF(obj);
 468     }
 469     return NULL;
 470 }
 471
 472 PyObject *PyUnicode_Decode(const char *s,
 473                            int size,
 474                            const char *encoding,
 475                            const char *errors)
 476 {
 477     PyObject *buffer = NULL, *unicode;
 478
 479     if (encoding == NULL)
 480         encoding = PyUnicode_GetDefaultEncoding();
 481
 482     /* Shortcuts for common default encodings */
 483     if (strcmp(encoding, "utf-8") == 0)
 484         return PyUnicode_DecodeUTF8(s, size, errors);
 485     else if (strcmp(encoding, "latin-1") == 0)
 486         return PyUnicode_DecodeLatin1(s, size, errors);
 487     else if (strcmp(encoding, "ascii") == 0)
 488         return PyUnicode_DecodeASCII(s, size, errors);
 489
 490     /* Decode via the codec registry */
 491     buffer = PyBuffer_FromMemory((void *)s, size);
 492     if (buffer == NULL)
 493         goto onError;
 494     unicode = PyCodec_Decode(buffer, encoding, errors);
 495     if (unicode == NULL)
 496         goto onError;
 497     if (!PyUnicode_Check(unicode)) {
 498         PyErr_Format(PyExc_TypeError,
 499                      "decoder did not return an unicode object (type=%.400s)",
 500                      unicode->ob_type->tp_name);
 501         Py_DECREF(unicode);
 502         goto onError;
 503     }
 504     Py_DECREF(buffer);
 505     return unicode;
 506
 507  onError:
 508     Py_XDECREF(buffer);
 509     return NULL;
 510 }
 511
 512 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 513                            int size,
 514                            const char *encoding,
 515                            const char *errors)
 516 {
 517     PyObject *v, *unicode;
 518
 519     unicode = PyUnicode_FromUnicode(s, size);
 520     if (unicode == NULL)
 521         return NULL;
 522     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 523     Py_DECREF(unicode);
 524     return v;
 525 }
 526
 527 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 528                                     const char *encoding,
 529                                     const char *errors)
 530 {
 531     PyObject *v;
 532
 533     if (!PyUnicode_Check(unicode)) {
 534         PyErr_BadArgument();
 535         goto onError;
 536     }
 537
 538     if (encoding == NULL)
 539         encoding = PyUnicode_GetDefaultEncoding();
 540
 541     /* Shortcuts for common default encodings */
 542     if (errors == NULL) {
 543         if (strcmp(encoding, "utf-8") == 0)
 544             return PyUnicode_AsUTF8String(unicode);
 545         else if (strcmp(encoding, "latin-1") == 0)
 546             return PyUnicode_AsLatin1String(unicode);
 547         else if (strcmp(encoding, "ascii") == 0)
 548             return PyUnicode_AsASCIIString(unicode);
 549     }
 550
 551     /* Encode via the codec registry */
 552     v = PyCodec_Encode(unicode, encoding, errors);
 553     if (v == NULL)
 554         goto onError;
 555     /* XXX Should we really enforce this ? */
 556     if (!PyString_Check(v)) {
 557         PyErr_Format(PyExc_TypeError,
 558                      "encoder did not return a string object (type=%.400s)",
 559                      v->ob_type->tp_name);
 560         Py_DECREF(v);
 561         goto onError;
 562     }
 563     return v;
 564
 565  onError:
 566     return NULL;
 567 }
 568
 569 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 570                                             const char *errors)
 571 {
 572     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 573
 574     if (v)
 575         return v;
 576     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 577     if (v && errors == NULL)
 578         ((PyUnicodeObject *)unicode)->defenc = v;
 579     return v;
 580 }
 581
 582 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 583 {
 584     if (!PyUnicode_Check(unicode)) {
 585         PyErr_BadArgument();
 586         goto onError;
 587     }
 588     return PyUnicode_AS_UNICODE(unicode);
 589
 590  onError:
 591     return NULL;
 592 }
 593
 594 int PyUnicode_GetSize(PyObject *unicode)
 595 {
 596     if (!PyUnicode_Check(unicode)) {
 597         PyErr_BadArgument();
 598         goto onError;
 599     }
 600     return PyUnicode_GET_SIZE(unicode);
 601
 602  onError:
 603     return -1;
 604 }
 605
 606 const char *PyUnicode_GetDefaultEncoding(void)
 607 {
 608     return unicode_default_encoding;
 609 }
 610
 611 int PyUnicode_SetDefaultEncoding(const char *encoding)
 612 {
 613     PyObject *v;
 614
 615     /* Make sure the encoding is valid. As side effect, this also
 616        loads the encoding into the codec registry cache. */
 617     v = _PyCodec_Lookup(encoding);
 618     if (v == NULL)
 619         goto onError;
 620     Py_DECREF(v);
 621     strncpy(unicode_default_encoding,
 622             encoding,
 623             sizeof(unicode_default_encoding));
 624     return 0;
 625
 626  onError:
 627     return -1;
 628 }
 629
 630 /* --- UTF-8 Codec -------------------------------------------------------- */
 631
 632 static
 633 char utf8_code_length[256] = {
 634     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 635        illegal prefix.  see RFC 2279 for details */
 636     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 637     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 638     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 639     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 640     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 641     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 642     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 643     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 644     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 645     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 646     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 647     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 648     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 649     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 650     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 651     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 652 };
 653
 654 static
 655 int utf8_decoding_error(const char **source,
 656                         Py_UNICODE **dest,
 657                         const char *errors,
 658                         const char *details)
 659 {
 660     if ((errors == NULL) ||
 661         (strcmp(errors,"strict") == 0)) {
 662         PyErr_Format(PyExc_UnicodeError,
 663                      "UTF-8 decoding error: %.400s",
 664                      details);
 665         return -1;
 666     }
 667     else if (strcmp(errors,"ignore") == 0) {
 668         (*source)++;
 669         return 0;
 670     }
 671     else if (strcmp(errors,"replace") == 0) {
 672         (*source)++;
 673         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 674         (*dest)++;
 675         return 0;
 676     }
 677     else {
 678         PyErr_Format(PyExc_ValueError,
 679                      "UTF-8 decoding error; unknown error handling code: %.400s",
 680                      errors);
 681         return -1;
 682     }
 683 }
 684
 685 PyObject *PyUnicode_DecodeUTF8(const char *s,
 686                                int size,
 687                                const char *errors)
 688 {
 689     int n;
 690     const char *e;
 691     PyUnicodeObject *unicode;
 692     Py_UNICODE *p;
 693     const char *errmsg = "";
 694
 695     /* Note: size will always be longer than the resulting Unicode
 696        character count */
 697     unicode = _PyUnicode_New(size);
 698     if (!unicode)
 699         return NULL;
 700     if (size == 0)
 701         return (PyObject *)unicode;
 702
 703     /* Unpack UTF-8 encoded data */
 704     p = unicode->str;
 705     e = s + size;
 706
 707     while (s < e) {
 708         Py_UCS4 ch = (unsigned char)*s;
 709
 710         if (ch < 0x80) {
 711             *p++ = (Py_UNICODE)ch;
 712             s++;
 713             continue;
 714         }
 715
 716         n = utf8_code_length[ch];
 717
 718         if (s + n > e) {
 719             errmsg = "unexpected end of data";
 720             goto utf8Error;
 721         }
 722
 723         switch (n) {
 724
 725         case 0:
 726             errmsg = "unexpected code byte";
 727             goto utf8Error;
 728
 729         case 1:
 730             errmsg = "internal error";
 731             goto utf8Error;
 732
 733         case 2:
 734             if ((s[1] & 0xc0) != 0x80) {
 735                 errmsg = "invalid data";
 736                 goto utf8Error;
 737             }
 738             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
 739             if (ch < 0x80) {
 740                 errmsg = "illegal encoding";
 741                 goto utf8Error;
 742             }
 743             else
 744                 *p++ = (Py_UNICODE)ch;
 745             break;
 746
 747         case 3:
 748             if ((s[1] & 0xc0) != 0x80 ||
 749                 (s[2] & 0xc0) != 0x80) {
 750                 errmsg = "invalid data";
 751                 goto utf8Error;
 752             }
 753             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
 754             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
 755                 errmsg = "illegal encoding";
 756                 goto utf8Error;
 757             }
 758             else
 759                                 *p++ = (Py_UNICODE)ch;
 760             break;
 761
 762         case 4:
 763             if ((s[1] & 0xc0) != 0x80 ||
 764                 (s[2] & 0xc0) != 0x80 ||
 765                 (s[3] & 0xc0) != 0x80) {
 766                 errmsg = "invalid data";
 767                 goto utf8Error;
 768             }
 769             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
 770                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
 771             /* validate and convert to UTF-16 */
 772             if ((ch < 0x10000)        /* minimum value allowed for 4
 773                                        byte encoding */
 774                 || (ch > 0x10ffff))   /* maximum value allowed for
 775                                        UTF-16 */
 776             {
 777                 errmsg = "illegal encoding";
 778                 goto utf8Error;
 779             }
 780 #ifdef Py_UNICODE_WIDE
 781             *p++ = (Py_UNICODE)ch;
 782 #else
 783             /*  compute and append the two surrogates: */
 784
 785             /*  translate from 10000..10FFFF to 0..FFFF */
 786             ch -= 0x10000;
 787
 788             /*  high surrogate = top 10 bits added to D800 */
 789             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
 790
 791             /*  low surrogate = bottom 10 bits added to DC00 */
 792             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
 793 #endif
 794             break;
 795
 796         default:
 797             /* Other sizes are only needed for UCS-4 */
 798             errmsg = "unsupported Unicode code range";
 799             goto utf8Error;
 800         }
 801         s += n;
 802         continue;
 803
 804     utf8Error:
 805       if (utf8_decoding_error(&s, &p, errors, errmsg))
 806           goto onError;
 807     }
 808
 809     /* Adjust length */
 810     if (_PyUnicode_Resize(&unicode, p - unicode->str))
 811         goto onError;
 812
 813     return (PyObject *)unicode;
 814
 815 onError:
 816     Py_DECREF(unicode);
 817     return NULL;
 818 }
 819
 820 /* Not used anymore, now that the encoder supports UTF-16
 821    surrogates. */
 822 #if 0
 823 static
 824 int utf8_encoding_error(const Py_UNICODE **source,
 825                         char **dest,
 826                         const char *errors,
 827                         const char *details)
 828 {
 829     if ((errors == NULL) ||
 830         (strcmp(errors,"strict") == 0)) {
 831         PyErr_Format(PyExc_UnicodeError,
 832                      "UTF-8 encoding error: %.400s",
 833                      details);
 834         return -1;
 835     }
 836     else if (strcmp(errors,"ignore") == 0) {
 837         return 0;
 838     }
 839     else if (strcmp(errors,"replace") == 0) {
 840         **dest = '?';
 841         (*dest)++;
 842         return 0;
 843     }
 844     else {
 845         PyErr_Format(PyExc_ValueError,
 846                      "UTF-8 encoding error; "
 847                      "unknown error handling code: %.400s",
 848                      errors);
 849         return -1;
 850     }
 851 }
 852 #endif
 853
 854 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 855                                int size,
 856                                const char *errors)
 857 {
 858     PyObject *v;
 859     char *p;
 860     char *q;
 861     Py_UCS4 ch2;
 862     unsigned int cbAllocated = 3 * size;
 863     unsigned int cbWritten = 0;
 864     int i = 0;
 865
 866     v = PyString_FromStringAndSize(NULL, cbAllocated);
 867     if (v == NULL)
 868         return NULL;
 869     if (size == 0)
 870         return v;
 871
 872     p = q = PyString_AS_STRING(v);
 873     while (i < size) {
 874         Py_UCS4 ch = s[i++];
 875         if (ch < 0x80) {
 876             *p++ = (char) ch;
 877             cbWritten++;
 878         }
 879         else if (ch < 0x0800) {
 880             *p++ = 0xc0 | (ch >> 6);
 881             *p++ = 0x80 | (ch & 0x3f);
 882             cbWritten += 2;
 883         }
 884         else if (ch < 0x10000) {
 885             /* Check for high surrogate */
 886             if (0xD800 <= ch && ch <= 0xDBFF) {
 887                 if (i != size) {
 888                     ch2 = s[i];
 889                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 890
 891                         if (cbWritten >= (cbAllocated - 4)) {
 892                             /* Provide enough room for some more
 893                                surrogates */
 894                             cbAllocated += 4*10;
 895                             if (_PyString_Resize(&v, cbAllocated))
 896                                 goto onError;
 897                         }
 898
 899                         /* combine the two values */
 900                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
 901
 902                         *p++ = (char)((ch >> 18) | 0xf0);
 903                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
 904                         i++;
 905                         cbWritten += 4;
 906                     }
 907                 }
 908             }
 909             else {
 910                 *p++ = (char)(0xe0 | (ch >> 12));
 911                 cbWritten += 3;
 912             }
 913             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
 914             *p++ = (char)(0x80 | (ch & 0x3f));
 915         } else {
 916             *p++ = 0xf0 | (ch>>18);
 917             *p++ = 0x80 | ((ch>>12) & 0x3f);
 918             *p++ = 0x80 | ((ch>>6) & 0x3f);
 919             *p++ = 0x80 | (ch & 0x3f);
 920             cbWritten += 4;
 921         }
 922     }
 923     *p = '\0';
 924     if (_PyString_Resize(&v, p - q))
 925         goto onError;
 926     return v;
 927
 928  onError:
 929     Py_DECREF(v);
 930     return NULL;
 931 }
 932
 933 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 934 {
 935     if (!PyUnicode_Check(unicode)) {
 936         PyErr_BadArgument();
 937         return NULL;
 938     }
 939     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
 940                                 PyUnicode_GET_SIZE(unicode),
 941                                 NULL);
 942 }
 943
 944 /* --- UTF-16 Codec ------------------------------------------------------- */
 945
 946 static
 947 int utf16_decoding_error(Py_UNICODE **dest,
 948                          const char *errors,
 949                          const char *details)
 950 {
 951     if ((errors == NULL) ||
 952         (strcmp(errors,"strict") == 0)) {
 953         PyErr_Format(PyExc_UnicodeError,
 954                      "UTF-16 decoding error: %.400s",
 955                      details);
 956         return -1;
 957     }
 958     else if (strcmp(errors,"ignore") == 0) {
 959         return 0;
 960     }
 961     else if (strcmp(errors,"replace") == 0) {
 962         if (dest) {
 963             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 964             (*dest)++;
 965         }
 966         return 0;
 967     }
 968     else {
 969         PyErr_Format(PyExc_ValueError,
 970                      "UTF-16 decoding error; "
 971                      "unknown error handling code: %.400s",
 972                      errors);
 973         return -1;
 974     }
 975 }
 976
 977 PyObject *
 978 PyUnicode_DecodeUTF16(const char *s,
 979                       int size,
 980                       const char *errors,
 981                       int *byteorder)
 982 {
 983     PyUnicodeObject *unicode;
 984     Py_UNICODE *p;
 985     const unsigned char *q, *e;
 986     int bo = 0;       /* assume native ordering by default */
 987     const char *errmsg = "";
 988     /* Offsets from q for retrieving byte pairs in the right order. */
 989 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
 990     int ihi = 1, ilo = 0;
 991 #else
 992     int ihi = 0, ilo = 1;
 993 #endif
 994
 995     /* size should be an even number */
 996     if (size & 1) {
 997         if (utf16_decoding_error(NULL, errors, "truncated data"))
 998             return NULL;
 999         --size;  /* else ignore the oddball byte */
1000     }
1001
1002     /* Note: size will always be longer than the resulting Unicode
1003        character count */
1004     unicode = _PyUnicode_New(size);
1005     if (!unicode)
1006         return NULL;
1007     if (size == 0)
1008         return (PyObject *)unicode;
1009
1010     /* Unpack UTF-16 encoded data */
1011     p = unicode->str;
1012     q = (unsigned char *)s;
1013     e = q + size;
1014
1015     if (byteorder)
1016         bo = *byteorder;
1017
1018     /* Check for BOM marks (U+FEFF) in the input and adjust current
1019        byte order setting accordingly. In native mode, the leading BOM
1020        mark is skipped, in all other modes, it is copied to the output
1021        stream as-is (giving a ZWNBSP character). */
1022     if (bo == 0) {
1023         const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1024 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1025         if (bom == 0xFEFF) {
1026             q += 2;
1027             bo = -1;
1028         }
1029         else if (bom == 0xFFFE) {
1030             q += 2;
1031             bo = 1;
1032         }
1033 #else
1034         if (bom == 0xFEFF) {
1035             q += 2;
1036             bo = 1;
1037         }
1038         else if (bom == 0xFFFE) {
1039             q += 2;
1040             bo = -1;
1041         }
1042 #endif
1043     }
1044
1045     if (bo == -1) {
1046         /* force LE */
1047         ihi = 1;
1048         ilo = 0;
1049     }
1050     else if (bo == 1) {
1051         /* force BE */
1052         ihi = 0;
1053         ilo = 1;
1054     }
1055
1056     while (q < e) {
1057         Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1058         q += 2;
1059
1060         if (ch < 0xD800 || ch > 0xDFFF) {
1061             *p++ = ch;
1062             continue;
1063         }
1064
1065         /* UTF-16 code pair: */
1066         if (q >= e) {
1067             errmsg = "unexpected end of data";
1068             goto utf16Error;
1069         }
1070         if (0xD800 <= ch && ch <= 0xDBFF) {
1071             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1072             q += 2;
1073             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1074 #ifndef Py_UNICODE_WIDE
1075                 *p++ = ch;
1076                 *p++ = ch2;
1077 #else
1078                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1079 #endif
1080                 continue;
1081             }
1082             else {
1083                 errmsg = "illegal UTF-16 surrogate";
1084                 goto utf16Error;
1085             }
1086
1087         }
1088         errmsg = "illegal encoding";
1089         /* Fall through to report the error */
1090
1091     utf16Error:
1092         if (utf16_decoding_error(&p, errors, errmsg))
1093             goto onError;
1094     }
1095
1096     if (byteorder)
1097         *byteorder = bo;
1098
1099     /* Adjust length */
1100     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1101         goto onError;
1102
1103     return (PyObject *)unicode;
1104
1105 onError:
1106     Py_DECREF(unicode);
1107     return NULL;
1108 }
1109
1110 PyObject *
1111 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1112                       int size,
1113                       const char *errors,
1114                       int byteorder)
1115 {
1116     PyObject *v;
1117     unsigned char *p;
1118     int i, pairs;
1119     /* Offsets from p for storing byte pairs in the right order. */
1120 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1121     int ihi = 1, ilo = 0;
1122 #else
1123     int ihi = 0, ilo = 1;
1124 #endif
1125
1126 #define STORECHAR(CH)                   \
1127     do {                                \
1128         p[ihi] = ((CH) >> 8) & 0xff;    \
1129         p[ilo] = (CH) & 0xff;           \
1130         p += 2;                         \
1131     } while(0)
1132
1133     for (i = pairs = 0; i < size; i++)
1134         if (s[i] >= 0x10000)
1135             pairs++;
1136     v = PyString_FromStringAndSize(NULL,
1137                   2 * (size + pairs + (byteorder == 0)));
1138     if (v == NULL)
1139         return NULL;
1140
1141     p = (unsigned char *)PyString_AS_STRING(v);
1142     if (byteorder == 0)
1143         STORECHAR(0xFEFF);
1144     if (size == 0)
1145         return v;
1146
1147     if (byteorder == -1) {
1148         /* force LE */
1149         ihi = 1;
1150         ilo = 0;
1151     }
1152     else if (byteorder == 1) {
1153         /* force BE */
1154         ihi = 0;
1155         ilo = 1;
1156     }
1157
1158     while (size-- > 0) {
1159         Py_UNICODE ch = *s++;
1160         Py_UNICODE ch2 = 0;
1161         if (ch >= 0x10000) {
1162             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1163             ch  = 0xD800 | ((ch-0x10000) >> 10);
1164         }
1165         STORECHAR(ch);
1166         if (ch2)
1167             STORECHAR(ch2);
1168     }
1169     return v;
1170 #undef STORECHAR
1171 }
1172
1173 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1174 {
1175     if (!PyUnicode_Check(unicode)) {
1176         PyErr_BadArgument();
1177         return NULL;
1178     }
1179     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1180                                  PyUnicode_GET_SIZE(unicode),
1181                                  NULL,
1182                                  0);
1183 }
1184
1185 /* --- Unicode Escape Codec ----------------------------------------------- */
1186
1187 static
1188 int unicodeescape_decoding_error(const char **source,
1189                                  Py_UNICODE *x,
1190                                  const char *errors,
1191                                  const char *details)
1192 {
1193     if ((errors == NULL) ||
1194         (strcmp(errors,"strict") == 0)) {
1195         PyErr_Format(PyExc_UnicodeError,
1196                      "Unicode-Escape decoding error: %.400s",
1197                      details);
1198         return -1;
1199     }
1200     else if (strcmp(errors,"ignore") == 0) {
1201         return 0;
1202     }
1203     else if (strcmp(errors,"replace") == 0) {
1204         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1205         return 0;
1206     }
1207     else {
1208         PyErr_Format(PyExc_ValueError,
1209                      "Unicode-Escape decoding error; "
1210                      "unknown error handling code: %.400s",
1211                      errors);
1212         return -1;
1213     }
1214 }
1215
1216 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1217
1218 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1219                                         int size,
1220                                         const char *errors)
1221 {
1222     PyUnicodeObject *v;
1223     Py_UNICODE *p, *buf;
1224     const char *end;
1225     char* message;
1226     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1227
1228     /* Escaped strings will always be longer than the resulting
1229        Unicode string, so we start with size here and then reduce the
1230        length after conversion to the true value. */
1231     v = _PyUnicode_New(size);
1232     if (v == NULL)
1233         goto onError;
1234     if (size == 0)
1235         return (PyObject *)v;
1236
1237     p = buf = PyUnicode_AS_UNICODE(v);
1238     end = s + size;
1239
1240     while (s < end) {
1241         unsigned char c;
1242         Py_UNICODE x;
1243         int i, digits;
1244
1245         /* Non-escape characters are interpreted as Unicode ordinals */
1246         if (*s != '\\') {
1247             *p++ = (unsigned char) *s++;
1248             continue;
1249         }
1250
1251         /* \ - Escapes */
1252         s++;
1253         switch (*s++) {
1254
1255         /* \x escapes */
1256         case '\n': break;
1257         case '\\': *p++ = '\\'; break;
1258         case '\'': *p++ = '\''; break;
1259         case '\"': *p++ = '\"'; break;
1260         case 'b': *p++ = '\b'; break;
1261         case 'f': *p++ = '\014'; break; /* FF */
1262         case 't': *p++ = '\t'; break;
1263         case 'n': *p++ = '\n'; break;
1264         case 'r': *p++ = '\r'; break;
1265         case 'v': *p++ = '\013'; break; /* VT */
1266         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1267
1268         /* \OOO (octal) escapes */
1269         case '0': case '1': case '2': case '3':
1270         case '4': case '5': case '6': case '7':
1271             x = s[-1] - '0';
1272             if ('0' <= *s && *s <= '7') {
1273                 x = (x<<3) + *s++ - '0';
1274                 if ('0' <= *s && *s <= '7')
1275                     x = (x<<3) + *s++ - '0';
1276             }
1277             *p++ = x;
1278             break;
1279
1280         /* hex escapes */
1281         /* \xXX */
1282         case 'x':
1283             digits = 2;
1284             message = "truncated \\xXX escape";
1285             goto hexescape;
1286
1287         /* \uXXXX */
1288         case 'u':
1289             digits = 4;
1290             message = "truncated \\uXXXX escape";
1291             goto hexescape;
1292
1293         /* \UXXXXXXXX */
1294         case 'U':
1295             digits = 8;
1296             message = "truncated \\UXXXXXXXX escape";
1297         hexescape:
1298             chr = 0;
1299             for (i = 0; i < digits; i++) {
1300                 c = (unsigned char) s[i];
1301                 if (!isxdigit(c)) {
1302                     if (unicodeescape_decoding_error(&s, &x, errors, message))
1303                         goto onError;
1304                     chr = x;
1305                     i++;
1306                     break;
1307                 }
1308                 chr = (chr<<4) & ~0xF;
1309                 if (c >= '0' && c <= '9')
1310                     chr += c - '0';
1311                 else if (c >= 'a' && c <= 'f')
1312                     chr += 10 + c - 'a';
1313                 else
1314                     chr += 10 + c - 'A';
1315             }
1316             s += i;
1317         store:
1318             /* when we get here, chr is a 32-bit unicode character */
1319             if (chr <= 0xffff)
1320                 /* UCS-2 character */
1321                 *p++ = (Py_UNICODE) chr;
1322             else if (chr <= 0x10ffff) {
1323                 /* UCS-4 character. Either store directly, or as
1324                    surrogate pair. */
1325 #ifdef Py_UNICODE_WIDE
1326                 *p++ = chr;
1327 #else
1328                 chr -= 0x10000L;
1329                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1330                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1331 #endif
1332             } else {
1333                 if (unicodeescape_decoding_error(
1334                     &s, &x, errors,
1335                     "illegal Unicode character")
1336                     )
1337                     goto onError;
1338                 *p++ = x; /* store replacement character */
1339             }
1340             break;
1341
1342         /* \N{name} */
1343         case 'N':
1344             message = "malformed \\N character escape";
1345             if (ucnhash_CAPI == NULL) {
1346                 /* load the unicode data module */
1347                 PyObject *m, *v;
1348                 m = PyImport_ImportModule("unicodedata");
1349                 if (m == NULL)
1350                     goto ucnhashError;
1351                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1352                 Py_DECREF(m);
1353                 if (v == NULL)
1354                     goto ucnhashError;
1355                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1356                 Py_DECREF(v);
1357                 if (ucnhash_CAPI == NULL)
1358                     goto ucnhashError;
1359             }
1360             if (*s == '{') {
1361                 const char *start = s+1;
1362                 /* look for the closing brace */
1363                 while (*s != '}' && s < end)
1364                     s++;
1365                 if (s > start && s < end && *s == '}') {
1366                     /* found a name.  look it up in the unicode database */
1367                     message = "unknown Unicode character name";
1368                     s++;
1369                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1370                         goto store;
1371                 }
1372             }
1373             if (unicodeescape_decoding_error(&s, &x, errors, message))
1374                 goto onError;
1375             *p++ = x;
1376             break;
1377
1378         default:
1379             *p++ = '\\';
1380             *p++ = (unsigned char)s[-1];
1381             break;
1382         }
1383     }
1384     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1385                 goto onError;
1386     return (PyObject *)v;
1387
1388 ucnhashError:
1389     PyErr_SetString(
1390         PyExc_UnicodeError,
1391         "\\N escapes not supported (can't load unicodedata module)"
1392         );
1393     return NULL;
1394
1395 onError:
1396     Py_XDECREF(v);
1397     return NULL;
1398 }
1399
1400 /* Return a Unicode-Escape string version of the Unicode object.
1401
1402    If quotes is true, the string is enclosed in u"" or u'' quotes as
1403    appropriate.
1404
1405 */
1406
1407 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1408                                   int size,
1409                                   Py_UNICODE ch);
1410
1411 static
1412 PyObject *unicodeescape_string(const Py_UNICODE *s,
1413                                int size,
1414                                int quotes)
1415 {
1416     PyObject *repr;
1417     char *p;
1418
1419     static const char *hexdigit = "0123456789abcdef";
1420
1421     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1422     if (repr == NULL)
1423         return NULL;
1424
1425     p = PyString_AS_STRING(repr);
1426
1427     if (quotes) {
1428         *p++ = 'u';
1429         *p++ = (findchar(s, size, '\'') &&
1430                 !findchar(s, size, '"')) ? '"' : '\'';
1431     }
1432     while (size-- > 0) {
1433         Py_UNICODE ch = *s++;
1434
1435         /* Escape quotes */
1436         if (quotes &&
1437             (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1438             *p++ = '\\';
1439             *p++ = (char) ch;
1440         }
1441
1442 #ifdef Py_UNICODE_WIDE
1443         /* Map 21-bit characters to '\U00xxxxxx' */
1444         else if (ch >= 0x10000) {
1445             int offset = p - PyString_AS_STRING(repr);
1446
1447             /* Resize the string if necessary */
1448             if (offset + 12 > PyString_GET_SIZE(repr)) {
1449                 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1450                     goto onError;
1451                 p = PyString_AS_STRING(repr) + offset;
1452             }
1453
1454             *p++ = '\\';
1455             *p++ = 'U';
1456             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1457             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1458             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1459             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1460             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1461             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1462             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1463             *p++ = hexdigit[ch & 0x0000000F];
1464             continue;
1465         }
1466 #endif
1467         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1468         else if (ch >= 0xD800 && ch < 0xDC00) {
1469             Py_UNICODE ch2;
1470             Py_UCS4 ucs;
1471
1472             ch2 = *s++;
1473             size--;
1474             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1475                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1476                 *p++ = '\\';
1477                 *p++ = 'U';
1478                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1479                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1480                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1481                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1482                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1483                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1484                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1485                 *p++ = hexdigit[ucs & 0x0000000F];
1486                 continue;
1487             }
1488             /* Fall through: isolated surrogates are copied as-is */
1489             s--;
1490             size++;
1491         }
1492
1493         /* Map 16-bit characters to '\uxxxx' */
1494         if (ch >= 256) {
1495             *p++ = '\\';
1496             *p++ = 'u';
1497             *p++ = hexdigit[(ch >> 12) & 0x000F];
1498             *p++ = hexdigit[(ch >> 8) & 0x000F];
1499             *p++ = hexdigit[(ch >> 4) & 0x000F];
1500             *p++ = hexdigit[ch & 0x000F];
1501         }
1502
1503         /* Map special whitespace to '\t', \n', '\r' */
1504         else if (ch == '\t') {
1505             *p++ = '\\';
1506             *p++ = 't';
1507         }
1508         else if (ch == '\n') {
1509             *p++ = '\\';
1510             *p++ = 'n';
1511         }
1512         else if (ch == '\r') {
1513             *p++ = '\\';
1514             *p++ = 'r';
1515         }
1516
1517         /* Map non-printable US ASCII to '\xhh' */
1518         else if (ch < ' ' || ch >= 128) {
1519             *p++ = '\\';
1520             *p++ = 'x';
1521             *p++ = hexdigit[(ch >> 4) & 0x000F];
1522             *p++ = hexdigit[ch & 0x000F];
1523         }
1524
1525         /* Copy everything else as-is */
1526         else
1527             *p++ = (char) ch;
1528     }
1529     if (quotes)
1530         *p++ = PyString_AS_STRING(repr)[1];
1531
1532     *p = '\0';
1533     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1534         goto onError;
1535
1536     return repr;
1537
1538  onError:
1539     Py_DECREF(repr);
1540     return NULL;
1541 }
1542
1543 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1544                                         int size)
1545 {
1546     return unicodeescape_string(s, size, 0);
1547 }
1548
1549 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1550 {
1551     if (!PyUnicode_Check(unicode)) {
1552         PyErr_BadArgument();
1553         return NULL;
1554     }
1555     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1556                                          PyUnicode_GET_SIZE(unicode));
1557 }
1558
1559 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1560
1561 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1562                                            int size,
1563                                            const char *errors)
1564 {
1565     PyUnicodeObject *v;
1566     Py_UNICODE *p, *buf;
1567     const char *end;
1568     const char *bs;
1569
1570     /* Escaped strings will always be longer than the resulting
1571        Unicode string, so we start with size here and then reduce the
1572        length after conversion to the true value. */
1573     v = _PyUnicode_New(size);
1574     if (v == NULL)
1575         goto onError;
1576     if (size == 0)
1577         return (PyObject *)v;
1578     p = buf = PyUnicode_AS_UNICODE(v);
1579     end = s + size;
1580     while (s < end) {
1581         unsigned char c;
1582         Py_UNICODE x;
1583         int i;
1584
1585         /* Non-escape characters are interpreted as Unicode ordinals */
1586         if (*s != '\\') {
1587             *p++ = (unsigned char)*s++;
1588             continue;
1589         }
1590
1591         /* \u-escapes are only interpreted iff the number of leading
1592            backslashes if odd */
1593         bs = s;
1594         for (;s < end;) {
1595             if (*s != '\\')
1596                 break;
1597             *p++ = (unsigned char)*s++;
1598         }
1599         if (((s - bs) & 1) == 0 ||
1600             s >= end ||
1601             *s != 'u') {
1602             continue;
1603         }
1604         p--;
1605         s++;
1606
1607         /* \uXXXX with 4 hex digits */
1608         for (x = 0, i = 0; i < 4; i++) {
1609             c = (unsigned char)s[i];
1610             if (!isxdigit(c)) {
1611                 if (unicodeescape_decoding_error(&s, &x, errors,
1612                                                  "truncated \\uXXXX"))
1613                     goto onError;
1614                 i++;
1615                 break;
1616             }
1617             x = (x<<4) & ~0xF;
1618             if (c >= '0' && c <= '9')
1619                 x += c - '0';
1620             else if (c >= 'a' && c <= 'f')
1621                 x += 10 + c - 'a';
1622             else
1623                 x += 10 + c - 'A';
1624         }
1625         s += i;
1626         *p++ = x;
1627     }
1628     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1629         goto onError;
1630     return (PyObject *)v;
1631
1632  onError:
1633     Py_XDECREF(v);
1634     return NULL;
1635 }
1636
1637 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1638                                            int size)
1639 {
1640     PyObject *repr;
1641     char *p;
1642     char *q;
1643
1644     static const char *hexdigit = "0123456789abcdef";
1645
1646     repr = PyString_FromStringAndSize(NULL, 6 * size);
1647     if (repr == NULL)
1648         return NULL;
1649     if (size == 0)
1650         return repr;
1651
1652     p = q = PyString_AS_STRING(repr);
1653     while (size-- > 0) {
1654         Py_UNICODE ch = *s++;
1655         /* Map 16-bit characters to '\uxxxx' */
1656         if (ch >= 256) {
1657             *p++ = '\\';
1658             *p++ = 'u';
1659             *p++ = hexdigit[(ch >> 12) & 0xf];
1660             *p++ = hexdigit[(ch >> 8) & 0xf];
1661             *p++ = hexdigit[(ch >> 4) & 0xf];
1662             *p++ = hexdigit[ch & 15];
1663         }
1664         /* Copy everything else as-is */
1665         else
1666             *p++ = (char) ch;
1667     }
1668     *p = '\0';
1669     if (_PyString_Resize(&repr, p - q))
1670         goto onError;
1671
1672     return repr;
1673
1674  onError:
1675     Py_DECREF(repr);
1676     return NULL;
1677 }
1678
1679 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1680 {
1681     if (!PyUnicode_Check(unicode)) {
1682         PyErr_BadArgument();
1683         return NULL;
1684     }
1685     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1686                                             PyUnicode_GET_SIZE(unicode));
1687 }
1688
1689 /* --- Latin-1 Codec ------------------------------------------------------ */
1690
1691 PyObject *PyUnicode_DecodeLatin1(const char *s,
1692                                  int size,
1693                                  const char *errors)
1694 {
1695     PyUnicodeObject *v;
1696     Py_UNICODE *p;
1697
1698     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1699     if (size == 1 && *(unsigned char*)s < 256) {
1700         Py_UNICODE r = *(unsigned char*)s;
1701         return PyUnicode_FromUnicode(&r, 1);
1702     }
1703
1704     v = _PyUnicode_New(size);
1705     if (v == NULL)
1706         goto onError;
1707     if (size == 0)
1708         return (PyObject *)v;
1709     p = PyUnicode_AS_UNICODE(v);
1710     while (size-- > 0)
1711         *p++ = (unsigned char)*s++;
1712     return (PyObject *)v;
1713
1714  onError:
1715     Py_XDECREF(v);
1716     return NULL;
1717 }
1718
1719 static
1720 int latin1_encoding_error(const Py_UNICODE **source,
1721                           char **dest,
1722                           const char *errors,
1723                           const char *details)
1724 {
1725     if ((errors == NULL) ||
1726         (strcmp(errors,"strict") == 0)) {
1727         PyErr_Format(PyExc_UnicodeError,
1728                      "Latin-1 encoding error: %.400s",
1729                      details);
1730         return -1;
1731     }
1732     else if (strcmp(errors,"ignore") == 0) {
1733         return 0;
1734     }
1735     else if (strcmp(errors,"replace") == 0) {
1736         **dest = '?';
1737         (*dest)++;
1738         return 0;
1739     }
1740     else {
1741         PyErr_Format(PyExc_ValueError,
1742                      "Latin-1 encoding error; "
1743                      "unknown error handling code: %.400s",
1744                      errors);
1745         return -1;
1746     }
1747 }
1748
1749 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1750                                  int size,
1751                                  const char *errors)
1752 {
1753     PyObject *repr;
1754     char *s, *start;
1755
1756     repr = PyString_FromStringAndSize(NULL, size);
1757     if (repr == NULL)
1758         return NULL;
1759     if (size == 0)
1760         return repr;
1761
1762     s = PyString_AS_STRING(repr);
1763     start = s;
1764     while (size-- > 0) {
1765         Py_UNICODE ch = *p++;
1766         if (ch >= 256) {
1767             if (latin1_encoding_error(&p, &s, errors,
1768                                       "ordinal not in range(256)"))
1769                 goto onError;
1770         }
1771         else
1772             *s++ = (char)ch;
1773     }
1774     /* Resize if error handling skipped some characters */
1775     if (s - start < PyString_GET_SIZE(repr))
1776         if (_PyString_Resize(&repr, s - start))
1777             goto onError;
1778     return repr;
1779
1780  onError:
1781     Py_DECREF(repr);
1782     return NULL;
1783 }
1784
1785 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1786 {
1787     if (!PyUnicode_Check(unicode)) {
1788         PyErr_BadArgument();
1789         return NULL;
1790     }
1791     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1792                                   PyUnicode_GET_SIZE(unicode),
1793                                   NULL);
1794 }
1795
1796 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1797
1798 static
1799 int ascii_decoding_error(const char **source,
1800                          Py_UNICODE **dest,
1801                          const char *errors,
1802                          const char *details)
1803 {
1804     if ((errors == NULL) ||
1805         (strcmp(errors,"strict") == 0)) {
1806         PyErr_Format(PyExc_UnicodeError,
1807                      "ASCII decoding error: %.400s",
1808                      details);
1809         return -1;
1810     }
1811     else if (strcmp(errors,"ignore") == 0) {
1812         return 0;
1813     }
1814     else if (strcmp(errors,"replace") == 0) {
1815         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1816         (*dest)++;
1817         return 0;
1818     }
1819     else {
1820         PyErr_Format(PyExc_ValueError,
1821                      "ASCII decoding error; "
1822                      "unknown error handling code: %.400s",
1823                      errors);
1824         return -1;
1825     }
1826 }
1827
1828 PyObject *PyUnicode_DecodeASCII(const char *s,
1829                                 int size,
1830                                 const char *errors)
1831 {
1832     PyUnicodeObject *v;
1833     Py_UNICODE *p;
1834
1835     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1836     if (size == 1 && *(unsigned char*)s < 128) {
1837         Py_UNICODE r = *(unsigned char*)s;
1838         return PyUnicode_FromUnicode(&r, 1);
1839     }
1840
1841     v = _PyUnicode_New(size);
1842     if (v == NULL)
1843         goto onError;
1844     if (size == 0)
1845         return (PyObject *)v;
1846     p = PyUnicode_AS_UNICODE(v);
1847     while (size-- > 0) {
1848         register unsigned char c;
1849
1850         c = (unsigned char)*s++;
1851         if (c < 128)
1852             *p++ = c;
1853         else if (ascii_decoding_error(&s, &p, errors,
1854                                       "ordinal not in range(128)"))
1855                 goto onError;
1856     }
1857     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1858         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1859             goto onError;
1860     return (PyObject *)v;
1861
1862  onError:
1863     Py_XDECREF(v);
1864     return NULL;
1865 }
1866
1867 static
1868 int ascii_encoding_error(const Py_UNICODE **source,
1869                          char **dest,
1870                          const char *errors,
1871                          const char *details)
1872 {
1873     if ((errors == NULL) ||
1874         (strcmp(errors,"strict") == 0)) {
1875         PyErr_Format(PyExc_UnicodeError,
1876                      "ASCII encoding error: %.400s",
1877                      details);
1878         return -1;
1879     }
1880     else if (strcmp(errors,"ignore") == 0) {
1881         return 0;
1882     }
1883     else if (strcmp(errors,"replace") == 0) {
1884         **dest = '?';
1885         (*dest)++;
1886         return 0;
1887     }
1888     else {
1889         PyErr_Format(PyExc_ValueError,
1890                      "ASCII encoding error; "
1891                      "unknown error handling code: %.400s",
1892                      errors);
1893         return -1;
1894     }
1895 }
1896
1897 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1898                                 int size,
1899                                 const char *errors)
1900 {
1901     PyObject *repr;
1902     char *s, *start;
1903
1904     repr = PyString_FromStringAndSize(NULL, size);
1905     if (repr == NULL)
1906         return NULL;
1907     if (size == 0)
1908         return repr;
1909
1910     s = PyString_AS_STRING(repr);
1911     start = s;
1912     while (size-- > 0) {
1913         Py_UNICODE ch = *p++;
1914         if (ch >= 128) {
1915             if (ascii_encoding_error(&p, &s, errors,
1916                                       "ordinal not in range(128)"))
1917                 goto onError;
1918         }
1919         else
1920             *s++ = (char)ch;
1921     }
1922     /* Resize if error handling skipped some characters */
1923     if (s - start < PyString_GET_SIZE(repr))
1924         if (_PyString_Resize(&repr, s - start))
1925             goto onError;
1926     return repr;
1927
1928  onError:
1929     Py_DECREF(repr);
1930     return NULL;
1931 }
1932
1933 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1934 {
1935     if (!PyUnicode_Check(unicode)) {
1936         PyErr_BadArgument();
1937         return NULL;
1938     }
1939     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1940                                  PyUnicode_GET_SIZE(unicode),
1941                                  NULL);
1942 }
1943
1944 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
1945
1946 /* --- MBCS codecs for Windows -------------------------------------------- */
1947
1948 PyObject *PyUnicode_DecodeMBCS(const char *s,
1949                                 int size,
1950                                 const char *errors)
1951 {
1952     PyUnicodeObject *v;
1953     Py_UNICODE *p;
1954
1955     /* First get the size of the result */
1956     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1957     if (size > 0 && usize==0)
1958         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1959
1960     v = _PyUnicode_New(usize);
1961     if (v == NULL)
1962         return NULL;
1963     if (usize == 0)
1964         return (PyObject *)v;
1965     p = PyUnicode_AS_UNICODE(v);
1966     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1967         Py_DECREF(v);
1968         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1969     }
1970
1971     return (PyObject *)v;
1972 }
1973
1974 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1975                                 int size,
1976                                 const char *errors)
1977 {
1978     PyObject *repr;
1979     char *s;
1980     DWORD mbcssize;
1981
1982     /* If there are no characters, bail now! */
1983     if (size==0)
1984             return PyString_FromString("");
1985
1986     /* First get the size of the result */
1987     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1988     if (mbcssize==0)
1989         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1990
1991     repr = PyString_FromStringAndSize(NULL, mbcssize);
1992     if (repr == NULL)
1993         return NULL;
1994     if (mbcssize == 0)
1995         return repr;
1996
1997     /* Do the conversion */
1998     s = PyString_AS_STRING(repr);
1999     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2000         Py_DECREF(repr);
2001         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2002     }
2003     return repr;
2004 }
2005
2006 #endif /* MS_WIN32 */
2007
2008 /* --- Character Mapping Codec -------------------------------------------- */
2009
2010 static
2011 int charmap_decoding_error(const char **source,
2012                          Py_UNICODE **dest,
2013                          const char *errors,
2014                          const char *details)
2015 {
2016     if ((errors == NULL) ||
2017         (strcmp(errors,"strict") == 0)) {
2018         PyErr_Format(PyExc_UnicodeError,
2019                      "charmap decoding error: %.400s",
2020                      details);
2021         return -1;
2022     }
2023     else if (strcmp(errors,"ignore") == 0) {
2024         return 0;
2025     }
2026     else if (strcmp(errors,"replace") == 0) {
2027         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2028         (*dest)++;
2029         return 0;
2030     }
2031     else {
2032         PyErr_Format(PyExc_ValueError,
2033                      "charmap decoding error; "
2034                      "unknown error handling code: %.400s",
2035                      errors);
2036         return -1;
2037     }
2038 }
2039
2040 PyObject *PyUnicode_DecodeCharmap(const char *s,
2041                                   int size,
2042                                   PyObject *mapping,
2043                                   const char *errors)
2044 {
2045     PyUnicodeObject *v;
2046     Py_UNICODE *p;
2047     int extrachars = 0;
2048
2049     /* Default to Latin-1 */
2050     if (mapping == NULL)
2051         return PyUnicode_DecodeLatin1(s, size, errors);
2052
2053     v = _PyUnicode_New(size);
2054     if (v == NULL)
2055         goto onError;
2056     if (size == 0)
2057         return (PyObject *)v;
2058     p = PyUnicode_AS_UNICODE(v);
2059     while (size-- > 0) {
2060         unsigned char ch = *s++;
2061         PyObject *w, *x;
2062
2063         /* Get mapping (char ordinal -> integer, Unicode char or None) */
2064         w = PyInt_FromLong((long)ch);
2065         if (w == NULL)
2066             goto onError;
2067         x = PyObject_GetItem(mapping, w);
2068         Py_DECREF(w);
2069         if (x == NULL) {
2070             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2071                 /* No mapping found means: mapping is undefined. */
2072                 PyErr_Clear();
2073                 x = Py_None;
2074                 Py_INCREF(x);
2075             } else
2076                 goto onError;
2077         }
2078
2079         /* Apply mapping */
2080         if (PyInt_Check(x)) {
2081             long value = PyInt_AS_LONG(x);
2082             if (value < 0 || value > 65535) {
2083                 PyErr_SetString(PyExc_TypeError,
2084                                 "character mapping must be in range(65536)");
2085                 Py_DECREF(x);
2086                 goto onError;
2087             }
2088             *p++ = (Py_UNICODE)value;
2089         }
2090         else if (x == Py_None) {
2091             /* undefined mapping */
2092             if (charmap_decoding_error(&s, &p, errors,
2093                                        "character maps to <undefined>")) {
2094                 Py_DECREF(x);
2095                 goto onError;
2096             }
2097         }
2098         else if (PyUnicode_Check(x)) {
2099             int targetsize = PyUnicode_GET_SIZE(x);
2100
2101             if (targetsize == 1)
2102                 /* 1-1 mapping */
2103                 *p++ = *PyUnicode_AS_UNICODE(x);
2104
2105             else if (targetsize > 1) {
2106                 /* 1-n mapping */
2107                 if (targetsize > extrachars) {
2108                     /* resize first */
2109                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2110                     int needed = (targetsize - extrachars) + \
2111                                  (targetsize << 2);
2112                     extrachars += needed;
2113                     if (_PyUnicode_Resize(&v,
2114                                          PyUnicode_GET_SIZE(v) + needed)) {
2115                         Py_DECREF(x);
2116                         goto onError;
2117                     }
2118                     p = PyUnicode_AS_UNICODE(v) + oldpos;
2119                 }
2120                 Py_UNICODE_COPY(p,
2121                                 PyUnicode_AS_UNICODE(x),
2122                                 targetsize);
2123                 p += targetsize;
2124                 extrachars -= targetsize;
2125             }
2126             /* 1-0 mapping: skip the character */
2127         }
2128         else {
2129             /* wrong return value */
2130             PyErr_SetString(PyExc_TypeError,
2131                   "character mapping must return integer, None or unicode");
2132             Py_DECREF(x);
2133             goto onError;
2134         }
2135         Py_DECREF(x);
2136     }
2137     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2138         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2139             goto onError;
2140     return (PyObject *)v;
2141
2142  onError:
2143     Py_XDECREF(v);
2144     return NULL;
2145 }
2146
2147 static
2148 int charmap_encoding_error(const Py_UNICODE **source,
2149                            char **dest,
2150                            const char *errors,
2151                            const char *details)
2152 {
2153     if ((errors == NULL) ||
2154         (strcmp(errors,"strict") == 0)) {
2155         PyErr_Format(PyExc_UnicodeError,
2156                      "charmap encoding error: %.400s",
2157                      details);
2158         return -1;
2159     }
2160     else if (strcmp(errors,"ignore") == 0) {
2161         return 0;
2162     }
2163     else if (strcmp(errors,"replace") == 0) {
2164         **dest = '?';
2165         (*dest)++;
2166         return 0;
2167     }
2168     else {
2169         PyErr_Format(PyExc_ValueError,
2170                      "charmap encoding error; "
2171                      "unknown error handling code: %.400s",
2172                      errors);
2173         return -1;
2174     }
2175 }
2176
2177 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2178                                   int size,
2179                                   PyObject *mapping,
2180                                   const char *errors)
2181 {
2182     PyObject *v;
2183     char *s;
2184     int extrachars = 0;
2185
2186     /* Default to Latin-1 */
2187     if (mapping == NULL)
2188         return PyUnicode_EncodeLatin1(p, size, errors);
2189
2190     v = PyString_FromStringAndSize(NULL, size);
2191     if (v == NULL)
2192         return NULL;
2193     if (size == 0)
2194         return v;
2195     s = PyString_AS_STRING(v);
2196     while (size-- > 0) {
2197         Py_UNICODE ch = *p++;
2198         PyObject *w, *x;
2199
2200         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2201         w = PyInt_FromLong((long)ch);
2202         if (w == NULL)
2203             goto onError;
2204         x = PyObject_GetItem(mapping, w);
2205         Py_DECREF(w);
2206         if (x == NULL) {
2207             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2208                 /* No mapping found means: mapping is undefined. */
2209                 PyErr_Clear();
2210                 x = Py_None;
2211                 Py_INCREF(x);
2212             } else
2213                 goto onError;
2214         }
2215
2216         /* Apply mapping */
2217         if (PyInt_Check(x)) {
2218             long value = PyInt_AS_LONG(x);
2219             if (value < 0 || value > 255) {
2220                 PyErr_SetString(PyExc_TypeError,
2221                                 "character mapping must be in range(256)");
2222                 Py_DECREF(x);
2223                 goto onError;
2224             }
2225             *s++ = (char)value;
2226         }
2227         else if (x == Py_None) {
2228             /* undefined mapping */
2229             if (charmap_encoding_error(&p, &s, errors,
2230                                        "character maps to <undefined>")) {
2231                 Py_DECREF(x);
2232                 goto onError;
2233             }
2234         }
2235         else if (PyString_Check(x)) {
2236             int targetsize = PyString_GET_SIZE(x);
2237
2238             if (targetsize == 1)
2239                 /* 1-1 mapping */
2240                 *s++ = *PyString_AS_STRING(x);
2241
2242             else if (targetsize > 1) {
2243                 /* 1-n mapping */
2244                 if (targetsize > extrachars) {
2245                     /* resize first */
2246                     int oldpos = (int)(s - PyString_AS_STRING(v));
2247                     int needed = (targetsize - extrachars) + \
2248                                  (targetsize << 2);
2249                     extrachars += needed;
2250                     if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2251                         Py_DECREF(x);
2252                         goto onError;
2253                     }
2254                     s = PyString_AS_STRING(v) + oldpos;
2255                 }
2256                 memcpy(s, PyString_AS_STRING(x), targetsize);
2257                 s += targetsize;
2258                 extrachars -= targetsize;
2259             }
2260             /* 1-0 mapping: skip the character */
2261         }
2262         else {
2263             /* wrong return value */
2264             PyErr_SetString(PyExc_TypeError,
2265                   "character mapping must return integer, None or unicode");
2266             Py_DECREF(x);
2267             goto onError;
2268         }
2269         Py_DECREF(x);
2270     }
2271     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2272         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2273             goto onError;
2274     return v;
2275
2276  onError:
2277     Py_DECREF(v);
2278     return NULL;
2279 }
2280
2281 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2282                                     PyObject *mapping)
2283 {
2284     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2285         PyErr_BadArgument();
2286         return NULL;
2287     }
2288     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2289                                    PyUnicode_GET_SIZE(unicode),
2290                                    mapping,
2291                                    NULL);
2292 }
2293
2294 static
2295 int translate_error(const Py_UNICODE **source,
2296                     Py_UNICODE **dest,
2297                     const char *errors,
2298                     const char *details)
2299 {
2300     if ((errors == NULL) ||
2301         (strcmp(errors,"strict") == 0)) {
2302         PyErr_Format(PyExc_UnicodeError,
2303                      "translate error: %.400s",
2304                      details);
2305         return -1;
2306     }
2307     else if (strcmp(errors,"ignore") == 0) {
2308         return 0;
2309     }
2310     else if (strcmp(errors,"replace") == 0) {
2311         **dest = '?';
2312         (*dest)++;
2313         return 0;
2314     }
2315     else {
2316         PyErr_Format(PyExc_ValueError,
2317                      "translate error; "
2318                      "unknown error handling code: %.400s",
2319                      errors);
2320         return -1;
2321     }
2322 }
2323
2324 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2325                                      int size,
2326                                      PyObject *mapping,
2327                                      const char *errors)
2328 {
2329     PyUnicodeObject *v;
2330     Py_UNICODE *p;
2331
2332     if (mapping == NULL) {
2333         PyErr_BadArgument();
2334         return NULL;
2335     }
2336
2337     /* Output will never be longer than input */
2338     v = _PyUnicode_New(size);
2339     if (v == NULL)
2340         goto onError;
2341     if (size == 0)
2342         goto done;
2343     p = PyUnicode_AS_UNICODE(v);
2344     while (size-- > 0) {
2345         Py_UNICODE ch = *s++;
2346         PyObject *w, *x;
2347
2348         /* Get mapping */
2349         w = PyInt_FromLong(ch);
2350         if (w == NULL)
2351             goto onError;
2352         x = PyObject_GetItem(mapping, w);
2353         Py_DECREF(w);
2354         if (x == NULL) {
2355             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2356                 /* No mapping found: default to 1-1 mapping */
2357                 PyErr_Clear();
2358                 *p++ = ch;
2359                 continue;
2360             }
2361             goto onError;
2362         }
2363
2364         /* Apply mapping */
2365         if (PyInt_Check(x))
2366             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2367         else if (x == Py_None) {
2368             /* undefined mapping */
2369             if (translate_error(&s, &p, errors,
2370                                 "character maps to <undefined>")) {
2371                 Py_DECREF(x);
2372                 goto onError;
2373             }
2374         }
2375         else if (PyUnicode_Check(x)) {
2376             if (PyUnicode_GET_SIZE(x) != 1) {
2377                 /* 1-n mapping */
2378                 PyErr_SetString(PyExc_NotImplementedError,
2379                                 "1-n mappings are currently not implemented");
2380                 Py_DECREF(x);
2381                 goto onError;
2382             }
2383             *p++ = *PyUnicode_AS_UNICODE(x);
2384         }
2385         else {
2386             /* wrong return value */
2387             PyErr_SetString(PyExc_TypeError,
2388                   "translate mapping must return integer, None or unicode");
2389             Py_DECREF(x);
2390             goto onError;
2391         }
2392         Py_DECREF(x);
2393     }
2394     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2395         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2396             goto onError;
2397
2398  done:
2399     return (PyObject *)v;
2400
2401  onError:
2402     Py_XDECREF(v);
2403     return NULL;
2404 }
2405
2406 PyObject *PyUnicode_Translate(PyObject *str,
2407                               PyObject *mapping,
2408                               const char *errors)
2409 {
2410     PyObject *result;
2411
2412     str = PyUnicode_FromObject(str);
2413     if (str == NULL)
2414         goto onError;
2415     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2416                                         PyUnicode_GET_SIZE(str),
2417                                         mapping,
2418                                         errors);
2419     Py_DECREF(str);
2420     return result;
2421
2422  onError:
2423     Py_XDECREF(str);
2424     return NULL;
2425 }
2426
2427 /* --- Decimal Encoder ---------------------------------------------------- */
2428
2429 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2430                             int length,
2431                             char *output,
2432                             const char *errors)
2433 {
2434     Py_UNICODE *p, *end;
2435
2436     if (output == NULL) {
2437         PyErr_BadArgument();
2438         return -1;
2439     }
2440
2441     p = s;
2442     end = s + length;
2443     while (p < end) {
2444         register Py_UNICODE ch = *p++;
2445         int decimal;
2446
2447         if (Py_UNICODE_ISSPACE(ch)) {
2448             *output++ = ' ';
2449             continue;
2450         }
2451         decimal = Py_UNICODE_TODECIMAL(ch);
2452         if (decimal >= 0) {
2453             *output++ = '0' + decimal;
2454             continue;
2455         }
2456         if (0 < ch && ch < 256) {
2457             *output++ = (char)ch;
2458             continue;
2459         }
2460         /* All other characters are considered invalid */
2461         if (errors == NULL || strcmp(errors, "strict") == 0) {
2462             PyErr_SetString(PyExc_ValueError,
2463                             "invalid decimal Unicode string");
2464             goto onError;
2465         }
2466         else if (strcmp(errors, "ignore") == 0)
2467             continue;
2468         else if (strcmp(errors, "replace") == 0) {
2469             *output++ = '?';
2470             continue;
2471         }
2472     }
2473     /* 0-terminate the output string */
2474     *output++ = '\0';
2475     return 0;
2476
2477  onError:
2478     return -1;
2479 }
2480
2481 /* --- Helpers ------------------------------------------------------------ */
2482
2483 static
2484 int count(PyUnicodeObject *self,
2485           int start,
2486           int end,
2487           PyUnicodeObject *substring)
2488 {
2489     int count = 0;
2490
2491     if (start < 0)
2492         start += self->length;
2493     if (start < 0)
2494         start = 0;
2495     if (end > self->length)
2496         end = self->length;
2497     if (end < 0)
2498         end += self->length;
2499     if (end < 0)
2500         end = 0;
2501
2502     if (substring->length == 0)
2503         return (end - start + 1);
2504
2505     end -= substring->length;
2506
2507     while (start <= end)
2508         if (Py_UNICODE_MATCH(self, start, substring)) {
2509             count++;
2510             start += substring->length;
2511         } else
2512             start++;
2513
2514     return count;
2515 }
2516
2517 int PyUnicode_Count(PyObject *str,
2518                     PyObject *substr,
2519                     int start,
2520                     int end)
2521 {
2522     int result;
2523
2524     str = PyUnicode_FromObject(str);
2525     if (str == NULL)
2526         return -1;
2527     substr = PyUnicode_FromObject(substr);
2528     if (substr == NULL) {
2529         Py_DECREF(str);
2530         return -1;
2531     }
2532
2533     result = count((PyUnicodeObject *)str,
2534                    start, end,
2535                    (PyUnicodeObject *)substr);
2536
2537     Py_DECREF(str);
2538     Py_DECREF(substr);
2539     return result;
2540 }
2541
2542 static
2543 int findstring(PyUnicodeObject *self,
2544                PyUnicodeObject *substring,
2545                int start,
2546                int end,
2547                int direction)
2548 {
2549     if (start < 0)
2550         start += self->length;
2551     if (start < 0)
2552         start = 0;
2553
2554     if (substring->length == 0)
2555         return start;
2556
2557     if (end > self->length)
2558         end = self->length;
2559     if (end < 0)
2560         end += self->length;
2561     if (end < 0)
2562         end = 0;
2563
2564     end -= substring->length;
2565
2566     if (direction < 0) {
2567         for (; end >= start; end--)
2568             if (Py_UNICODE_MATCH(self, end, substring))
2569                 return end;
2570     } else {
2571         for (; start <= end; start++)
2572             if (Py_UNICODE_MATCH(self, start, substring))
2573                 return start;
2574     }
2575
2576     return -1;
2577 }
2578
2579 int PyUnicode_Find(PyObject *str,
2580                    PyObject *substr,
2581                    int start,
2582                    int end,
2583                    int direction)
2584 {
2585     int result;
2586
2587     str = PyUnicode_FromObject(str);
2588     if (str == NULL)
2589         return -1;
2590     substr = PyUnicode_FromObject(substr);
2591     if (substr == NULL) {
2592         Py_DECREF(substr);
2593         return -1;
2594     }
2595
2596     result = findstring((PyUnicodeObject *)str,
2597                         (PyUnicodeObject *)substr,
2598                         start, end, direction);
2599     Py_DECREF(str);
2600     Py_DECREF(substr);
2601     return result;
2602 }
2603
2604 static
2605 int tailmatch(PyUnicodeObject *self,
2606               PyUnicodeObject *substring,
2607               int start,
2608               int end,
2609               int direction)
2610 {
2611     if (start < 0)
2612         start += self->length;
2613     if (start < 0)
2614         start = 0;
2615
2616     if (substring->length == 0)
2617         return 1;
2618
2619     if (end > self->length)
2620         end = self->length;
2621     if (end < 0)
2622         end += self->length;
2623     if (end < 0)
2624         end = 0;
2625
2626     end -= substring->length;
2627     if (end < start)
2628         return 0;
2629
2630     if (direction > 0) {
2631         if (Py_UNICODE_MATCH(self, end, substring))
2632             return 1;
2633     } else {
2634         if (Py_UNICODE_MATCH(self, start, substring))
2635             return 1;
2636     }
2637
2638     return 0;
2639 }
2640
2641 int PyUnicode_Tailmatch(PyObject *str,
2642                         PyObject *substr,
2643                         int start,
2644                         int end,
2645                         int direction)
2646 {
2647     int result;
2648
2649     str = PyUnicode_FromObject(str);
2650     if (str == NULL)
2651         return -1;
2652     substr = PyUnicode_FromObject(substr);
2653     if (substr == NULL) {
2654         Py_DECREF(substr);
2655         return -1;
2656     }
2657
2658     result = tailmatch((PyUnicodeObject *)str,
2659                        (PyUnicodeObject *)substr,
2660                        start, end, direction);
2661     Py_DECREF(str);
2662     Py_DECREF(substr);
2663     return result;
2664 }
2665
2666 static
2667 const Py_UNICODE *findchar(const Py_UNICODE *s,
2668                      int size,
2669                      Py_UNICODE ch)
2670 {
2671     /* like wcschr, but doesn't stop at NULL characters */
2672
2673     while (size-- > 0) {
2674         if (*s == ch)
2675             return s;
2676         s++;
2677     }
2678
2679     return NULL;
2680 }
2681
2682 /* Apply fixfct filter to the Unicode object self and return a
2683    reference to the modified object */
2684
2685 static
2686 PyObject *fixup(PyUnicodeObject *self,
2687                 int (*fixfct)(PyUnicodeObject *s))
2688 {
2689
2690     PyUnicodeObject *u;
2691
2692     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
2693     if (u == NULL)
2694         return NULL;
2695
2696     Py_UNICODE_COPY(u->str, self->str, self->length);
2697
2698     if (!fixfct(u)) {
2699         /* fixfct should return TRUE if it modified the buffer. If
2700            FALSE, return a reference to the original buffer instead
2701            (to save space, not time) */
2702         Py_INCREF(self);
2703         Py_DECREF(u);
2704         return (PyObject*) self;
2705     }
2706     return (PyObject*) u;
2707 }
2708
2709 static
2710 int fixupper(PyUnicodeObject *self)
2711 {
2712     int len = self->length;
2713     Py_UNICODE *s = self->str;
2714     int status = 0;
2715
2716     while (len-- > 0) {
2717         register Py_UNICODE ch;
2718
2719         ch = Py_UNICODE_TOUPPER(*s);
2720         if (ch != *s) {
2721             status = 1;
2722             *s = ch;
2723         }
2724         s++;
2725     }
2726
2727     return status;
2728 }
2729
2730 static
2731 int fixlower(PyUnicodeObject *self)
2732 {
2733     int len = self->length;
2734     Py_UNICODE *s = self->str;
2735     int status = 0;
2736
2737     while (len-- > 0) {
2738         register Py_UNICODE ch;
2739
2740         ch = Py_UNICODE_TOLOWER(*s);
2741         if (ch != *s) {
2742             status = 1;
2743             *s = ch;
2744         }
2745         s++;
2746     }
2747
2748     return status;
2749 }
2750
2751 static
2752 int fixswapcase(PyUnicodeObject *self)
2753 {
2754     int len = self->length;
2755     Py_UNICODE *s = self->str;
2756     int status = 0;
2757
2758     while (len-- > 0) {
2759         if (Py_UNICODE_ISUPPER(*s)) {
2760             *s = Py_UNICODE_TOLOWER(*s);
2761             status = 1;
2762         } else if (Py_UNICODE_ISLOWER(*s)) {
2763             *s = Py_UNICODE_TOUPPER(*s);
2764             status = 1;
2765         }
2766         s++;
2767     }
2768
2769     return status;
2770 }
2771
2772 static
2773 int fixcapitalize(PyUnicodeObject *self)
2774 {
2775     int len = self->length;
2776     Py_UNICODE *s = self->str;
2777     int status = 0;
2778
2779     if (len == 0)
2780         return 0;
2781     if (Py_UNICODE_ISLOWER(*s)) {
2782         *s = Py_UNICODE_TOUPPER(*s);
2783         status = 1;
2784     }
2785     s++;
2786     while (--len > 0) {
2787         if (Py_UNICODE_ISUPPER(*s)) {
2788             *s = Py_UNICODE_TOLOWER(*s);
2789             status = 1;
2790         }
2791         s++;
2792     }
2793     return status;
2794 }
2795
2796 static
2797 int fixtitle(PyUnicodeObject *self)
2798 {
2799     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2800     register Py_UNICODE *e;
2801     int previous_is_cased;
2802
2803     /* Shortcut for single character strings */
2804     if (PyUnicode_GET_SIZE(self) == 1) {
2805         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2806         if (*p != ch) {
2807             *p = ch;
2808             return 1;
2809         }
2810         else
2811             return 0;
2812     }
2813
2814     e = p + PyUnicode_GET_SIZE(self);
2815     previous_is_cased = 0;
2816     for (; p < e; p++) {
2817         register const Py_UNICODE ch = *p;
2818
2819         if (previous_is_cased)
2820             *p = Py_UNICODE_TOLOWER(ch);
2821         else
2822             *p = Py_UNICODE_TOTITLE(ch);
2823
2824         if (Py_UNICODE_ISLOWER(ch) ||
2825             Py_UNICODE_ISUPPER(ch) ||
2826             Py_UNICODE_ISTITLE(ch))
2827             previous_is_cased = 1;
2828         else
2829             previous_is_cased = 0;
2830     }
2831     return 1;
2832 }
2833
2834 PyObject *PyUnicode_Join(PyObject *separator,
2835                          PyObject *seq)
2836 {
2837     Py_UNICODE *sep;
2838     int seplen;
2839     PyUnicodeObject *res = NULL;
2840     int reslen = 0;
2841     Py_UNICODE *p;
2842     int sz = 100;
2843     int i;
2844     PyObject *it;
2845
2846     it = PyObject_GetIter(seq);
2847     if (it == NULL)
2848         return NULL;
2849
2850     if (separator == NULL) {
2851         Py_UNICODE blank = ' ';
2852         sep = &blank;
2853         seplen = 1;
2854     }
2855     else {
2856         separator = PyUnicode_FromObject(separator);
2857         if (separator == NULL)
2858             goto onError;
2859         sep = PyUnicode_AS_UNICODE(separator);
2860         seplen = PyUnicode_GET_SIZE(separator);
2861     }
2862
2863     res = _PyUnicode_New(sz);
2864     if (res == NULL)
2865         goto onError;
2866     p = PyUnicode_AS_UNICODE(res);
2867     reslen = 0;
2868
2869     for (i = 0; ; ++i) {
2870         int itemlen;
2871         PyObject *item = PyIter_Next(it);
2872         if (item == NULL) {
2873             if (PyErr_Occurred())
2874                 goto onError;
2875             break;
2876         }
2877         if (!PyUnicode_Check(item)) {
2878             PyObject *v;
2879             v = PyUnicode_FromObject(item);
2880             Py_DECREF(item);
2881             item = v;
2882             if (item == NULL)
2883                 goto onError;
2884         }
2885         itemlen = PyUnicode_GET_SIZE(item);
2886         while (reslen + itemlen + seplen >= sz) {
2887             if (_PyUnicode_Resize(&res, sz*2))
2888                 goto onError;
2889             sz *= 2;
2890             p = PyUnicode_AS_UNICODE(res) + reslen;
2891         }
2892         if (i > 0) {
2893             Py_UNICODE_COPY(p, sep, seplen);
2894             p += seplen;
2895             reslen += seplen;
2896         }
2897         Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
2898         p += itemlen;
2899         reslen += itemlen;
2900         Py_DECREF(item);
2901     }
2902     if (_PyUnicode_Resize(&res, reslen))
2903         goto onError;
2904
2905     Py_XDECREF(separator);
2906     Py_DECREF(it);
2907     return (PyObject *)res;
2908
2909  onError:
2910     Py_XDECREF(separator);
2911     Py_XDECREF(res);
2912     Py_DECREF(it);
2913     return NULL;
2914 }
2915
2916 static
2917 PyUnicodeObject *pad(PyUnicodeObject *self,
2918                      int left,
2919                      int right,
2920                      Py_UNICODE fill)
2921 {
2922     PyUnicodeObject *u;
2923
2924     if (left < 0)
2925         left = 0;
2926     if (right < 0)
2927         right = 0;
2928
2929     if (left == 0 && right == 0) {
2930         Py_INCREF(self);
2931         return self;
2932     }
2933
2934     u = _PyUnicode_New(left + self->length + right);
2935     if (u) {
2936         if (left)
2937             Py_UNICODE_FILL(u->str, fill, left);
2938         Py_UNICODE_COPY(u->str + left, self->str, self->length);
2939         if (right)
2940             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2941     }
2942
2943     return u;
2944 }
2945
/* Append the slice data[left:right] to list as a new Unicode object.

   The macro relies on names from the enclosing function: a PyObject
   pointer variable `str` (used as scratch), a list object `list`, and
   a label `onError` that is jumped to when creating or appending the
   slice fails.  It expands to several statements, so it must not be
   used as the unbraced body of an if/else or loop.  The arguments are
   parenthesised so that arbitrary expressions can be passed; `left` is
   evaluated twice. */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);
2956
2957 static
2958 PyObject *split_whitespace(PyUnicodeObject *self,
2959                            PyObject *list,
2960                            int maxcount)
2961 {
2962     register int i;
2963     register int j;
2964     int len = self->length;
2965     PyObject *str;
2966
2967     for (i = j = 0; i < len; ) {
2968         /* find a token */
2969         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2970             i++;
2971         j = i;
2972         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2973             i++;
2974         if (j < i) {
2975             if (maxcount-- <= 0)
2976                 break;
2977             SPLIT_APPEND(self->str, j, i);
2978             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2979                 i++;
2980             j = i;
2981         }
2982     }
2983     if (j < len) {
2984         SPLIT_APPEND(self->str, j, len);
2985     }
2986     return list;
2987
2988  onError:
2989     Py_DECREF(list);
2990     return NULL;
2991 }
2992
2993 PyObject *PyUnicode_Splitlines(PyObject *string,
2994                                int keepends)
2995 {
2996     register int i;
2997     register int j;
2998     int len;
2999     PyObject *list;
3000     PyObject *str;
3001     Py_UNICODE *data;
3002
3003     string = PyUnicode_FromObject(string);
3004     if (string == NULL)
3005         return NULL;
3006     data = PyUnicode_AS_UNICODE(string);
3007     len = PyUnicode_GET_SIZE(string);
3008
3009     list = PyList_New(0);
3010     if (!list)
3011         goto onError;
3012
3013     for (i = j = 0; i < len; ) {
3014         int eol;
3015
3016         /* Find a line and append it */
3017         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3018             i++;
3019
3020         /* Skip the line break reading CRLF as one line break */
3021         eol = i;
3022         if (i < len) {
3023             if (data[i] == '\r' && i + 1 < len &&
3024                 data[i+1] == '\n')
3025                 i += 2;
3026             else
3027                 i++;
3028             if (keepends)
3029                 eol = i;
3030         }
3031         SPLIT_APPEND(data, j, eol);
3032         j = i;
3033     }
3034     if (j < len) {
3035         SPLIT_APPEND(data, j, len);
3036     }
3037
3038     Py_DECREF(string);
3039     return list;
3040
3041  onError:
3042     Py_DECREF(list);
3043     Py_DECREF(string);
3044     return NULL;
3045 }
3046
3047 static
3048 PyObject *split_char(PyUnicodeObject *self,
3049                      PyObject *list,
3050                      Py_UNICODE ch,
3051                      int maxcount)
3052 {
3053     register int i;
3054     register int j;
3055     int len = self->length;
3056     PyObject *str;
3057
3058     for (i = j = 0; i < len; ) {
3059         if (self->str[i] == ch) {
3060             if (maxcount-- <= 0)
3061                 break;
3062             SPLIT_APPEND(self->str, j, i);
3063             i = j = i + 1;
3064         } else
3065             i++;
3066     }
3067     if (j <= len) {
3068         SPLIT_APPEND(self->str, j, len);
3069     }
3070     return list;
3071
3072  onError:
3073     Py_DECREF(list);
3074     return NULL;
3075 }
3076
3077 static
3078 PyObject *split_substring(PyUnicodeObject *self,
3079                           PyObject *list,
3080                           PyUnicodeObject *substring,
3081                           int maxcount)
3082 {
3083     register int i;
3084     register int j;
3085     int len = self->length;
3086     int sublen = substring->length;
3087     PyObject *str;
3088
3089     for (i = j = 0; i <= len - sublen; ) {
3090         if (Py_UNICODE_MATCH(self, i, substring)) {
3091             if (maxcount-- <= 0)
3092                 break;
3093             SPLIT_APPEND(self->str, j, i);
3094             i = j = i + sublen;
3095         } else
3096             i++;
3097     }
3098     if (j <= len) {
3099         SPLIT_APPEND(self->str, j, len);
3100     }
3101     return list;
3102
3103  onError:
3104     Py_DECREF(list);
3105     return NULL;
3106 }
3107
3108 #undef SPLIT_APPEND
3109
3110 static
3111 PyObject *split(PyUnicodeObject *self,
3112                 PyUnicodeObject *substring,
3113                 int maxcount)
3114 {
3115     PyObject *list;
3116
3117     if (maxcount < 0)
3118         maxcount = INT_MAX;
3119
3120     list = PyList_New(0);
3121     if (!list)
3122         return NULL;
3123
3124     if (substring == NULL)
3125         return split_whitespace(self,list,maxcount);
3126
3127     else if (substring->length == 1)
3128         return split_char(self,list,substring->str[0],maxcount);
3129
3130     else if (substring->length == 0) {
3131         Py_DECREF(list);
3132         PyErr_SetString(PyExc_ValueError, "empty separator");
3133         return NULL;
3134     }
3135     else
3136         return split_substring(self,list,substring,maxcount);
3137 }
3138
3139 static
3140 PyObject *strip(PyUnicodeObject *self,
3141                 int left,
3142                 int right)
3143 {
3144     Py_UNICODE *p = self->str;
3145     int start = 0;
3146     int end = self->length;
3147
3148     if (left)
3149         while (start < end && Py_UNICODE_ISSPACE(p[start]))
3150             start++;
3151
3152     if (right)
3153         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3154             end--;
3155
3156     if (start == 0 && end == self->length) {
3157         /* couldn't strip anything off, return original string */
3158         Py_INCREF(self);
3159         return (PyObject*) self;
3160     }
3161
3162     return (PyObject*) PyUnicode_FromUnicode(
3163         self->str + start,
3164         end - start
3165         );
3166 }
3167
3168 static
3169 PyObject *replace(PyUnicodeObject *self,
3170                   PyUnicodeObject *str1,
3171                   PyUnicodeObject *str2,
3172                   int maxcount)
3173 {
3174     PyUnicodeObject *u;
3175
3176     if (maxcount < 0)
3177         maxcount = INT_MAX;
3178
3179     if (str1->length == 1 && str2->length == 1) {
3180         int i;
3181
3182         /* replace characters */
3183         if (!findchar(self->str, self->length, str1->str[0])) {
3184             /* nothing to replace, return original string */
3185             Py_INCREF(self);
3186             u = self;
3187         } else {
3188             Py_UNICODE u1 = str1->str[0];
3189             Py_UNICODE u2 = str2->str[0];
3190
3191             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3192                 NULL,
3193                 self->length
3194                 );
3195             if (u != NULL) {
3196                 Py_UNICODE_COPY(u->str, self->str,
3197                                 self->length);
3198                 for (i = 0; i < u->length; i++)
3199                     if (u->str[i] == u1) {
3200                         if (--maxcount < 0)
3201                             break;
3202                         u->str[i] = u2;
3203                     }
3204         }
3205         }
3206
3207     } else {
3208         int n, i;
3209         Py_UNICODE *p;
3210
3211         /* replace strings */
3212         n = count(self, 0, self->length, str1);
3213         if (n > maxcount)
3214             n = maxcount;
3215         if (n == 0) {
3216             /* nothing to replace, return original string */
3217             Py_INCREF(self);
3218             u = self;
3219         } else {
3220             u = _PyUnicode_New(
3221                 self->length + n * (str2->length - str1->length));
3222             if (u) {
3223                 i = 0;
3224                 p = u->str;
3225                 while (i <= self->length - str1->length)
3226                     if (Py_UNICODE_MATCH(self, i, str1)) {
3227                         /* replace string segment */
3228                         Py_UNICODE_COPY(p, str2->str, str2->length);
3229                         p += str2->length;
3230                         i += str1->length;
3231                         if (--n <= 0) {
3232                             /* copy remaining part */
3233                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3234                             break;
3235                         }
3236                     } else
3237                         *p++ = self->str[i++];
3238             }
3239         }
3240     }
3241
3242     return (PyObject *) u;
3243 }
3244
3245 /* --- Unicode Object Methods --------------------------------------------- */
3246
3247 static char title__doc__[] =
3248 "S.title() -> unicode\n\
3249 \n\
3250 Return a titlecased version of S, i.e. words start with title case\n\
3251 characters, all remaining cased characters have lower case.";
3252
3253 static PyObject*
3254 unicode_title(PyUnicodeObject *self, PyObject *args)
3255 {
3256     if (!PyArg_NoArgs(args))
3257         return NULL;
3258     return fixup(self, fixtitle);
3259 }
3260
3261 static char capitalize__doc__[] =
3262 "S.capitalize() -> unicode\n\
3263 \n\
3264 Return a capitalized version of S, i.e. make the first character\n\
3265 have upper case.";
3266
3267 static PyObject*
3268 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3269 {
3270     if (!PyArg_NoArgs(args))
3271         return NULL;
3272     return fixup(self, fixcapitalize);
3273 }
3274
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split on whitespace,
   capitalize every word, then join the words again. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Break the string into whitespace-separated words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized counterpart */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *plain = PyList_GET_ITEM(words, k);
        PyObject *capped = fixup((PyUnicodeObject *)plain, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(plain);
        PyList_SET_ITEM(words, k, capped);
    }

    /* Glue the words back together */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3315
3316 static char center__doc__[] =
3317 "S.center(width) -> unicode\n\
3318 \n\
3319 Return S centered in a Unicode string of length width. Padding is done\n\
3320 using spaces.";
3321
3322 static PyObject *
3323 unicode_center(PyUnicodeObject *self, PyObject *args)
3324 {
3325     int marg, left;
3326     int width;
3327
3328     if (!PyArg_ParseTuple(args, "i:center", &width))
3329         return NULL;
3330
3331     if (self->length >= width) {
3332         Py_INCREF(self);
3333         return (PyObject*) self;
3334     }
3335
3336     marg = width - self->length;
3337     left = marg / 2 + (marg & width & 1);
3338
3339     return (PyObject*) pad(self, left, marg - left, ' ');
3340 }
3341
3342 #if 0
3343
3344 /* This code should go into some future Unicode collation support
3345    module. The basic comparison should compare ordinals on a naive
3346    basis (this is what Java does and thus JPython too). */
3347
3348 /* speedy UTF-16 code point order comparison */
3349 /* gleaned from: */
3350 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3351
/* Offsets added to a UTF-16 code unit by the comparison routine below;
   the table is indexed with the top five bits of the unit (c >> 11). */
static short utf16Fixup[32] =
{
    /*  0 ..  3 */  0, 0, 0, 0,
    /*  4 ..  7 */  0, 0, 0, 0,
    /*  8 .. 11 */  0, 0, 0, 0,
    /* 12 .. 15 */  0, 0, 0, 0,
    /* 16 .. 19 */  0, 0, 0, 0,
    /* 20 .. 23 */  0, 0, 0, 0,
    /* 24 .. 27 */  0, 0, 0, 0x2000,
    /* 28 .. 31 */  -0x800, -0x800, -0x800, -0x800
};
3359
3360 static int
3361 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3362 {
3363     int len1, len2;
3364
3365     Py_UNICODE *s1 = str1->str;
3366     Py_UNICODE *s2 = str2->str;
3367
3368     len1 = str1->length;
3369     len2 = str2->length;
3370
3371     while (len1 > 0 && len2 > 0) {
3372         Py_UNICODE c1, c2;
3373
3374         c1 = *s1++;
3375         c2 = *s2++;
3376
3377         if (c1 > (1<<11) * 26)
3378             c1 += utf16Fixup[c1>>11];
3379         if (c2 > (1<<11) * 26)
3380             c2 += utf16Fixup[c2>>11];
3381         /* now c1 and c2 are in UTF-32-compatible order */
3382
3383         if (c1 != c2)
3384             return (c1 < c2) ? -1 : 1;
3385
3386         len1--; len2--;
3387     }
3388
3389     return (len1 < len2) ? -1 : (len1 != len2);
3390 }
3391
3392 #else
3393
3394 static int
3395 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3396 {
3397     register int len1, len2;
3398
3399     Py_UNICODE *s1 = str1->str;
3400     Py_UNICODE *s2 = str2->str;
3401
3402     len1 = str1->length;
3403     len2 = str2->length;
3404
3405     while (len1 > 0 && len2 > 0) {
3406         Py_UNICODE c1, c2;
3407
3408         c1 = *s1++;
3409         c2 = *s2++;
3410
3411         if (c1 != c2)
3412             return (c1 < c2) ? -1 : 1;
3413
3414         len1--; len2--;
3415     }
3416
3417     return (len1 < len2) ? -1 : (len1 != len2);
3418 }
3419
3420 #endif
3421
3422 int PyUnicode_Compare(PyObject *left,
3423                       PyObject *right)
3424 {
3425     PyUnicodeObject *u = NULL, *v = NULL;
3426     int result;
3427
3428     /* Coerce the two arguments */
3429     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3430     if (u == NULL)
3431         goto onError;
3432     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3433     if (v == NULL)
3434         goto onError;
3435
3436     /* Shortcut for empty or interned objects */
3437     if (v == u) {
3438         Py_DECREF(u);
3439         Py_DECREF(v);
3440         return 0;
3441     }
3442
3443     result = unicode_compare(u, v);
3444
3445     Py_DECREF(u);
3446     Py_DECREF(v);
3447     return result;
3448
3449 onError:
3450     Py_XDECREF(u);
3451     Py_XDECREF(v);
3452     return -1;
3453 }
3454
3455 int PyUnicode_Contains(PyObject *container,
3456                        PyObject *element)
3457 {
3458     PyUnicodeObject *u = NULL, *v = NULL;
3459     int result;
3460     register const Py_UNICODE *p, *e;
3461     register Py_UNICODE ch;
3462
3463     /* Coerce the two arguments */
3464     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3465     if (v == NULL) {
3466         PyErr_SetString(PyExc_TypeError,
3467             "'in <string>' requires character as left operand");
3468         goto onError;
3469     }
3470     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3471     if (u == NULL) {
3472         Py_DECREF(v);
3473         goto onError;
3474     }
3475
3476     /* Check v in u */
3477     if (PyUnicode_GET_SIZE(v) != 1) {
3478         PyErr_SetString(PyExc_TypeError,
3479             "'in <string>' requires character as left operand");
3480         goto onError;
3481     }
3482     ch = *PyUnicode_AS_UNICODE(v);
3483     p = PyUnicode_AS_UNICODE(u);
3484     e = p + PyUnicode_GET_SIZE(u);
3485     result = 0;
3486     while (p < e) {
3487         if (*p++ == ch) {
3488             result = 1;
3489             break;
3490         }
3491     }
3492
3493     Py_DECREF(u);
3494     Py_DECREF(v);
3495     return result;
3496
3497 onError:
3498     Py_XDECREF(u);
3499     Py_XDECREF(v);
3500     return -1;
3501 }
3502
3503 /* Concat to string or Unicode object giving a new Unicode object. */
3504
3505 PyObject *PyUnicode_Concat(PyObject *left,
3506                            PyObject *right)
3507 {
3508     PyUnicodeObject *u = NULL, *v = NULL, *w;
3509
3510     /* Coerce the two arguments */
3511     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3512     if (u == NULL)
3513         goto onError;
3514     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3515     if (v == NULL)
3516         goto onError;
3517
3518     /* Shortcuts */
3519     if (v == unicode_empty) {
3520         Py_DECREF(v);
3521         return (PyObject *)u;
3522     }
3523     if (u == unicode_empty) {
3524         Py_DECREF(u);
3525         return (PyObject *)v;
3526     }
3527
3528     /* Concat the two Unicode strings */
3529     w = _PyUnicode_New(u->length + v->length);
3530     if (w == NULL)
3531         goto onError;
3532     Py_UNICODE_COPY(w->str, u->str, u->length);
3533     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3534
3535     Py_DECREF(u);
3536     Py_DECREF(v);
3537     return (PyObject *)w;
3538
3539 onError:
3540     Py_XDECREF(u);
3541     Py_XDECREF(v);
3542     return NULL;
3543 }
3544
3545 static char count__doc__[] =
3546 "S.count(sub[, start[, end]]) -> int\n\
3547 \n\
3548 Return the number of occurrences of substring sub in Unicode string\n\
3549 S[start:end].  Optional arguments start and end are\n\
3550 interpreted as in slice notation.";
3551
3552 static PyObject *
3553 unicode_count(PyUnicodeObject *self, PyObject *args)
3554 {
3555     PyUnicodeObject *substring;
3556     int start = 0;
3557     int end = INT_MAX;
3558     PyObject *result;
3559
3560     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3561                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3562         return NULL;
3563
3564     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3565                                                 (PyObject *)substring);
3566     if (substring == NULL)
3567         return NULL;
3568
3569     if (start < 0)
3570         start += self->length;
3571     if (start < 0)
3572         start = 0;
3573     if (end > self->length)
3574         end = self->length;
3575     if (end < 0)
3576         end += self->length;
3577     if (end < 0)
3578         end = 0;
3579
3580     result = PyInt_FromLong((long) count(self, start, end, substring));
3581
3582     Py_DECREF(substring);
3583     return result;
3584 }
3585
3586 static char encode__doc__[] =
3587 "S.encode([encoding[,errors]]) -> string\n\
3588 \n\
3589 Return an encoded string version of S. Default encoding is the current\n\
3590 default string encoding. errors may be given to set a different error\n\
3591 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3592 a ValueError. Other possible values are 'ignore' and 'replace'.";
3593
3594 static PyObject *
3595 unicode_encode(PyUnicodeObject *self, PyObject *args)
3596 {
3597     char *encoding = NULL;
3598     char *errors = NULL;
3599     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3600         return NULL;
3601     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3602 }
3603
3604 static char expandtabs__doc__[] =
3605 "S.expandtabs([tabsize]) -> unicode\n\
3606 \n\
3607 Return a copy of S where all tab characters are expanded using spaces.\n\
3608 If tabsize is not given, a tab size of 8 characters is assumed.";
3609
3610 static PyObject*
3611 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3612 {
3613     Py_UNICODE *e;
3614     Py_UNICODE *p;
3615     Py_UNICODE *q;
3616     int i, j;
3617     PyUnicodeObject *u;
3618     int tabsize = 8;
3619
3620     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3621         return NULL;
3622
3623     /* First pass: determine size of output string */
3624     i = j = 0;
3625     e = self->str + self->length;
3626     for (p = self->str; p < e; p++)
3627         if (*p == '\t') {
3628             if (tabsize > 0)
3629                 j += tabsize - (j % tabsize);
3630         }
3631         else {
3632             j++;
3633             if (*p == '\n' || *p == '\r') {
3634                 i += j;
3635                 j = 0;
3636             }
3637         }
3638
3639     /* Second pass: create output string and fill it */
3640     u = _PyUnicode_New(i + j);
3641     if (!u)
3642         return NULL;
3643
3644     j = 0;
3645     q = u->str;
3646
3647     for (p = self->str; p < e; p++)
3648         if (*p == '\t') {
3649             if (tabsize > 0) {
3650                 i = tabsize - (j % tabsize);
3651                 j += i;
3652                 while (i--)
3653                     *q++ = ' ';
3654             }
3655         }
3656         else {
3657             j++;
3658             *q++ = *p;
3659             if (*p == '\n' || *p == '\r')
3660                 j = 0;
3661         }
3662
3663     return (PyObject*) u;
3664 }
3665
3666 static char find__doc__[] =
3667 "S.find(sub [,start [,end]]) -> int\n\
3668 \n\
3669 Return the lowest index in S where substring sub is found,\n\
3670 such that sub is contained within s[start,end].  Optional\n\
3671 arguments start and end are interpreted as in slice notation.\n\
3672 \n\
3673 Return -1 on failure.";
3674
3675 static PyObject *
3676 unicode_find(PyUnicodeObject *self, PyObject *args)
3677 {
3678     PyUnicodeObject *substring;
3679     int start = 0;
3680     int end = INT_MAX;
3681     PyObject *result;
3682
3683     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3684                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3685         return NULL;
3686     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3687                                                 (PyObject *)substring);
3688     if (substring == NULL)
3689         return NULL;
3690
3691     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3692
3693     Py_DECREF(substring);
3694     return result;
3695 }
3696
3697 static PyObject *
3698 unicode_getitem(PyUnicodeObject *self, int index)
3699 {
3700     if (index < 0 || index >= self->length) {
3701         PyErr_SetString(PyExc_IndexError, "string index out of range");
3702         return NULL;
3703     }
3704
3705     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3706 }
3707
3708 static long
3709 unicode_hash(PyUnicodeObject *self)
3710 {
3711     /* Since Unicode objects compare equal to their ASCII string
3712        counterparts, they should use the individual character values
3713        as basis for their hash value.  This is needed to assure that
3714        strings and Unicode objects behave in the same way as
3715        dictionary keys. */
3716
3717     register int len;
3718     register Py_UNICODE *p;
3719     register long x;
3720
3721     if (self->hash != -1)
3722         return self->hash;
3723     len = PyUnicode_GET_SIZE(self);
3724     p = PyUnicode_AS_UNICODE(self);
3725     x = *p << 7;
3726     while (--len >= 0)
3727         x = (1000003*x) ^ *p++;
3728     x ^= PyUnicode_GET_SIZE(self);
3729     if (x == -1)
3730         x = -2;
3731     self->hash = x;
3732     return x;
3733 }
3734
3735 static char index__doc__[] =
3736 "S.index(sub [,start [,end]]) -> int\n\
3737 \n\
3738 Like S.find() but raise ValueError when the substring is not found.";
3739
3740 static PyObject *
3741 unicode_index(PyUnicodeObject *self, PyObject *args)
3742 {
3743     int result;
3744     PyUnicodeObject *substring;
3745     int start = 0;
3746     int end = INT_MAX;
3747
3748     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3749                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3750         return NULL;
3751
3752     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3753                                                 (PyObject *)substring);
3754     if (substring == NULL)
3755         return NULL;
3756
3757     result = findstring(self, substring, start, end, 1);
3758
3759     Py_DECREF(substring);
3760     if (result < 0) {
3761         PyErr_SetString(PyExc_ValueError, "substring not found");
3762         return NULL;
3763     }
3764     return PyInt_FromLong(result);
3765 }
3766
3767 static char islower__doc__[] =
3768 "S.islower() -> int\n\
3769 \n\
3770 Return 1 if  all cased characters in S are lowercase and there is\n\
3771 at least one cased character in S, 0 otherwise.";
3772
3773 static PyObject*
3774 unicode_islower(PyUnicodeObject *self, PyObject *args)
3775 {
3776     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3777     register const Py_UNICODE *e;
3778     int cased;
3779
3780     if (!PyArg_NoArgs(args))
3781         return NULL;
3782
3783     /* Shortcut for single character strings */
3784     if (PyUnicode_GET_SIZE(self) == 1)
3785         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3786
3787     /* Special case for empty strings */
3788     if (PyString_GET_SIZE(self) == 0)
3789         return PyInt_FromLong(0);
3790
3791     e = p + PyUnicode_GET_SIZE(self);
3792     cased = 0;
3793     for (; p < e; p++) {
3794         register const Py_UNICODE ch = *p;
3795
3796         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3797             return PyInt_FromLong(0);
3798         else if (!cased && Py_UNICODE_ISLOWER(ch))
3799             cased = 1;
3800     }
3801     return PyInt_FromLong(cased);
3802 }
3803
3804 static char isupper__doc__[] =
3805 "S.isupper() -> int\n\
3806 \n\
3807 Return 1 if  all cased characters in S are uppercase and there is\n\
3808 at least one cased character in S, 0 otherwise.";
3809
3810 static PyObject*
3811 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3812 {
3813     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3814     register const Py_UNICODE *e;
3815     int cased;
3816
3817     if (!PyArg_NoArgs(args))
3818         return NULL;
3819
3820     /* Shortcut for single character strings */
3821     if (PyUnicode_GET_SIZE(self) == 1)
3822         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3823
3824     /* Special case for empty strings */
3825     if (PyString_GET_SIZE(self) == 0)
3826         return PyInt_FromLong(0);
3827
3828     e = p + PyUnicode_GET_SIZE(self);
3829     cased = 0;
3830     for (; p < e; p++) {
3831         register const Py_UNICODE ch = *p;
3832
3833         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3834             return PyInt_FromLong(0);
3835         else if (!cased && Py_UNICODE_ISUPPER(ch))
3836             cased = 1;
3837     }
3838     return PyInt_FromLong(cased);
3839 }
3840
3841 static char istitle__doc__[] =
3842 "S.istitle() -> int\n\
3843 \n\
3844 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3845 may only follow uncased characters and lowercase characters only cased\n\
3846 ones. Return 0 otherwise.";
3847
3848 static PyObject*
3849 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3850 {
3851     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3852     register const Py_UNICODE *e;
3853     int cased, previous_is_cased;
3854
3855     if (!PyArg_NoArgs(args))
3856         return NULL;
3857
3858     /* Shortcut for single character strings */
3859     if (PyUnicode_GET_SIZE(self) == 1)
3860         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3861                               (Py_UNICODE_ISUPPER(*p) != 0));
3862
3863     /* Special case for empty strings */
3864     if (PyString_GET_SIZE(self) == 0)
3865         return PyInt_FromLong(0);
3866
3867     e = p + PyUnicode_GET_SIZE(self);
3868     cased = 0;
3869     previous_is_cased = 0;
3870     for (; p < e; p++) {
3871         register const Py_UNICODE ch = *p;
3872
3873         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3874             if (previous_is_cased)
3875                 return PyInt_FromLong(0);
3876             previous_is_cased = 1;
3877             cased = 1;
3878         }
3879         else if (Py_UNICODE_ISLOWER(ch)) {
3880             if (!previous_is_cased)
3881                 return PyInt_FromLong(0);
3882             previous_is_cased = 1;
3883             cased = 1;
3884         }
3885         else
3886             previous_is_cased = 0;
3887     }
3888     return PyInt_FromLong(cased);
3889 }
3890
3891 static char isspace__doc__[] =
3892 "S.isspace() -> int\n\
3893 \n\
3894 Return 1 if there are only whitespace characters in S,\n\
3895 0 otherwise.";
3896
3897 static PyObject*
3898 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3899 {
3900     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3901     register const Py_UNICODE *e;
3902
3903     if (!PyArg_NoArgs(args))
3904         return NULL;
3905
3906     /* Shortcut for single character strings */
3907     if (PyUnicode_GET_SIZE(self) == 1 &&
3908         Py_UNICODE_ISSPACE(*p))
3909         return PyInt_FromLong(1);
3910
3911     /* Special case for empty strings */
3912     if (PyString_GET_SIZE(self) == 0)
3913         return PyInt_FromLong(0);
3914
3915     e = p + PyUnicode_GET_SIZE(self);
3916     for (; p < e; p++) {
3917         if (!Py_UNICODE_ISSPACE(*p))
3918             return PyInt_FromLong(0);
3919     }
3920     return PyInt_FromLong(1);
3921 }
3922
3923 static char isalpha__doc__[] =
3924 "S.isalpha() -> int\n\
3925 \n\
3926 Return 1 if  all characters in S are alphabetic\n\
3927 and there is at least one character in S, 0 otherwise.";
3928
3929 static PyObject*
3930 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3931 {
3932     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3933     register const Py_UNICODE *e;
3934
3935     if (!PyArg_NoArgs(args))
3936         return NULL;
3937
3938     /* Shortcut for single character strings */
3939     if (PyUnicode_GET_SIZE(self) == 1 &&
3940         Py_UNICODE_ISALPHA(*p))
3941         return PyInt_FromLong(1);
3942
3943     /* Special case for empty strings */
3944     if (PyString_GET_SIZE(self) == 0)
3945         return PyInt_FromLong(0);
3946
3947     e = p + PyUnicode_GET_SIZE(self);
3948     for (; p < e; p++) {
3949         if (!Py_UNICODE_ISALPHA(*p))
3950             return PyInt_FromLong(0);
3951     }
3952     return PyInt_FromLong(1);
3953 }
3954
3955 static char isalnum__doc__[] =
3956 "S.isalnum() -> int\n\
3957 \n\
3958 Return 1 if  all characters in S are alphanumeric\n\
3959 and there is at least one character in S, 0 otherwise.";
3960
3961 static PyObject*
3962 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3963 {
3964     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3965     register const Py_UNICODE *e;
3966
3967     if (!PyArg_NoArgs(args))
3968         return NULL;
3969
3970     /* Shortcut for single character strings */
3971     if (PyUnicode_GET_SIZE(self) == 1 &&
3972         Py_UNICODE_ISALNUM(*p))
3973         return PyInt_FromLong(1);
3974
3975     /* Special case for empty strings */
3976     if (PyString_GET_SIZE(self) == 0)
3977         return PyInt_FromLong(0);
3978
3979     e = p + PyUnicode_GET_SIZE(self);
3980     for (; p < e; p++) {
3981         if (!Py_UNICODE_ISALNUM(*p))
3982             return PyInt_FromLong(0);
3983     }
3984     return PyInt_FromLong(1);
3985 }
3986
3987 static char isdecimal__doc__[] =
3988 "S.isdecimal() -> int\n\
3989 \n\
3990 Return 1 if there are only decimal characters in S,\n\
3991 0 otherwise.";
3992
3993 static PyObject*
3994 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3995 {
3996     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3997     register const Py_UNICODE *e;
3998
3999     if (!PyArg_NoArgs(args))
4000         return NULL;
4001
4002     /* Shortcut for single character strings */
4003     if (PyUnicode_GET_SIZE(self) == 1 &&
4004         Py_UNICODE_ISDECIMAL(*p))
4005         return PyInt_FromLong(1);
4006
4007     /* Special case for empty strings */
4008     if (PyString_GET_SIZE(self) == 0)
4009         return PyInt_FromLong(0);
4010
4011     e = p + PyUnicode_GET_SIZE(self);
4012     for (; p < e; p++) {
4013         if (!Py_UNICODE_ISDECIMAL(*p))
4014             return PyInt_FromLong(0);
4015     }
4016     return PyInt_FromLong(1);
4017 }
4018
4019 static char isdigit__doc__[] =
4020 "S.isdigit() -> int\n\
4021 \n\
4022 Return 1 if there are only digit characters in S,\n\
4023 0 otherwise.";
4024
4025 static PyObject*
4026 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
4027 {
4028     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4029     register const Py_UNICODE *e;
4030
4031     if (!PyArg_NoArgs(args))
4032         return NULL;
4033
4034     /* Shortcut for single character strings */
4035     if (PyUnicode_GET_SIZE(self) == 1 &&
4036         Py_UNICODE_ISDIGIT(*p))
4037         return PyInt_FromLong(1);
4038
4039     /* Special case for empty strings */
4040     if (PyString_GET_SIZE(self) == 0)
4041         return PyInt_FromLong(0);
4042
4043     e = p + PyUnicode_GET_SIZE(self);
4044     for (; p < e; p++) {
4045         if (!Py_UNICODE_ISDIGIT(*p))
4046             return PyInt_FromLong(0);
4047     }
4048     return PyInt_FromLong(1);
4049 }
4050
4051 static char isnumeric__doc__[] =
4052 "S.isnumeric() -> int\n\
4053 \n\
4054 Return 1 if there are only numeric characters in S,\n\
4055 0 otherwise.";
4056
4057 static PyObject*
4058 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4059 {
4060     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4061     register const Py_UNICODE *e;
4062
4063     if (!PyArg_NoArgs(args))
4064         return NULL;
4065
4066     /* Shortcut for single character strings */
4067     if (PyUnicode_GET_SIZE(self) == 1 &&
4068         Py_UNICODE_ISNUMERIC(*p))
4069         return PyInt_FromLong(1);
4070
4071     /* Special case for empty strings */
4072     if (PyString_GET_SIZE(self) == 0)
4073         return PyInt_FromLong(0);
4074
4075     e = p + PyUnicode_GET_SIZE(self);
4076     for (; p < e; p++) {
4077         if (!Py_UNICODE_ISNUMERIC(*p))
4078             return PyInt_FromLong(0);
4079     }
4080     return PyInt_FromLong(1);
4081 }
4082
4083 static char join__doc__[] =
4084 "S.join(sequence) -> unicode\n\
4085 \n\
4086 Return a string which is the concatenation of the strings in the\n\
4087 sequence.  The separator between elements is S.";
4088
4089 static PyObject*
4090 unicode_join(PyUnicodeObject *self, PyObject *args)
4091 {
4092     PyObject *data;
4093     if (!PyArg_ParseTuple(args, "O:join", &data))
4094         return NULL;
4095
4096     return PyUnicode_Join((PyObject *)self, data);
4097 }
4098
4099 static int
4100 unicode_length(PyUnicodeObject *self)
4101 {
4102     return self->length;
4103 }
4104
4105 static char ljust__doc__[] =
4106 "S.ljust(width) -> unicode\n\
4107 \n\
4108 Return S left justified in a Unicode string of length width. Padding is\n\
4109 done using spaces.";
4110
4111 static PyObject *
4112 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4113 {
4114     int width;
4115     if (!PyArg_ParseTuple(args, "i:ljust", &width))
4116         return NULL;
4117
4118     if (self->length >= width) {
4119         Py_INCREF(self);
4120         return (PyObject*) self;
4121     }
4122
4123     return (PyObject*) pad(self, 0, width - self->length, ' ');
4124 }
4125
4126 static char lower__doc__[] =
4127 "S.lower() -> unicode\n\
4128 \n\
4129 Return a copy of the string S converted to lowercase.";
4130
4131 static PyObject*
4132 unicode_lower(PyUnicodeObject *self, PyObject *args)
4133 {
4134     if (!PyArg_NoArgs(args))
4135         return NULL;
4136     return fixup(self, fixlower);
4137 }
4138
4139 static char lstrip__doc__[] =
4140 "S.lstrip() -> unicode\n\
4141 \n\
4142 Return a copy of the string S with leading whitespace removed.";
4143
4144 static PyObject *
4145 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4146 {
4147     if (!PyArg_NoArgs(args))
4148         return NULL;
4149     return strip(self, 1, 0);
4150 }
4151
4152 static PyObject*
4153 unicode_repeat(PyUnicodeObject *str, int len)
4154 {
4155     PyUnicodeObject *u;
4156     Py_UNICODE *p;
4157     int nchars;
4158     size_t nbytes;
4159
4160     if (len < 0)
4161         len = 0;
4162
4163     if (len == 1) {
4164         /* no repeat, return original string */
4165         Py_INCREF(str);
4166         return (PyObject*) str;
4167     }
4168
4169     /* ensure # of chars needed doesn't overflow int and # of bytes
4170      * needed doesn't overflow size_t
4171      */
4172     nchars = len * str->length;
4173     if (len && nchars / len != str->length) {
4174         PyErr_SetString(PyExc_OverflowError,
4175                         "repeated string is too long");
4176         return NULL;
4177     }
4178     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4179     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4180         PyErr_SetString(PyExc_OverflowError,
4181                         "repeated string is too long");
4182         return NULL;
4183     }
4184     u = _PyUnicode_New(nchars);
4185     if (!u)
4186         return NULL;
4187
4188     p = u->str;
4189
4190     while (len-- > 0) {
4191         Py_UNICODE_COPY(p, str->str, str->length);
4192         p += str->length;
4193     }
4194
4195     return (PyObject*) u;
4196 }
4197
4198 PyObject *PyUnicode_Replace(PyObject *obj,
4199                             PyObject *subobj,
4200                             PyObject *replobj,
4201                             int maxcount)
4202 {
4203     PyObject *self;
4204     PyObject *str1;
4205     PyObject *str2;
4206     PyObject *result;
4207
4208     self = PyUnicode_FromObject(obj);
4209     if (self == NULL)
4210         return NULL;
4211     str1 = PyUnicode_FromObject(subobj);
4212     if (str1 == NULL) {
4213         Py_DECREF(self);
4214         return NULL;
4215     }
4216     str2 = PyUnicode_FromObject(replobj);
4217     if (str2 == NULL) {
4218         Py_DECREF(self);
4219         Py_DECREF(str1);
4220         return NULL;
4221     }
4222     result = replace((PyUnicodeObject *)self,
4223                      (PyUnicodeObject *)str1,
4224                      (PyUnicodeObject *)str2,
4225                      maxcount);
4226     Py_DECREF(self);
4227     Py_DECREF(str1);
4228     Py_DECREF(str2);
4229     return result;
4230 }
4231
4232 static char replace__doc__[] =
4233 "S.replace (old, new[, maxsplit]) -> unicode\n\
4234 \n\
4235 Return a copy of S with all occurrences of substring\n\
4236 old replaced by new.  If the optional argument maxsplit is\n\
4237 given, only the first maxsplit occurrences are replaced.";
4238
4239 static PyObject*
4240 unicode_replace(PyUnicodeObject *self, PyObject *args)
4241 {
4242     PyUnicodeObject *str1;
4243     PyUnicodeObject *str2;
4244     int maxcount = -1;
4245     PyObject *result;
4246
4247     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4248         return NULL;
4249     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4250     if (str1 == NULL)
4251         return NULL;
4252     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4253     if (str2 == NULL)
4254         return NULL;
4255
4256     result = replace(self, str1, str2, maxcount);
4257
4258     Py_DECREF(str1);
4259     Py_DECREF(str2);
4260     return result;
4261 }
4262
4263 static
4264 PyObject *unicode_repr(PyObject *unicode)
4265 {
4266     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4267                                 PyUnicode_GET_SIZE(unicode),
4268                                 1);
4269 }
4270
4271 static char rfind__doc__[] =
4272 "S.rfind(sub [,start [,end]]) -> int\n\
4273 \n\
4274 Return the highest index in S where substring sub is found,\n\
4275 such that sub is contained within s[start,end].  Optional\n\
4276 arguments start and end are interpreted as in slice notation.\n\
4277 \n\
4278 Return -1 on failure.";
4279
4280 static PyObject *
4281 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4282 {
4283     PyUnicodeObject *substring;
4284     int start = 0;
4285     int end = INT_MAX;
4286     PyObject *result;
4287
4288     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4289                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4290         return NULL;
4291     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4292                                                 (PyObject *)substring);
4293     if (substring == NULL)
4294         return NULL;
4295
4296     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4297
4298     Py_DECREF(substring);
4299     return result;
4300 }
4301
4302 static char rindex__doc__[] =
4303 "S.rindex(sub [,start [,end]]) -> int\n\
4304 \n\
4305 Like S.rfind() but raise ValueError when the substring is not found.";
4306
4307 static PyObject *
4308 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4309 {
4310     int result;
4311     PyUnicodeObject *substring;
4312     int start = 0;
4313     int end = INT_MAX;
4314
4315     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4316                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4317         return NULL;
4318     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4319                                                 (PyObject *)substring);
4320     if (substring == NULL)
4321         return NULL;
4322
4323     result = findstring(self, substring, start, end, -1);
4324
4325     Py_DECREF(substring);
4326     if (result < 0) {
4327         PyErr_SetString(PyExc_ValueError, "substring not found");
4328         return NULL;
4329     }
4330     return PyInt_FromLong(result);
4331 }
4332
4333 static char rjust__doc__[] =
4334 "S.rjust(width) -> unicode\n\
4335 \n\
4336 Return S right justified in a Unicode string of length width. Padding is\n\
4337 done using spaces.";
4338
4339 static PyObject *
4340 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4341 {
4342     int width;
4343     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4344         return NULL;
4345
4346     if (self->length >= width) {
4347         Py_INCREF(self);
4348         return (PyObject*) self;
4349     }
4350
4351     return (PyObject*) pad(self, width - self->length, 0, ' ');
4352 }
4353
4354 static char rstrip__doc__[] =
4355 "S.rstrip() -> unicode\n\
4356 \n\
4357 Return a copy of the string S with trailing whitespace removed.";
4358
4359 static PyObject *
4360 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4361 {
4362     if (!PyArg_NoArgs(args))
4363         return NULL;
4364     return strip(self, 0, 1);
4365 }
4366
4367 static PyObject*
4368 unicode_slice(PyUnicodeObject *self, int start, int end)
4369 {
4370     /* standard clamping */
4371     if (start < 0)
4372         start = 0;
4373     if (end < 0)
4374         end = 0;
4375     if (end > self->length)
4376         end = self->length;
4377     if (start == 0 && end == self->length) {
4378         /* full slice, return original string */
4379         Py_INCREF(self);
4380         return (PyObject*) self;
4381     }
4382     if (start > end)
4383         start = end;
4384     /* copy slice */
4385     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4386                                              end - start);
4387 }
4388
4389 PyObject *PyUnicode_Split(PyObject *s,
4390                           PyObject *sep,
4391                           int maxsplit)
4392 {
4393     PyObject *result;
4394
4395     s = PyUnicode_FromObject(s);
4396     if (s == NULL)
4397         return NULL;
4398     if (sep != NULL) {
4399         sep = PyUnicode_FromObject(sep);
4400         if (sep == NULL) {
4401             Py_DECREF(s);
4402             return NULL;
4403         }
4404     }
4405
4406     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4407
4408     Py_DECREF(s);
4409     Py_XDECREF(sep);
4410     return result;
4411 }
4412
4413 static char split__doc__[] =
4414 "S.split([sep [,maxsplit]]) -> list of strings\n\
4415 \n\
4416 Return a list of the words in S, using sep as the\n\
4417 delimiter string.  If maxsplit is given, at most maxsplit\n\
4418 splits are done. If sep is not specified, any whitespace string\n\
4419 is a separator.";
4420
4421 static PyObject*
4422 unicode_split(PyUnicodeObject *self, PyObject *args)
4423 {
4424     PyObject *substring = Py_None;
4425     int maxcount = -1;
4426
4427     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4428         return NULL;
4429
4430     if (substring == Py_None)
4431         return split(self, NULL, maxcount);
4432     else if (PyUnicode_Check(substring))
4433         return split(self, (PyUnicodeObject *)substring, maxcount);
4434     else
4435         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4436 }
4437
4438 static char splitlines__doc__[] =
4439 "S.splitlines([keepends]]) -> list of strings\n\
4440 \n\
4441 Return a list of the lines in S, breaking at line boundaries.\n\
4442 Line breaks are not included in the resulting list unless keepends\n\
4443 is given and true.";
4444
4445 static PyObject*
4446 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4447 {
4448     int keepends = 0;
4449
4450     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4451         return NULL;
4452
4453     return PyUnicode_Splitlines((PyObject *)self, keepends);
4454 }
4455
4456 static
4457 PyObject *unicode_str(PyUnicodeObject *self)
4458 {
4459     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4460 }
4461
4462 static char strip__doc__[] =
4463 "S.strip() -> unicode\n\
4464 \n\
4465 Return a copy of S with leading and trailing whitespace removed.";
4466
4467 static PyObject *
4468 unicode_strip(PyUnicodeObject *self, PyObject *args)
4469 {
4470     if (!PyArg_NoArgs(args))
4471         return NULL;
4472     return strip(self, 1, 1);
4473 }
4474
4475 static char swapcase__doc__[] =
4476 "S.swapcase() -> unicode\n\
4477 \n\
4478 Return a copy of S with uppercase characters converted to lowercase\n\
4479 and vice versa.";
4480
4481 static PyObject*
4482 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4483 {
4484     if (!PyArg_NoArgs(args))
4485         return NULL;
4486     return fixup(self, fixswapcase);
4487 }
4488
4489 static char translate__doc__[] =
4490 "S.translate(table) -> unicode\n\
4491 \n\
4492 Return a copy of the string S, where all characters have been mapped\n\
4493 through the given translation table, which must be a mapping of\n\
4494 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4495 are left untouched. Characters mapped to None are deleted.";
4496
4497 static PyObject*
4498 unicode_translate(PyUnicodeObject *self, PyObject *args)
4499 {
4500     PyObject *table;
4501
4502     if (!PyArg_ParseTuple(args, "O:translate", &table))
4503         return NULL;
4504     return PyUnicode_TranslateCharmap(self->str,
4505                                       self->length,
4506                                       table,
4507                                       "ignore");
4508 }
4509
4510 static char upper__doc__[] =
4511 "S.upper() -> unicode\n\
4512 \n\
4513 Return a copy of S converted to uppercase.";
4514
4515 static PyObject*
4516 unicode_upper(PyUnicodeObject *self, PyObject *args)
4517 {
4518     if (!PyArg_NoArgs(args))
4519         return NULL;
4520     return fixup(self, fixupper);
4521 }
4522
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.

   When S is already at least width long, a new reference to S is
   returned.  Otherwise S is left-padded with '0' via pad(), and a
   leading '+' or '-' is moved in front of the padding.  Returns NULL
   if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; u->str must not be touched. */
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4558
#if 0
/* Debugging helper (compiled out): accepts no arguments and returns
   the current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4568
4569 static char startswith__doc__[] =
4570 "S.startswith(prefix[, start[, end]]) -> int\n\
4571 \n\
4572 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4573 optional start, test S beginning at that position.  With optional end, stop\n\
4574 comparing S at that position.";
4575
4576 static PyObject *
4577 unicode_startswith(PyUnicodeObject *self,
4578                    PyObject *args)
4579 {
4580     PyUnicodeObject *substring;
4581     int start = 0;
4582     int end = INT_MAX;
4583     PyObject *result;
4584
4585     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4586                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4587         return NULL;
4588     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4589                                                 (PyObject *)substring);
4590     if (substring == NULL)
4591         return NULL;
4592
4593     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4594
4595     Py_DECREF(substring);
4596     return result;
4597 }
4598
4599
4600 static char endswith__doc__[] =
4601 "S.endswith(suffix[, start[, end]]) -> int\n\
4602 \n\
4603 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4604 optional start, test S beginning at that position.  With optional end, stop\n\
4605 comparing S at that position.";
4606
4607 static PyObject *
4608 unicode_endswith(PyUnicodeObject *self,
4609                  PyObject *args)
4610 {
4611     PyUnicodeObject *substring;
4612     int start = 0;
4613     int end = INT_MAX;
4614     PyObject *result;
4615
4616     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4617                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4618         return NULL;
4619     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4620                                                 (PyObject *)substring);
4621     if (substring == NULL)
4622         return NULL;
4623
4624     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4625
4626     Py_DECREF(substring);
4627     return result;
4628 }
4629
4630
4631 static PyMethodDef unicode_methods[] = {
4632
4633     /* Order is according to common usage: often used methods should
4634        appear first, since lookup is done sequentially. */
4635
4636     {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4637     {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4638     {"split", (PyCFunction) unicode_split, 1, split__doc__},
4639     {"join", (PyCFunction) unicode_join, 1, join__doc__},
4640     {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4641     {"title", (PyCFunction) unicode_title, 0, title__doc__},
4642     {"center", (PyCFunction) unicode_center, 1, center__doc__},
4643     {"count", (PyCFunction) unicode_count, 1, count__doc__},
4644     {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4645     {"find", (PyCFunction) unicode_find, 1, find__doc__},
4646     {"index", (PyCFunction) unicode_index, 1, index__doc__},
4647     {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4648     {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4649     {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4650 /*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4651     {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4652     {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4653     {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4654     {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4655     {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4656     {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4657     {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4658     {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4659     {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4660     {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4661     {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4662     {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4663     {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4664     {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4665     {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4666     {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4667     {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4668     {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4669     {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4670     {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4671 #if 0
4672     {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4673     {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4674 #endif
4675
4676 #if 0
4677     /* This one is just used for debugging the implementation. */
4678     {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4679 #endif
4680
4681     {NULL, NULL}
4682 };
4683
4684 static PySequenceMethods unicode_as_sequence = {
4685     (inquiry) unicode_length,           /* sq_length */
4686     (binaryfunc) PyUnicode_Concat,      /* sq_concat */
4687     (intargfunc) unicode_repeat,        /* sq_repeat */
4688     (intargfunc) unicode_getitem,       /* sq_item */
4689     (intintargfunc) unicode_slice,      /* sq_slice */
4690     0,                                  /* sq_ass_item */
4691     0,                                  /* sq_ass_slice */
4692     (objobjproc)PyUnicode_Contains,     /*sq_contains*/
4693 };
4694
4695 static int
4696 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4697                           int index,
4698                           const void **ptr)
4699 {
4700     if (index != 0) {
4701         PyErr_SetString(PyExc_SystemError,
4702                         "accessing non-existent unicode segment");
4703         return -1;
4704     }
4705     *ptr = (void *) self->str;
4706     return PyUnicode_GET_DATA_SIZE(self);
4707 }
4708
4709 static int
4710 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4711                            const void **ptr)
4712 {
4713     PyErr_SetString(PyExc_TypeError,
4714                     "cannot use unicode as modifyable buffer");
4715     return -1;
4716 }
4717
4718 static int
4719 unicode_buffer_getsegcount(PyUnicodeObject *self,
4720                            int *lenp)
4721 {
4722     if (lenp)
4723         *lenp = PyUnicode_GET_DATA_SIZE(self);
4724     return 1;
4725 }
4726
4727 static int
4728 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4729                           int index,
4730                           const void **ptr)
4731 {
4732     PyObject *str;
4733
4734     if (index != 0) {
4735         PyErr_SetString(PyExc_SystemError,
4736                         "accessing non-existent unicode segment");
4737         return -1;
4738     }
4739     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4740     if (str == NULL)
4741         return -1;
4742     *ptr = (void *) PyString_AS_STRING(str);
4743     return PyString_GET_SIZE(str);
4744 }
4745
4746 /* Helpers for PyUnicode_Format() */
4747
4748 static PyObject *
4749 getnextarg(PyObject *args, int arglen, int *p_argidx)
4750 {
4751     int argidx = *p_argidx;
4752     if (argidx < arglen) {
4753         (*p_argidx)++;
4754         if (arglen < 0)
4755             return args;
4756         else
4757             return PyTuple_GetItem(args, argidx);
4758     }
4759     PyErr_SetString(PyExc_TypeError,
4760                     "not enough arguments for format string");
4761     return NULL;
4762 }
4763
/* Bit flags for the conversion-flag characters of a format specifier;
   each occupies its own bit so they can be OR-ed together. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4769
4770 static
4771 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4772 {
4773     register int i;
4774     int len;
4775     va_list va;
4776     char *charbuffer;
4777     va_start(va, format);
4778
4779     /* First, format the string as char array, then expand to Py_UNICODE
4780        array. */
4781     charbuffer = (char *)buffer;
4782     len = vsprintf(charbuffer, format, va);
4783     for (i = len - 1; i >= 0; i--)
4784         buffer[i] = (Py_UNICODE) charbuffer[i];
4785
4786     va_end(va);
4787     return len;
4788 }
4789
4790 static int
4791 formatfloat(Py_UNICODE *buf,
4792             size_t buflen,
4793             int flags,
4794             int prec,
4795             int type,
4796             PyObject *v)
4797 {
4798     /* fmt = '%#.' + `prec` + `type`
4799        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4800     char fmt[20];
4801     double x;
4802
4803     x = PyFloat_AsDouble(v);
4804     if (x == -1.0 && PyErr_Occurred())
4805         return -1;
4806     if (prec < 0)
4807         prec = 6;
4808     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4809         type = 'g';
4810     sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4811     /* worst case length calc to ensure no buffer overrun:
4812          fmt = %#.<prec>g
4813          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4814             for any double rep.)
4815          len = 1 + prec + 1 + 2 + 5 = 9 + prec
4816        If prec=0 the effective precision is 1 (the leading digit is
4817        always given), therefore increase by one to 10+prec. */
4818     if (buflen <= (size_t)10 + (size_t)prec) {
4819         PyErr_SetString(PyExc_OverflowError,
4820             "formatted float is too long (precision too long?)");
4821         return -1;
4822     }
4823     return usprintf(buf, fmt, x);
4824 }
4825
4826 static PyObject*
4827 formatlong(PyObject *val, int flags, int prec, int type)
4828 {
4829         char *buf;
4830         int i, len;
4831         PyObject *str; /* temporary string object. */
4832         PyUnicodeObject *result;
4833
4834         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4835         if (!str)
4836                 return NULL;
4837         result = _PyUnicode_New(len);
4838         for (i = 0; i < len; i++)
4839                 result->str[i] = buf[i];
4840         result->str[len] = 0;
4841         Py_DECREF(str);
4842         return (PyObject*)result;
4843 }
4844
4845 static int
4846 formatint(Py_UNICODE *buf,
4847           size_t buflen,
4848           int flags,
4849           int prec,
4850           int type,
4851           PyObject *v)
4852 {
4853     /* fmt = '%#.' + `prec` + 'l' + `type`
4854        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4855        + 1 + 1 = 24*/
4856     char fmt[64]; /* plenty big enough! */
4857     long x;
4858     int use_native_c_format = 1;
4859
4860     x = PyInt_AsLong(v);
4861     if (x == -1 && PyErr_Occurred())
4862         return -1;
4863     if (prec < 0)
4864         prec = 1;
4865     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4866        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4867     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4868         PyErr_SetString(PyExc_OverflowError,
4869             "formatted integer is too long (precision too long?)");
4870         return -1;
4871     }
4872     /* When converting 0 under %#x or %#X, C leaves off the base marker,
4873      * but we want it (for consistency with other %#x conversions, and
4874      * for consistency with Python's hex() function).
4875      * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
4876      * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4877      * So add it only if the platform doesn't already.
4878      */
4879     if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4880         /* Only way to know what the platform does is to try it. */
4881         sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4882         if (fmt[1] != (char)type) {
4883             /* Supply our own leading 0x/0X -- needed under std C */
4884             use_native_c_format = 0;
4885             sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4886         }
4887     }
4888     if (use_native_c_format)
4889          sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4890     return usprintf(buf, fmt, x);
4891 }
4892
4893 static int
4894 formatchar(Py_UNICODE *buf,
4895            size_t buflen,
4896            PyObject *v)
4897 {
4898     /* presume that the buffer is at least 2 characters long */
4899     if (PyUnicode_Check(v)) {
4900         if (PyUnicode_GET_SIZE(v) != 1)
4901             goto onError;
4902         buf[0] = PyUnicode_AS_UNICODE(v)[0];
4903     }
4904
4905     else if (PyString_Check(v)) {
4906         if (PyString_GET_SIZE(v) != 1)
4907             goto onError;
4908         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4909     }
4910
4911     else {
4912         /* Integer input truncated to a character */
4913         long x;
4914         x = PyInt_AsLong(v);
4915         if (x == -1 && PyErr_Occurred())
4916             goto onError;
4917         buf[0] = (char) x;
4918     }
4919     buf[1] = '\0';
4920     return 1;
4921
4922  onError:
4923     PyErr_SetString(PyExc_TypeError,
4924                     "%c requires int or char");
4925     return -1;
4926 }
4927
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. after a unary operator or
   sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
4937
4938 PyObject *PyUnicode_Format(PyObject *format,
4939                            PyObject *args)
4940 {
4941     Py_UNICODE *fmt, *res;
4942     int fmtcnt, rescnt, reslen, arglen, argidx;
4943     int args_owned = 0;
4944     PyUnicodeObject *result = NULL;
4945     PyObject *dict = NULL;
4946     PyObject *uformat;
4947
4948     if (format == NULL || args == NULL) {
4949         PyErr_BadInternalCall();
4950         return NULL;
4951     }
4952     uformat = PyUnicode_FromObject(format);
4953     if (uformat == NULL)
4954         return NULL;
4955     fmt = PyUnicode_AS_UNICODE(uformat);
4956     fmtcnt = PyUnicode_GET_SIZE(uformat);
4957
4958     reslen = rescnt = fmtcnt + 100;
4959     result = _PyUnicode_New(reslen);
4960     if (result == NULL)
4961         goto onError;
4962     res = PyUnicode_AS_UNICODE(result);
4963
4964     if (PyTuple_Check(args)) {
4965         arglen = PyTuple_Size(args);
4966         argidx = 0;
4967     }
4968     else {
4969         arglen = -1;
4970         argidx = -2;
4971     }
4972     if (args->ob_type->tp_as_mapping)
4973         dict = args;
4974
4975     while (--fmtcnt >= 0) {
4976         if (*fmt != '%') {
4977             if (--rescnt < 0) {
4978                 rescnt = fmtcnt + 100;
4979                 reslen += rescnt;
4980                 if (_PyUnicode_Resize(&result, reslen) < 0)
4981                     return NULL;
4982                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4983                 --rescnt;
4984             }
4985             *res++ = *fmt++;
4986         }
4987         else {
4988             /* Got a format specifier */
4989             int flags = 0;
4990             int width = -1;
4991             int prec = -1;
4992             Py_UNICODE c = '\0';
4993             Py_UNICODE fill;
4994             PyObject *v = NULL;
4995             PyObject *temp = NULL;
4996             Py_UNICODE *pbuf;
4997             Py_UNICODE sign;
4998             int len;
4999             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5000
5001             fmt++;
5002             if (*fmt == '(') {
5003                 Py_UNICODE *keystart;
5004                 int keylen;
5005                 PyObject *key;
5006                 int pcount = 1;
5007
5008                 if (dict == NULL) {
5009                     PyErr_SetString(PyExc_TypeError,
5010                                     "format requires a mapping");
5011                     goto onError;
5012                 }
5013                 ++fmt;
5014                 --fmtcnt;
5015                 keystart = fmt;
5016                 /* Skip over balanced parentheses */
5017                 while (pcount > 0 && --fmtcnt >= 0) {
5018                     if (*fmt == ')')
5019                         --pcount;
5020                     else if (*fmt == '(')
5021                         ++pcount;
5022                     fmt++;
5023                 }
5024                 keylen = fmt - keystart - 1;
5025                 if (fmtcnt < 0 || pcount > 0) {
5026                     PyErr_SetString(PyExc_ValueError,
5027                                     "incomplete format key");
5028                     goto onError;
5029                 }
5030                 /* keys are converted to strings using UTF-8 and
5031                    then looked up since Python uses strings to hold
5032                    variables names etc. in its namespaces and we
5033                    wouldn't want to break common idioms. */
5034                 key = PyUnicode_EncodeUTF8(keystart,
5035                                            keylen,
5036                                            NULL);
5037                 if (key == NULL)
5038                     goto onError;
5039                 if (args_owned) {
5040                     Py_DECREF(args);
5041                     args_owned = 0;
5042                 }
5043                 args = PyObject_GetItem(dict, key);
5044                 Py_DECREF(key);
5045                 if (args == NULL) {
5046                     goto onError;
5047                 }
5048                 args_owned = 1;
5049                 arglen = -1;
5050                 argidx = -2;
5051             }
5052             while (--fmtcnt >= 0) {
5053                 switch (c = *fmt++) {
5054                 case '-': flags |= F_LJUST; continue;
5055                 case '+': flags |= F_SIGN; continue;
5056                 case ' ': flags |= F_BLANK; continue;
5057                 case '#': flags |= F_ALT; continue;
5058                 case '0': flags |= F_ZERO; continue;
5059                 }
5060                 break;
5061             }
5062             if (c == '*') {
5063                 v = getnextarg(args, arglen, &argidx);
5064                 if (v == NULL)
5065                     goto onError;
5066                 if (!PyInt_Check(v)) {
5067                     PyErr_SetString(PyExc_TypeError,
5068                                     "* wants int");
5069                     goto onError;
5070                 }
5071                 width = PyInt_AsLong(v);
5072                 if (width < 0) {
5073                     flags |= F_LJUST;
5074                     width = -width;
5075                 }
5076                 if (--fmtcnt >= 0)
5077                     c = *fmt++;
5078             }
5079             else if (c >= '0' && c <= '9') {
5080                 width = c - '0';
5081                 while (--fmtcnt >= 0) {
5082                     c = *fmt++;
5083                     if (c < '0' || c > '9')
5084                         break;
5085                     if ((width*10) / 10 != width) {
5086                         PyErr_SetString(PyExc_ValueError,
5087                                         "width too big");
5088                         goto onError;
5089                     }
5090                     width = width*10 + (c - '0');
5091                 }
5092             }
5093             if (c == '.') {
5094                 prec = 0;
5095                 if (--fmtcnt >= 0)
5096                     c = *fmt++;
5097                 if (c == '*') {
5098                     v = getnextarg(args, arglen, &argidx);
5099                     if (v == NULL)
5100                         goto onError;
5101                     if (!PyInt_Check(v)) {
5102                         PyErr_SetString(PyExc_TypeError,
5103                                         "* wants int");
5104                         goto onError;
5105                     }
5106                     prec = PyInt_AsLong(v);
5107                     if (prec < 0)
5108                         prec = 0;
5109                     if (--fmtcnt >= 0)
5110                         c = *fmt++;
5111                 }
5112                 else if (c >= '0' && c <= '9') {
5113                     prec = c - '0';
5114                     while (--fmtcnt >= 0) {
5115                         c = Py_CHARMASK(*fmt++);
5116                         if (c < '0' || c > '9')
5117                             break;
5118                         if ((prec*10) / 10 != prec) {
5119                             PyErr_SetString(PyExc_ValueError,
5120                                             "prec too big");
5121                             goto onError;
5122                         }
5123                         prec = prec*10 + (c - '0');
5124                     }
5125                 }
5126             } /* prec */
5127             if (fmtcnt >= 0) {
5128                 if (c == 'h' || c == 'l' || c == 'L') {
5129                     if (--fmtcnt >= 0)
5130                         c = *fmt++;
5131                 }
5132             }
5133             if (fmtcnt < 0) {
5134                 PyErr_SetString(PyExc_ValueError,
5135                                 "incomplete format");
5136                 goto onError;
5137             }
5138             if (c != '%') {
5139                 v = getnextarg(args, arglen, &argidx);
5140                 if (v == NULL)
5141                     goto onError;
5142             }
5143             sign = 0;
5144             fill = ' ';
5145             switch (c) {
5146
5147             case '%':
5148                 pbuf = formatbuf;
5149                 /* presume that buffer length is at least 1 */
5150                 pbuf[0] = '%';
5151                 len = 1;
5152                 break;
5153
5154             case 's':
5155             case 'r':
5156                 if (PyUnicode_Check(v) && c == 's') {
5157                     temp = v;
5158                     Py_INCREF(temp);
5159                 }
5160                 else {
5161                     PyObject *unicode;
5162                     if (c == 's')
5163                         temp = PyObject_Str(v);
5164                     else
5165                         temp = PyObject_Repr(v);
5166                     if (temp == NULL)
5167                         goto onError;
5168                     if (!PyString_Check(temp)) {
5169                         /* XXX Note: this should never happen, since
5170                                PyObject_Repr() and PyObject_Str() assure
5171                                this */
5172                         Py_DECREF(temp);
5173                         PyErr_SetString(PyExc_TypeError,
5174                                         "%s argument has non-string str()");
5175                         goto onError;
5176                     }
5177                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5178                                                    PyString_GET_SIZE(temp),
5179                                                NULL,
5180                                                    "strict");
5181                     Py_DECREF(temp);
5182                     temp = unicode;
5183                     if (temp == NULL)
5184                         goto onError;
5185                 }
5186                 pbuf = PyUnicode_AS_UNICODE(temp);
5187                 len = PyUnicode_GET_SIZE(temp);
5188                 if (prec >= 0 && len > prec)
5189                     len = prec;
5190                 break;
5191
5192             case 'i':
5193             case 'd':
5194             case 'u':
5195             case 'o':
5196             case 'x':
5197             case 'X':
5198                 if (c == 'i')
5199                     c = 'd';
5200                 if (PyLong_Check(v)) {
5201                     temp = formatlong(v, flags, prec, c);
5202                     if (!temp)
5203                         goto onError;
5204                     pbuf = PyUnicode_AS_UNICODE(temp);
5205                     len = PyUnicode_GET_SIZE(temp);
5206                     /* unbounded ints can always produce
5207                        a sign character! */
5208                     sign = 1;
5209                 }
5210                 else {
5211                     pbuf = formatbuf;
5212                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5213                                     flags, prec, c, v);
5214                     if (len < 0)
5215                         goto onError;
5216                     /* only d conversion is signed */
5217                     sign = c == 'd';
5218                 }
5219                 if (flags & F_ZERO)
5220                     fill = '0';
5221                 break;
5222
5223             case 'e':
5224             case 'E':
5225             case 'f':
5226             case 'g':
5227             case 'G':
5228                 pbuf = formatbuf;
5229                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5230                         flags, prec, c, v);
5231                 if (len < 0)
5232                     goto onError;
5233                 sign = 1;
5234                 if (flags & F_ZERO)
5235                     fill = '0';
5236                 break;
5237
5238             case 'c':
5239                 pbuf = formatbuf;
5240                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5241                 if (len < 0)
5242                     goto onError;
5243                 break;
5244
5245             default:
5246                 PyErr_Format(PyExc_ValueError,
5247                              "unsupported format character '%c' (0x%x) "
5248                              "at index %i",
5249                              (31<=c && c<=126) ? c : '?',
5250                              c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5251                 goto onError;
5252             }
5253             if (sign) {
5254                 if (*pbuf == '-' || *pbuf == '+') {
5255                     sign = *pbuf++;
5256                     len--;
5257                 }
5258                 else if (flags & F_SIGN)
5259                     sign = '+';
5260                 else if (flags & F_BLANK)
5261                     sign = ' ';
5262                 else
5263                     sign = 0;
5264             }
5265             if (width < len)
5266                 width = len;
5267             if (rescnt < width + (sign != 0)) {
5268                 reslen -= rescnt;
5269                 rescnt = width + fmtcnt + 100;
5270                 reslen += rescnt;
5271                 if (_PyUnicode_Resize(&result, reslen) < 0)
5272                     return NULL;
5273                 res = PyUnicode_AS_UNICODE(result)
5274                     + reslen - rescnt;
5275             }
5276             if (sign) {
5277                 if (fill != ' ')
5278                     *res++ = sign;
5279                 rescnt--;
5280                 if (width > len)
5281                     width--;
5282             }
5283             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5284                 assert(pbuf[0] == '0');
5285                 assert(pbuf[1] == c);
5286                 if (fill != ' ') {
5287                     *res++ = *pbuf++;
5288                     *res++ = *pbuf++;
5289                 }
5290                 rescnt -= 2;
5291                 width -= 2;
5292                 if (width < 0)
5293                     width = 0;
5294                 len -= 2;
5295             }
5296             if (width > len && !(flags & F_LJUST)) {
5297                 do {
5298                     --rescnt;
5299                     *res++ = fill;
5300                 } while (--width > len);
5301             }
5302             if (fill == ' ') {
5303                 if (sign)
5304                     *res++ = sign;
5305                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5306                     assert(pbuf[0] == '0');
5307                     assert(pbuf[1] == c);
5308                     *res++ = *pbuf++;
5309                     *res++ = *pbuf++;
5310                 }
5311             }
5312             Py_UNICODE_COPY(res, pbuf, len);
5313             res += len;
5314             rescnt -= len;
5315             while (--width >= len) {
5316                 --rescnt;
5317                 *res++ = ' ';
5318             }
5319             if (dict && (argidx < arglen) && c != '%') {
5320                 PyErr_SetString(PyExc_TypeError,
5321                                 "not all arguments converted");
5322                 goto onError;
5323             }
5324             Py_XDECREF(temp);
5325         } /* '%' */
5326     } /* until end */
5327     if (argidx < arglen && !dict) {
5328         PyErr_SetString(PyExc_TypeError,
5329                         "not all arguments converted");
5330         goto onError;
5331     }
5332
5333     if (args_owned) {
5334         Py_DECREF(args);
5335     }
5336     Py_DECREF(uformat);
5337     if (_PyUnicode_Resize(&result, reslen - rescnt))
5338         goto onError;
5339     return (PyObject *)result;
5340
5341  onError:
5342     Py_XDECREF(result);
5343     Py_DECREF(uformat);
5344     if (args_owned) {
5345         Py_DECREF(args);
5346     }
5347     return NULL;
5348 }
5349
5350 static PyBufferProcs unicode_as_buffer = {
5351     (getreadbufferproc) unicode_buffer_getreadbuf,
5352     (getwritebufferproc) unicode_buffer_getwritebuf,
5353     (getsegcountproc) unicode_buffer_getsegcount,
5354     (getcharbufferproc) unicode_buffer_getcharbuf,
5355 };
5356
5357 static PyObject *
5358 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5359 {
5360         PyObject *x = NULL;
5361         static char *kwlist[] = {"string", "encoding", "errors", 0};
5362         char *encoding = NULL;
5363         char *errors = NULL;
5364
5365         assert(type == &PyUnicode_Type);
5366         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5367                                           kwlist, &x, &encoding, &errors))
5368             return NULL;
5369         if (x == NULL)
5370                 return (PyObject *)_PyUnicode_New(0);
5371         return PyUnicode_FromEncodedObject(x, encoding, errors);
5372 }
5373
/* Documentation string of the unicode type; referenced as tp_doc in
   the PyUnicode_Type initializer.  Built from adjacent string literals,
   one per output line. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5380
5381 PyTypeObject PyUnicode_Type = {
5382     PyObject_HEAD_INIT(&PyType_Type)
5383     0,                                  /* ob_size */
5384     "unicode",                          /* tp_name */
5385     sizeof(PyUnicodeObject),            /* tp_size */
5386     0,                                  /* tp_itemsize */
5387     /* Slots */
5388     (destructor)_PyUnicode_Free,        /* tp_dealloc */
5389     0,                                  /* tp_print */
5390     0,                                  /* tp_getattr */
5391     0,                                  /* tp_setattr */
5392     (cmpfunc) unicode_compare,          /* tp_compare */
5393     (reprfunc) unicode_repr,            /* tp_repr */
5394     0,                                  /* tp_as_number */
5395     &unicode_as_sequence,               /* tp_as_sequence */
5396     0,                                  /* tp_as_mapping */
5397     (hashfunc) unicode_hash,            /* tp_hash*/
5398     0,                                  /* tp_call*/
5399     (reprfunc) unicode_str,             /* tp_str */
5400     PyObject_GenericGetAttr,            /* tp_getattro */
5401     0,                                  /* tp_setattro */
5402     &unicode_as_buffer,                 /* tp_as_buffer */
5403     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
5404     unicode_doc,                        /* tp_doc */
5405     0,                                  /* tp_traverse */
5406     0,                                  /* tp_clear */
5407     0,                                  /* tp_richcompare */
5408     0,                                  /* tp_weaklistoffset */
5409     0,                                  /* tp_iter */
5410     0,                                  /* tp_iternext */
5411     unicode_methods,                    /* tp_methods */
5412     0,                                  /* tp_members */
5413     0,                                  /* tp_getset */
5414     0,                                  /* tp_base */
5415     0,                                  /* tp_dict */
5416     0,                                  /* tp_descr_get */
5417     0,                                  /* tp_descr_set */
5418     0,                                  /* tp_dictoffset */
5419     0,                                  /* tp_init */
5420     0,                                  /* tp_alloc */
5421     unicode_new,                        /* tp_new */
5422 };
5423
5424 /* Initialize the Unicode implementation */
5425
5426 void _PyUnicode_Init(void)
5427 {
5428     int i;
5429
5430     /* Init the implementation */
5431     unicode_freelist = NULL;
5432     unicode_freelist_size = 0;
5433     unicode_empty = _PyUnicode_New(0);
5434     strcpy(unicode_default_encoding, "ascii");
5435     for (i = 0; i < 256; i++)
5436         unicode_latin1[i] = NULL;
5437 }
5438
5439 /* Finalize the Unicode implementation */
5440
5441 void
5442 _PyUnicode_Fini(void)
5443 {
5444     PyUnicodeObject *u;
5445     int i;
5446
5447     Py_XDECREF(unicode_empty);
5448     unicode_empty = NULL;
5449
5450     for (i = 0; i < 256; i++) {
5451         if (unicode_latin1[i]) {
5452             Py_DECREF(unicode_latin1[i]);
5453             unicode_latin1[i] = NULL;
5454         }
5455     }
5456
5457     for (u = unicode_freelist; u != NULL;) {
5458         PyUnicodeObject *v = u;
5459         u = *(PyUnicodeObject **)u;
5460         if (v->str)
5461             PyMem_DEL(v->str);
5462         Py_XDECREF(v->defenc);
5463         PyObject_DEL(v);
5464     }
5465     unicode_freelist = NULL;
5466     unicode_freelist_size = 0;
5467 }