Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9
  10  Original header:
  11  --------------------------------------------------------------------
  12
  13  * Yet another Unicode string type for Python.  This type supports the
  14  * 16-bit Basic Multilingual Plane (BMP) only.
  15  *
  16  * Note that this string class supports embedded NULL characters.  End
  17  * of string is given by the length attribute.  However, the internal
  18  * representation always stores a trailing NULL to make it easier to
  19  * use unicode strings with standard APIs.
  20  *
  21  * History:
  22  * 1999-01-23 fl  Created
  23  * 1999-01-24 fl  Added split, join, capwords; basic UTF-8 support
  24  * 1999-01-24 fl  Basic UCS-2 support, buffer interface, etc.
  25  * 1999-03-06 fl  Moved declarations to separate file, etc.
  26  * 1999-06-13 fl  Changed join method semantics according to Tim's proposal
  27  * 1999-08-10 fl  Some minor tweaks
  28  *
  29  * Written by Fredrik Lundh, January 1999.
  30  *
  31  * Copyright (c) 1999 by Secret Labs AB.
  32  * Copyright (c) 1999 by Fredrik Lundh.
  33  *
  34  * fredrik@pythonware.com
  35  * http://www.pythonware.com
  36  *
  37  * --------------------------------------------------------------------
  38  * This Unicode String Type is
  39  *
  40  * Copyright (c) 1999 by Secret Labs AB
  41  * Copyright (c) 1999 by Fredrik Lundh
  42  *
  43  * By obtaining, using, and/or copying this software and/or its
  44  * associated documentation, you agree that you have read, understood,
  45  * and will comply with the following terms and conditions:
  46  *
  47  * Permission to use, copy, modify, and distribute this software and its
  48  * associated documentation for any purpose and without fee is hereby
  49  * granted, provided that the above copyright notice appears in all
  50  * copies, and that both that copyright notice and this permission notice
  51  * appear in supporting documentation, and that the name of Secret Labs
  52  * AB or the author not be used in advertising or publicity pertaining to
  53  * distribution of the software without specific, written prior
  54  * permission.
  55  *
  56  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  57  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  58  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  59  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  60  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  61  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  62  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  63  * -------------------------------------------------------------------- */
  64
  65 #include "Python.h"
  66
  67 #include "unicodeobject.h"
  68 #include "ucnhash.h"
  69
  70 #ifdef MS_WIN32
  71 #include <windows.h>
  72 #endif
  73
  74 /* Limit for the Unicode object free list */
  75
  76 #define MAX_UNICODE_FREELIST_SIZE       1024
  77
  78 /* Limit for the Unicode object free list stay alive optimization.
  79
  80    The implementation will keep allocated Unicode memory intact for
  81    all objects on the free list having a size less than this
  82    limit. This reduces malloc() overhead for small Unicode objects.
  83
  84    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  85    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  86    malloc()-overhead) bytes of unused garbage.
  87
  88    Setting the limit to 0 effectively turns the feature off.
  89
  90    Note: This is an experimental feature ! If you get core dumps when
  91    using Unicode objects, turn this feature off.
  92
  93 */
  94
  95 #define KEEPALIVE_SIZE_LIMIT       9
  96
  97 /* Endianness switches; defaults to little endian */
  98
  99 #ifdef WORDS_BIGENDIAN
 100 # define BYTEORDER_IS_BIG_ENDIAN
 101 #else
 102 # define BYTEORDER_IS_LITTLE_ENDIAN
 103 #endif
 104
 105 /* --- Globals ------------------------------------------------------------
 106
 107    The globals are initialized by the _PyUnicode_Init() API and should
 108    not be used before calling that API.
 109
 110 */
 111
 112 /* The empty Unicode object */
 113 static PyUnicodeObject *unicode_empty;
 114
 115 /* Free list for Unicode objects */
 116 static PyUnicodeObject *unicode_freelist;
 117 static int unicode_freelist_size;
 118
 119 /* Default encoding to use and assume when NULL is passed as encoding
 120    parameter; it is initialized by _PyUnicode_Init().
 121
 122    Always use the PyUnicode_SetDefaultEncoding() and
 123    PyUnicode_GetDefaultEncoding() APIs to access this global.
 124
 125 */
 126
 127 static char unicode_default_encoding[100];
 128
 129 /* --- Unicode Object ----------------------------------------------------- */
 130
 131 static
 132 int _PyUnicode_Resize(register PyUnicodeObject *unicode,
 133                       int length)
 134 {
 135     void *oldstr;
 136
 137     /* Shortcut if there's nothing much to do. */
 138     if (unicode->length == length)
 139         goto reset;
 140
 141     /* Resizing unicode_empty is not allowed. */
 142     if (unicode == unicode_empty) {
 143         PyErr_SetString(PyExc_SystemError,
 144                         "can't resize empty unicode object");
 145         return -1;
 146     }
 147
 148     /* We allocate one more byte to make sure the string is
 149        Ux0000 terminated -- XXX is this needed ? */
 150     oldstr = unicode->str;
 151     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 152     if (!unicode->str) {
 153         unicode->str = oldstr;
 154         PyErr_NoMemory();
 155         return -1;
 156     }
 157     unicode->str[length] = 0;
 158     unicode->length = length;
 159
 160  reset:
 161     /* Reset the object caches */
 162     if (unicode->defenc) {
 163         Py_DECREF(unicode->defenc);
 164         unicode->defenc = NULL;
 165     }
 166     unicode->hash = -1;
 167
 168     return 0;
 169 }
 170
 171 int PyUnicode_Resize(PyObject **unicode,
 172                      int length)
 173 {
 174     PyUnicodeObject *v;
 175
 176     if (unicode == NULL) {
 177         PyErr_BadInternalCall();
 178         return -1;
 179     }
 180     v = (PyUnicodeObject *)*unicode;
 181     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 182         PyErr_BadInternalCall();
 183         return -1;
 184     }
 185     return _PyUnicode_Resize(v, length);
 186 }
 187
 188 /* We allocate one more byte to make sure the string is
 189    Ux0000 terminated -- XXX is this needed ?
 190
 191    XXX This allocator could further be enhanced by assuring that the
 192        free list never reduces its size below 1.
 193
 194 */
 195
 196 static
 197 PyUnicodeObject *_PyUnicode_New(int length)
 198 {
 199     register PyUnicodeObject *unicode;
 200
 201     /* Optimization for empty strings */
 202     if (length == 0 && unicode_empty != NULL) {
 203         Py_INCREF(unicode_empty);
 204         return unicode_empty;
 205     }
 206
 207     /* Unicode freelist & memory allocation */
 208     if (unicode_freelist) {
 209         unicode = unicode_freelist;
 210         unicode_freelist = *(PyUnicodeObject **)unicode;
 211         unicode_freelist_size--;
 212         if (unicode->str) {
 213             /* Keep-Alive optimization: we only upsize the buffer,
 214                never downsize it. */
 215             if ((unicode->length < length) &&
 216                 _PyUnicode_Resize(unicode, length)) {
 217                 PyMem_DEL(unicode->str);
 218                 goto onError;
 219             }
 220         }
 221       else {
 222             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 223       }
 224       PyObject_INIT(unicode, &PyUnicode_Type);
 225     }
 226     else {
 227         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 228         if (unicode == NULL)
 229             return NULL;
 230         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 231     }
 232
 233     if (!unicode->str) {
 234         PyErr_NoMemory();
 235         goto onError;
 236     }
 237     unicode->str[length] = 0;
 238     unicode->length = length;
 239     unicode->hash = -1;
 240     unicode->defenc = NULL;
 241     return unicode;
 242
 243  onError:
 244     _Py_ForgetReference((PyObject *)unicode);
 245     PyObject_DEL(unicode);
 246     return NULL;
 247 }
 248
 249 static
 250 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 251 {
 252     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 253         /* Keep-Alive optimization */
 254         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 255             PyMem_DEL(unicode->str);
 256             unicode->str = NULL;
 257             unicode->length = 0;
 258         }
 259         if (unicode->defenc) {
 260             Py_DECREF(unicode->defenc);
 261             unicode->defenc = NULL;
 262         }
 263         /* Add to free list */
 264         *(PyUnicodeObject **)unicode = unicode_freelist;
 265         unicode_freelist = unicode;
 266         unicode_freelist_size++;
 267     }
 268     else {
 269         PyMem_DEL(unicode->str);
 270         Py_XDECREF(unicode->defenc);
 271         PyObject_DEL(unicode);
 272     }
 273 }
 274
 275 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 276                                 int size)
 277 {
 278     PyUnicodeObject *unicode;
 279
 280     unicode = _PyUnicode_New(size);
 281     if (!unicode)
 282         return NULL;
 283
 284     /* Copy the Unicode data into the new object */
 285     if (u != NULL)
 286         memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
 287
 288     return (PyObject *)unicode;
 289 }
 290
 291 #ifdef HAVE_WCHAR_H
 292
 293 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 294                                  int size)
 295 {
 296     PyUnicodeObject *unicode;
 297
 298     if (w == NULL) {
 299         PyErr_BadInternalCall();
 300         return NULL;
 301     }
 302
 303     unicode = _PyUnicode_New(size);
 304     if (!unicode)
 305         return NULL;
 306
 307     /* Copy the wchar_t data into the new object */
 308 #ifdef HAVE_USABLE_WCHAR_T
 309     memcpy(unicode->str, w, size * sizeof(wchar_t));
 310 #else
 311     {
 312         register Py_UNICODE *u;
 313         register int i;
 314         u = PyUnicode_AS_UNICODE(unicode);
 315         for (i = size; i >= 0; i--)
 316             *u++ = *w++;
 317     }
 318 #endif
 319
 320     return (PyObject *)unicode;
 321 }
 322
 323 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 324                          register wchar_t *w,
 325                          int size)
 326 {
 327     if (unicode == NULL) {
 328         PyErr_BadInternalCall();
 329         return -1;
 330     }
 331     if (size > PyUnicode_GET_SIZE(unicode))
 332         size = PyUnicode_GET_SIZE(unicode);
 333 #ifdef HAVE_USABLE_WCHAR_T
 334     memcpy(w, unicode->str, size * sizeof(wchar_t));
 335 #else
 336     {
 337         register Py_UNICODE *u;
 338         register int i;
 339         u = PyUnicode_AS_UNICODE(unicode);
 340         for (i = size; i >= 0; i--)
 341             *w++ = *u++;
 342     }
 343 #endif
 344
 345     return size;
 346 }
 347
 348 #endif
 349
 350 PyObject *PyUnicode_FromObject(register PyObject *obj)
 351 {
 352     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 353 }
 354
 355 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 356                                       const char *encoding,
 357                                       const char *errors)
 358 {
 359     const char *s;
 360     int len;
 361     int owned = 0;
 362     PyObject *v;
 363
 364     if (obj == NULL) {
 365         PyErr_BadInternalCall();
 366         return NULL;
 367     }
 368
 369     /* Coerce object */
 370     if (PyInstance_Check(obj)) {
 371         PyObject *func;
 372         func = PyObject_GetAttrString(obj, "__str__");
 373         if (func == NULL) {
 374             PyErr_SetString(PyExc_TypeError,
 375                   "coercing to Unicode: instance doesn't define __str__");
 376             return NULL;
 377         }
 378         obj = PyEval_CallObject(func, NULL);
 379         Py_DECREF(func);
 380         if (obj == NULL)
 381             return NULL;
 382         owned = 1;
 383     }
 384     if (PyUnicode_Check(obj)) {
 385         Py_INCREF(obj);
 386         v = obj;
 387         if (encoding) {
 388             PyErr_SetString(PyExc_TypeError,
 389                             "decoding Unicode is not supported");
 390             return NULL;
 391         }
 392         goto done;
 393     }
 394     else if (PyString_Check(obj)) {
 395         s = PyString_AS_STRING(obj);
 396         len = PyString_GET_SIZE(obj);
 397     }
 398     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 399         /* Overwrite the error message with something more useful in
 400            case of a TypeError. */
 401         if (PyErr_ExceptionMatches(PyExc_TypeError))
 402             PyErr_Format(PyExc_TypeError,
 403                          "coercing to Unicode: need string or buffer, "
 404                          "%.80s found",
 405                          obj->ob_type->tp_name);
 406         goto onError;
 407     }
 408
 409     /* Convert to Unicode */
 410     if (len == 0) {
 411         Py_INCREF(unicode_empty);
 412         v = (PyObject *)unicode_empty;
 413     }
 414     else
 415         v = PyUnicode_Decode(s, len, encoding, errors);
 416  done:
 417     if (owned) {
 418         Py_DECREF(obj);
 419     }
 420     return v;
 421
 422  onError:
 423     if (owned) {
 424         Py_DECREF(obj);
 425     }
 426     return NULL;
 427 }
 428
 429 PyObject *PyUnicode_Decode(const char *s,
 430                            int size,
 431                            const char *encoding,
 432                            const char *errors)
 433 {
 434     PyObject *buffer = NULL, *unicode;
 435
 436     if (encoding == NULL)
 437         encoding = PyUnicode_GetDefaultEncoding();
 438
 439     /* Shortcuts for common default encodings */
 440     if (strcmp(encoding, "utf-8") == 0)
 441         return PyUnicode_DecodeUTF8(s, size, errors);
 442     else if (strcmp(encoding, "latin-1") == 0)
 443         return PyUnicode_DecodeLatin1(s, size, errors);
 444     else if (strcmp(encoding, "ascii") == 0)
 445         return PyUnicode_DecodeASCII(s, size, errors);
 446
 447     /* Decode via the codec registry */
 448     buffer = PyBuffer_FromMemory((void *)s, size);
 449     if (buffer == NULL)
 450         goto onError;
 451     unicode = PyCodec_Decode(buffer, encoding, errors);
 452     if (unicode == NULL)
 453         goto onError;
 454     if (!PyUnicode_Check(unicode)) {
 455         PyErr_Format(PyExc_TypeError,
 456                      "decoder did not return an unicode object (type=%.400s)",
 457                      unicode->ob_type->tp_name);
 458         Py_DECREF(unicode);
 459         goto onError;
 460     }
 461     Py_DECREF(buffer);
 462     return unicode;
 463
 464  onError:
 465     Py_XDECREF(buffer);
 466     return NULL;
 467 }
 468
 469 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 470                            int size,
 471                            const char *encoding,
 472                            const char *errors)
 473 {
 474     PyObject *v, *unicode;
 475
 476     unicode = PyUnicode_FromUnicode(s, size);
 477     if (unicode == NULL)
 478         return NULL;
 479     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 480     Py_DECREF(unicode);
 481     return v;
 482 }
 483
 484 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 485                                     const char *encoding,
 486                                     const char *errors)
 487 {
 488     PyObject *v;
 489
 490     if (!PyUnicode_Check(unicode)) {
 491         PyErr_BadArgument();
 492         goto onError;
 493     }
 494
 495     if (encoding == NULL)
 496         encoding = PyUnicode_GetDefaultEncoding();
 497
 498     /* Shortcuts for common default encodings */
 499     if (errors == NULL) {
 500         if (strcmp(encoding, "utf-8") == 0)
 501         return PyUnicode_AsUTF8String(unicode);
 502         else if (strcmp(encoding, "latin-1") == 0)
 503             return PyUnicode_AsLatin1String(unicode);
 504         else if (strcmp(encoding, "ascii") == 0)
 505             return PyUnicode_AsASCIIString(unicode);
 506     }
 507
 508     /* Encode via the codec registry */
 509     v = PyCodec_Encode(unicode, encoding, errors);
 510     if (v == NULL)
 511         goto onError;
 512     /* XXX Should we really enforce this ? */
 513     if (!PyString_Check(v)) {
 514         PyErr_Format(PyExc_TypeError,
 515                      "encoder did not return a string object (type=%.400s)",
 516                      v->ob_type->tp_name);
 517         Py_DECREF(v);
 518         goto onError;
 519     }
 520     return v;
 521
 522  onError:
 523     return NULL;
 524 }
 525
 526 /* Return a Python string holding the default encoded value of the
 527    Unicode object.
 528
 529    The resulting string is cached in the Unicode object for subsequent
 530    usage by this function. The cached version is needed to implement
 531    the character buffer interface and will live (at least) as long as
 532    the Unicode object itself.
 533
 534    The refcount of the string is *not* incremented.
 535
 536    *** Exported for internal use by the interpreter only !!! ***
 537
 538 */
 539
 540 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 541                                             const char *errors)
 542 {
 543     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 544
 545     if (v)
 546         return v;
 547     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 548     if (v && errors == NULL)
 549         ((PyUnicodeObject *)unicode)->defenc = v;
 550     return v;
 551 }
 552
 553 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 554 {
 555     if (!PyUnicode_Check(unicode)) {
 556         PyErr_BadArgument();
 557         goto onError;
 558     }
 559     return PyUnicode_AS_UNICODE(unicode);
 560
 561  onError:
 562     return NULL;
 563 }
 564
 565 int PyUnicode_GetSize(PyObject *unicode)
 566 {
 567     if (!PyUnicode_Check(unicode)) {
 568         PyErr_BadArgument();
 569         goto onError;
 570     }
 571     return PyUnicode_GET_SIZE(unicode);
 572
 573  onError:
 574     return -1;
 575 }
 576
 577 const char *PyUnicode_GetDefaultEncoding(void)
 578 {
 579     return unicode_default_encoding;
 580 }
 581
 582 int PyUnicode_SetDefaultEncoding(const char *encoding)
 583 {
 584     PyObject *v;
 585
 586     /* Make sure the encoding is valid. As side effect, this also
 587        loads the encoding into the codec registry cache. */
 588     v = _PyCodec_Lookup(encoding);
 589     if (v == NULL)
 590         goto onError;
 591     Py_DECREF(v);
 592     strncpy(unicode_default_encoding,
 593             encoding,
 594             sizeof(unicode_default_encoding));
 595     return 0;
 596
 597  onError:
 598     return -1;
 599 }
 600
 601 /* --- UTF-8 Codec -------------------------------------------------------- */
 602
 603 static
 604 char utf8_code_length[256] = {
 605     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 606        illegal prefix.  see RFC 2279 for details */
 607     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 608     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 609     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 610     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 611     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 612     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 613     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 614     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 615     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 616     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 617     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 618     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 619     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 620     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 621     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 622     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 623 };
 624
 625 static
 626 int utf8_decoding_error(const char **source,
 627                         Py_UNICODE **dest,
 628                         const char *errors,
 629                         const char *details)
 630 {
 631     if ((errors == NULL) ||
 632         (strcmp(errors,"strict") == 0)) {
 633         PyErr_Format(PyExc_UnicodeError,
 634                      "UTF-8 decoding error: %.400s",
 635                      details);
 636         return -1;
 637     }
 638     else if (strcmp(errors,"ignore") == 0) {
 639         (*source)++;
 640         return 0;
 641     }
 642     else if (strcmp(errors,"replace") == 0) {
 643         (*source)++;
 644         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 645         (*dest)++;
 646         return 0;
 647     }
 648     else {
 649         PyErr_Format(PyExc_ValueError,
 650                      "UTF-8 decoding error; unknown error handling code: %.400s",
 651                      errors);
 652         return -1;
 653     }
 654 }
 655
 656 PyObject *PyUnicode_DecodeUTF8(const char *s,
 657                                int size,
 658                                const char *errors)
 659 {
 660     int n;
 661     const char *e;
 662     PyUnicodeObject *unicode;
 663     Py_UNICODE *p;
 664     const char *errmsg = "";
 665
 666     /* Note: size will always be longer than the resulting Unicode
 667        character count */
 668     unicode = _PyUnicode_New(size);
 669     if (!unicode)
 670         return NULL;
 671     if (size == 0)
 672         return (PyObject *)unicode;
 673
 674     /* Unpack UTF-8 encoded data */
 675     p = unicode->str;
 676     e = s + size;
 677
 678     while (s < e) {
 679         Py_UCS4 ch = (unsigned char)*s;
 680
 681         if (ch < 0x80) {
 682             *p++ = (Py_UNICODE)ch;
 683             s++;
 684             continue;
 685         }
 686
 687         n = utf8_code_length[ch];
 688
 689         if (s + n > e) {
 690             errmsg = "unexpected end of data";
 691             goto utf8Error;
 692         }
 693
 694         switch (n) {
 695
 696         case 0:
 697             errmsg = "unexpected code byte";
 698             goto utf8Error;
 699             break;
 700
 701         case 1:
 702             errmsg = "internal error";
 703             goto utf8Error;
 704             break;
 705
 706         case 2:
 707             if ((s[1] & 0xc0) != 0x80) {
 708                 errmsg = "invalid data";
 709                 goto utf8Error;
 710             }
 711             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
 712             if (ch < 0x80) {
 713                 errmsg = "illegal encoding";
 714                 goto utf8Error;
 715             }
 716             else
 717                 *p++ = (Py_UNICODE)ch;
 718             break;
 719
 720         case 3:
 721             if ((s[1] & 0xc0) != 0x80 ||
 722                 (s[2] & 0xc0) != 0x80) {
 723                 errmsg = "invalid data";
 724                 goto utf8Error;
 725             }
 726             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
 727             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
 728                 errmsg = "illegal encoding";
 729                 goto utf8Error;
 730             }
 731             else
 732                                 *p++ = (Py_UNICODE)ch;
 733             break;
 734
 735         case 4:
 736             if ((s[1] & 0xc0) != 0x80 ||
 737                 (s[2] & 0xc0) != 0x80 ||
 738                 (s[3] & 0xc0) != 0x80) {
 739                 errmsg = "invalid data";
 740                 goto utf8Error;
 741             }
 742             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
 743                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
 744             /* validate and convert to UTF-16 */
 745             if ((ch < 0x10000) ||   /* minimum value allowed for 4
 746                                        byte encoding */
 747                 (ch > 0x10ffff)) {  /* maximum value allowed for
 748                                        UTF-16 */
 749                 errmsg = "illegal encoding";
 750                 goto utf8Error;
 751             }
 752             /*  compute and append the two surrogates: */
 753
 754             /*  translate from 10000..10FFFF to 0..FFFF */
 755             ch -= 0x10000;
 756
 757             /*  high surrogate = top 10 bits added to D800 */
 758             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
 759
 760             /*  low surrogate = bottom 10 bits added to DC00 */
 761             *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
 762             break;
 763
 764         default:
 765             /* Other sizes are only needed for UCS-4 */
 766             errmsg = "unsupported Unicode code range";
 767             goto utf8Error;
 768             break;
 769         }
 770         s += n;
 771         continue;
 772
 773     utf8Error:
 774       if (utf8_decoding_error(&s, &p, errors, errmsg))
 775           goto onError;
 776     }
 777
 778     /* Adjust length */
 779     if (_PyUnicode_Resize(unicode, p - unicode->str))
 780         goto onError;
 781
 782     return (PyObject *)unicode;
 783
 784 onError:
 785     Py_DECREF(unicode);
 786     return NULL;
 787 }
 788
 789 /* Not used anymore, now that the encoder supports UTF-16
 790    surrogates. */
 791 #if 0
 792 static
 793 int utf8_encoding_error(const Py_UNICODE **source,
 794                         char **dest,
 795                         const char *errors,
 796                         const char *details)
 797 {
 798     if ((errors == NULL) ||
 799         (strcmp(errors,"strict") == 0)) {
 800         PyErr_Format(PyExc_UnicodeError,
 801                      "UTF-8 encoding error: %.400s",
 802                      details);
 803         return -1;
 804     }
 805     else if (strcmp(errors,"ignore") == 0) {
 806         return 0;
 807     }
 808     else if (strcmp(errors,"replace") == 0) {
 809         **dest = '?';
 810         (*dest)++;
 811         return 0;
 812     }
 813     else {
 814         PyErr_Format(PyExc_ValueError,
 815                      "UTF-8 encoding error; "
 816                      "unknown error handling code: %.400s",
 817                      errors);
 818         return -1;
 819     }
 820 }
 821 #endif
 822
 823 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 824                                int size,
 825                                const char *errors)
 826 {
 827     PyObject *v;
 828     char *p;
 829     char *q;
 830     Py_UCS4 ch2;
 831     unsigned int cbAllocated = 3 * size;
 832     unsigned int cbWritten = 0;
 833     int i = 0;
 834
 835     v = PyString_FromStringAndSize(NULL, cbAllocated);
 836     if (v == NULL)
 837         return NULL;
 838     if (size == 0)
 839         return v;
 840
 841     p = q = PyString_AS_STRING(v);
 842     while (i < size) {
 843         Py_UCS4 ch = s[i++];
 844         if (ch < 0x80) {
 845             *p++ = (char) ch;
 846             cbWritten++;
 847         }
 848         else if (ch < 0x0800) {
 849             *p++ = 0xc0 | (ch >> 6);
 850             *p++ = 0x80 | (ch & 0x3f);
 851             cbWritten += 2;
 852         }
 853         else {
 854             /* Check for high surrogate */
 855             if (0xD800 <= ch && ch <= 0xDBFF) {
 856                 if (i != size) {
 857                     ch2 = s[i];
 858                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 859
 860                         if (cbWritten >= (cbAllocated - 4)) {
 861                             /* Provide enough room for some more
 862                                surrogates */
 863                             cbAllocated += 4*10;
 864                             if (_PyString_Resize(&v, cbAllocated))
 865                                 goto onError;
 866                         }
 867
 868                         /* combine the two values */
 869                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
 870
 871                         *p++ = (char)((ch >> 18) | 0xf0);
 872                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
 873                         i++;
 874                         cbWritten += 4;
 875                     }
 876                 }
 877             }
 878             else {
 879                 *p++ = (char)(0xe0 | (ch >> 12));
 880                 cbWritten += 3;
 881             }
 882             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
 883             *p++ = (char)(0x80 | (ch & 0x3f));
 884         }
 885     }
 886     *p = '\0';
 887     if (_PyString_Resize(&v, p - q))
 888         goto onError;
 889     return v;
 890
 891  onError:
 892     Py_DECREF(v);
 893     return NULL;
 894 }
 895
 896 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 897 {
 898     if (!PyUnicode_Check(unicode)) {
 899         PyErr_BadArgument();
 900         return NULL;
 901     }
 902     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
 903                                 PyUnicode_GET_SIZE(unicode),
 904                                 NULL);
 905 }
 906
 907 /* --- UTF-16 Codec ------------------------------------------------------- */
 908
 909 static
 910 int utf16_decoding_error(const Py_UNICODE **source,
 911                          Py_UNICODE **dest,
 912                          const char *errors,
 913                          const char *details)
 914 {
 915     if ((errors == NULL) ||
 916         (strcmp(errors,"strict") == 0)) {
 917         PyErr_Format(PyExc_UnicodeError,
 918                      "UTF-16 decoding error: %.400s",
 919                      details);
 920         return -1;
 921     }
 922     else if (strcmp(errors,"ignore") == 0) {
 923         return 0;
 924     }
 925     else if (strcmp(errors,"replace") == 0) {
 926         if (dest) {
 927             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 928             (*dest)++;
 929         }
 930         return 0;
 931     }
 932     else {
 933         PyErr_Format(PyExc_ValueError,
 934                      "UTF-16 decoding error; "
 935                      "unknown error handling code: %.400s",
 936                      errors);
 937         return -1;
 938     }
 939 }
 940
 941 PyObject *PyUnicode_DecodeUTF16(const char *s,
 942                                 int size,
 943                                 const char *errors,
 944                                 int *byteorder)
 945 {
 946     PyUnicodeObject *unicode;
 947     Py_UNICODE *p;
 948     const Py_UNICODE *q, *e;
 949     int bo = 0;
 950     const char *errmsg = "";
 951
 952     /* size should be an even number */
 953     if (size % sizeof(Py_UNICODE) != 0) {
 954         if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
 955             return NULL;
 956         /* The remaining input chars are ignored if we fall through
 957            here... */
 958     }
 959
 960     /* Note: size will always be longer than the resulting Unicode
 961        character count */
 962     unicode = _PyUnicode_New(size);
 963     if (!unicode)
 964         return NULL;
 965     if (size == 0)
 966         return (PyObject *)unicode;
 967
 968     /* Unpack UTF-16 encoded data */
 969     p = unicode->str;
 970     q = (Py_UNICODE *)s;
 971     e = q + (size / sizeof(Py_UNICODE));
 972
 973     if (byteorder)
 974         bo = *byteorder;
 975
 976     while (q < e) {
 977         register Py_UNICODE ch = *q++;
 978
 979         /* Check for BOM marks (U+FEFF) in the input and adjust
 980            current byte order setting accordingly. Swap input
 981            bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
 982            !) */
 983 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
 984         if (ch == 0xFEFF) {
 985             bo = -1;
 986             continue;
 987         } else if (ch == 0xFFFE) {
 988             bo = 1;
 989             continue;
 990         }
 991         if (bo == 1)
 992             ch = (ch >> 8) | (ch << 8);
 993 #else
 994         if (ch == 0xFEFF) {
 995             bo = 1;
 996             continue;
 997         } else if (ch == 0xFFFE) {
 998             bo = -1;
 999             continue;
1000         }
1001         if (bo == -1)
1002             ch = (ch >> 8) | (ch << 8);
1003 #endif
1004         if (ch < 0xD800 || ch > 0xDFFF) {
1005             *p++ = ch;
1006             continue;
1007         }
1008
1009         /* UTF-16 code pair: */
1010         if (q >= e) {
1011             errmsg = "unexpected end of data";
1012             goto utf16Error;
1013         }
1014         if (0xDC00 <= *q && *q <= 0xDFFF) {
1015             q++;
1016             if (0xD800 <= *q && *q <= 0xDBFF) {
1017                 /* This is valid data (a UTF-16 surrogate pair), but
1018                    we are not able to store this information since our
1019                    Py_UNICODE type only has 16 bits... this might
1020                    change someday, even though it's unlikely. */
1021                 errmsg = "code pairs are not supported";
1022                 goto utf16Error;
1023             }
1024             else
1025                 continue;
1026         }
1027         errmsg = "illegal encoding";
1028         /* Fall through to report the error */
1029
1030     utf16Error:
1031         if (utf16_decoding_error(&q, &p, errors, errmsg))
1032             goto onError;
1033     }
1034
1035     if (byteorder)
1036         *byteorder = bo;
1037
1038     /* Adjust length */
1039     if (_PyUnicode_Resize(unicode, p - unicode->str))
1040         goto onError;
1041
1042     return (PyObject *)unicode;
1043
1044 onError:
1045     Py_DECREF(unicode);
1046     return NULL;
1047 }
1048
1049 #undef UTF16_ERROR
1050
1051 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1052                                 int size,
1053                                 const char *errors,
1054                                 int byteorder)
1055 {
1056     PyObject *v;
1057     Py_UNICODE *p;
1058     char *q;
1059
1060     /* We don't create UTF-16 pairs... */
1061     v = PyString_FromStringAndSize(NULL,
1062                         sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1063     if (v == NULL)
1064         return NULL;
1065
1066     q = PyString_AS_STRING(v);
1067     p = (Py_UNICODE *)q;
1068     if (byteorder == 0)
1069         *p++ = 0xFEFF;
1070     if (size == 0)
1071         return v;
1072     if (byteorder == 0 ||
1073 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1074         byteorder == -1
1075 #else
1076         byteorder == 1
1077 #endif
1078         )
1079         memcpy(p, s, size * sizeof(Py_UNICODE));
1080     else
1081         while (size-- > 0) {
1082             Py_UNICODE ch = *s++;
1083             *p++ = (ch >> 8) | (ch << 8);
1084         }
1085     return v;
1086 }
1087
1088 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1089 {
1090     if (!PyUnicode_Check(unicode)) {
1091         PyErr_BadArgument();
1092         return NULL;
1093     }
1094     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1095                                  PyUnicode_GET_SIZE(unicode),
1096                                  NULL,
1097                                  0);
1098 }
1099
1100 /* --- Unicode Escape Codec ----------------------------------------------- */
1101
1102 static
1103 int unicodeescape_decoding_error(const char **source,
1104                                  Py_UNICODE *x,
1105                                  const char *errors,
1106                                  const char *details)
1107 {
1108     if ((errors == NULL) ||
1109         (strcmp(errors,"strict") == 0)) {
1110         PyErr_Format(PyExc_UnicodeError,
1111                      "Unicode-Escape decoding error: %.400s",
1112                      details);
1113         return -1;
1114     }
1115     else if (strcmp(errors,"ignore") == 0) {
1116         return 0;
1117     }
1118     else if (strcmp(errors,"replace") == 0) {
1119         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1120         return 0;
1121     }
1122     else {
1123         PyErr_Format(PyExc_ValueError,
1124                      "Unicode-Escape decoding error; "
1125                      "unknown error handling code: %.400s",
1126                      errors);
1127         return -1;
1128     }
1129 }
1130
/* Cached pointer to the Unicode character name hash API.  It stays NULL
   until PyUnicode_DecodeUnicodeEscape first meets a \N escape, at which
   point it is loaded from the "ucnhashAPI" attribute of the "ucnhash"
   module. */
static _Py_UCNHashAPI *pucnHash = NULL;
1132
/* Case-insensitive comparison of the first `count` bytes of s1 and s2.

   Returns 0 when the (lower-cased) bytes are all equal or when count is
   0; otherwise the difference of the first pair of bytes that differ.

   NOTE: the loop does not stop at a NUL byte, so both buffers must
   provide at least `count` readable bytes up to the first mismatch. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    char c1, c2;

    if (count)
    {
        do
        {
           /* tolower() is only defined for unsigned char values (or
              EOF); passing a negative plain char is undefined. */
           c1 = tolower((unsigned char)*(s1++));
           c2 = tolower((unsigned char)*(s2++));
        }
        while(--count && c1 == c2);

        return c1 - c2;
    }

    return 0;
}
1152
1153 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1154                                         int size,
1155                                         const char *errors)
1156 {
1157     PyUnicodeObject *v;
1158     Py_UNICODE *p = NULL, *buf = NULL;
1159     const char *end;
1160     Py_UCS4 chr;
1161
1162     /* Escaped strings will always be longer than the resulting
1163        Unicode string, so we start with size here and then reduce the
1164        length after conversion to the true value. */
1165     v = _PyUnicode_New(size);
1166     if (v == NULL)
1167         goto onError;
1168     if (size == 0)
1169         return (PyObject *)v;
1170     p = buf = PyUnicode_AS_UNICODE(v);
1171     end = s + size;
1172     while (s < end) {
1173         unsigned char c;
1174         Py_UNICODE x;
1175         int i;
1176
1177         /* Non-escape characters are interpreted as Unicode ordinals */
1178         if (*s != '\\') {
1179             *p++ = (unsigned char)*s++;
1180             continue;
1181         }
1182
1183         /* \ - Escapes */
1184         s++;
1185         switch (*s++) {
1186
1187         /* \x escapes */
1188         case '\n': break;
1189         case '\\': *p++ = '\\'; break;
1190         case '\'': *p++ = '\''; break;
1191         case '\"': *p++ = '\"'; break;
1192         case 'b': *p++ = '\b'; break;
1193         case 'f': *p++ = '\014'; break; /* FF */
1194         case 't': *p++ = '\t'; break;
1195         case 'n': *p++ = '\n'; break;
1196         case 'r': *p++ = '\r'; break;
1197         case 'v': *p++ = '\013'; break; /* VT */
1198         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1199
1200         /* \OOO (octal) escapes */
1201         case '0': case '1': case '2': case '3':
1202         case '4': case '5': case '6': case '7':
1203             x = s[-1] - '0';
1204             if ('0' <= *s && *s <= '7') {
1205                 x = (x<<3) + *s++ - '0';
1206                 if ('0' <= *s && *s <= '7')
1207                     x = (x<<3) + *s++ - '0';
1208             }
1209             *p++ = x;
1210             break;
1211
1212         /* \xXX with two hex digits */
1213         case 'x':
1214             for (x = 0, i = 0; i < 2; i++) {
1215                 c = (unsigned char)s[i];
1216                 if (!isxdigit(c)) {
1217                     if (unicodeescape_decoding_error(&s, &x, errors,
1218                                                      "truncated \\xXX"))
1219                         goto onError;
1220                     i++;
1221                     break;
1222                 }
1223                 x = (x<<4) & ~0xF;
1224                 if (c >= '0' && c <= '9')
1225                     x += c - '0';
1226                 else if (c >= 'a' && c <= 'f')
1227                     x += 10 + c - 'a';
1228                 else
1229                     x += 10 + c - 'A';
1230             }
1231             s += i;
1232             *p++ = x;
1233             break;
1234
1235         /* \uXXXX with 4 hex digits */
1236         case 'u':
1237             for (x = 0, i = 0; i < 4; i++) {
1238                 c = (unsigned char)s[i];
1239                 if (!isxdigit(c)) {
1240                     if (unicodeescape_decoding_error(&s, &x, errors,
1241                                                      "truncated \\uXXXX"))
1242                         goto onError;
1243                     i++;
1244                     break;
1245                 }
1246                 x = (x<<4) & ~0xF;
1247                 if (c >= '0' && c <= '9')
1248                     x += c - '0';
1249                 else if (c >= 'a' && c <= 'f')
1250                     x += 10 + c - 'a';
1251                 else
1252                     x += 10 + c - 'A';
1253             }
1254             s += i;
1255             *p++ = x;
1256             break;
1257
1258         /* \UXXXXXXXX with 8 hex digits */
1259         case 'U':
1260             for (chr = 0, i = 0; i < 8; i++) {
1261                 c = (unsigned char)s[i];
1262                 if (!isxdigit(c)) {
1263                     if (unicodeescape_decoding_error(&s, &x, errors,
1264                                                      "truncated \\uXXXX"))
1265                         goto onError;
1266                     i++;
1267                     break;
1268                 }
1269                 chr = (chr<<4) & ~0xF;
1270                 if (c >= '0' && c <= '9')
1271                     chr += c - '0';
1272                 else if (c >= 'a' && c <= 'f')
1273                     chr += 10 + c - 'a';
1274                 else
1275                     chr += 10 + c - 'A';
1276             }
1277             s += i;
1278             goto store;
1279
1280         case 'N':
1281             /* Ok, we need to deal with Unicode Character Names now,
1282              * make sure we've imported the hash table data...
1283              */
1284             if (pucnHash == NULL) {
1285                 PyObject *mod = 0, *v = 0;
1286                 mod = PyImport_ImportModule("ucnhash");
1287                 if (mod == NULL)
1288                     goto onError;
1289                 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1290                 Py_DECREF(mod);
1291                 if (v == NULL)
1292                     goto onError;
1293                 pucnHash = PyCObject_AsVoidPtr(v);
1294                 Py_DECREF(v);
1295                 if (pucnHash == NULL)
1296                     goto onError;
1297             }
1298
1299             if (*s == '{') {
1300                 const char *start = s + 1;
1301                 const char *endBrace = start;
1302                 unsigned long j;
1303
1304                 /* look for either the closing brace, or we
1305                  * exceed the maximum length of the unicode character names
1306                  */
1307                 while (*endBrace != '}' &&
1308                        (unsigned int)(endBrace - start) <=
1309                            pucnHash->cchMax &&
1310                        endBrace < end)
1311                 {
1312                     endBrace++;
1313                 }
1314                 if (endBrace != end && *endBrace == '}') {
1315                     j = pucnHash->hash(start, endBrace - start);
1316                     if (j > pucnHash->cKeys ||
1317                         mystrnicmp(
1318                             start,
1319                             ((_Py_UnicodeCharacterName *)
1320                              (pucnHash->getValue(j)))->pszUCN,
1321                             (int)(endBrace - start)) != 0)
1322                     {
1323                         if (unicodeescape_decoding_error(
1324                                 &s, &x, errors,
1325                                 "Invalid Unicode Character Name"))
1326                         {
1327                             goto onError;
1328                         }
1329                         goto ucnFallthrough;
1330                     }
1331                     chr = ((_Py_UnicodeCharacterName *)
1332                            (pucnHash->getValue(j)))->value;
1333                     s = endBrace + 1;
1334                     goto store;
1335                 } else {
1336                     if (unicodeescape_decoding_error(
1337                             &s, &x, errors,
1338                             "Unicode name missing closing brace"))
1339                         goto onError;
1340                     goto ucnFallthrough;
1341                 }
1342                 break;
1343             }
1344             if (unicodeescape_decoding_error(
1345                     &s, &x, errors,
1346                     "Missing opening brace for Unicode Character Name escape"))
1347                 goto onError;
1348 ucnFallthrough:
1349             /* fall through on purpose */
1350                 default:
1351             *p++ = '\\';
1352             *p++ = (unsigned char)s[-1];
1353             break;
1354 store:
1355             /* when we get here, chr is a 32-bit unicode character */
1356             if (chr <= 0xffff)
1357                 /* UCS-2 character */
1358                 *p++ = (Py_UNICODE) chr;
1359             else if (chr <= 0x10ffff) {
1360                 /* UCS-4 character.  store as two surrogate characters */
1361                 chr -= 0x10000L;
1362                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1363                 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1364             } else {
1365                 if (unicodeescape_decoding_error(
1366                     &s, &x, errors,
1367                     "Illegal Unicode character")
1368                     )
1369                     goto onError;
1370             }
1371         }
1372     }
1373     if (_PyUnicode_Resize(v, (int)(p - buf)))
1374                 goto onError;
1375     return (PyObject *)v;
1376
1377  onError:
1378     Py_XDECREF(v);
1379     return NULL;
1380 }
1381
1382 /* Return a Unicode-Escape string version of the Unicode object.
1383
1384    If quotes is true, the string is enclosed in u"" or u'' quotes as
1385    appropriate.
1386
1387 */
1388
1389 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1390                                   int size,
1391                                   Py_UNICODE ch);
1392
1393 static
1394 PyObject *unicodeescape_string(const Py_UNICODE *s,
1395                                int size,
1396                                int quotes)
1397 {
1398     PyObject *repr;
1399     char *p;
1400     char *q;
1401
1402     static const char *hexdigit = "0123456789ABCDEF";
1403
1404     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1405     if (repr == NULL)
1406         return NULL;
1407
1408     p = q = PyString_AS_STRING(repr);
1409
1410     if (quotes) {
1411         *p++ = 'u';
1412         *p++ = (findchar(s, size, '\'') &&
1413                 !findchar(s, size, '"')) ? '"' : '\'';
1414     }
1415     while (size-- > 0) {
1416         Py_UNICODE ch = *s++;
1417         /* Escape quotes */
1418         if (quotes && (ch == q[1] || ch == '\\')) {
1419             *p++ = '\\';
1420             *p++ = (char) ch;
1421         }
1422         /* Map 16-bit characters to '\uxxxx' */
1423         else if (ch >= 256) {
1424             *p++ = '\\';
1425             *p++ = 'u';
1426             *p++ = hexdigit[(ch >> 12) & 0xf];
1427             *p++ = hexdigit[(ch >> 8) & 0xf];
1428             *p++ = hexdigit[(ch >> 4) & 0xf];
1429             *p++ = hexdigit[ch & 15];
1430         }
1431         /* Map non-printable US ASCII to '\ooo' */
1432         else if (ch < ' ' || ch >= 128) {
1433             *p++ = '\\';
1434             *p++ = hexdigit[(ch >> 6) & 7];
1435             *p++ = hexdigit[(ch >> 3) & 7];
1436             *p++ = hexdigit[ch & 7];
1437         }
1438         /* Copy everything else as-is */
1439         else
1440             *p++ = (char) ch;
1441     }
1442     if (quotes)
1443         *p++ = q[1];
1444
1445     *p = '\0';
1446     if (_PyString_Resize(&repr, p - q))
1447         goto onError;
1448
1449     return repr;
1450
1451  onError:
1452     Py_DECREF(repr);
1453     return NULL;
1454 }
1455
1456 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1457                                         int size)
1458 {
1459     return unicodeescape_string(s, size, 0);
1460 }
1461
1462 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1463 {
1464     if (!PyUnicode_Check(unicode)) {
1465         PyErr_BadArgument();
1466         return NULL;
1467     }
1468     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1469                                          PyUnicode_GET_SIZE(unicode));
1470 }
1471
1472 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1473
1474 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1475                                            int size,
1476                                            const char *errors)
1477 {
1478     PyUnicodeObject *v;
1479     Py_UNICODE *p, *buf;
1480     const char *end;
1481     const char *bs;
1482
1483     /* Escaped strings will always be longer than the resulting
1484        Unicode string, so we start with size here and then reduce the
1485        length after conversion to the true value. */
1486     v = _PyUnicode_New(size);
1487     if (v == NULL)
1488         goto onError;
1489     if (size == 0)
1490         return (PyObject *)v;
1491     p = buf = PyUnicode_AS_UNICODE(v);
1492     end = s + size;
1493     while (s < end) {
1494         unsigned char c;
1495         Py_UNICODE x;
1496         int i;
1497
1498         /* Non-escape characters are interpreted as Unicode ordinals */
1499         if (*s != '\\') {
1500             *p++ = (unsigned char)*s++;
1501             continue;
1502         }
1503
1504         /* \u-escapes are only interpreted iff the number of leading
1505            backslashes if odd */
1506         bs = s;
1507         for (;s < end;) {
1508             if (*s != '\\')
1509                 break;
1510             *p++ = (unsigned char)*s++;
1511         }
1512         if (((s - bs) & 1) == 0 ||
1513             s >= end ||
1514             *s != 'u') {
1515             continue;
1516         }
1517         p--;
1518         s++;
1519
1520         /* \uXXXX with 4 hex digits */
1521         for (x = 0, i = 0; i < 4; i++) {
1522             c = (unsigned char)s[i];
1523             if (!isxdigit(c)) {
1524                 if (unicodeescape_decoding_error(&s, &x, errors,
1525                                                  "truncated \\uXXXX"))
1526                     goto onError;
1527                 i++;
1528                 break;
1529             }
1530             x = (x<<4) & ~0xF;
1531             if (c >= '0' && c <= '9')
1532                 x += c - '0';
1533             else if (c >= 'a' && c <= 'f')
1534                 x += 10 + c - 'a';
1535             else
1536                 x += 10 + c - 'A';
1537         }
1538         s += i;
1539         *p++ = x;
1540     }
1541     if (_PyUnicode_Resize(v, (int)(p - buf)))
1542         goto onError;
1543     return (PyObject *)v;
1544
1545  onError:
1546     Py_XDECREF(v);
1547     return NULL;
1548 }
1549
1550 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1551                                            int size)
1552 {
1553     PyObject *repr;
1554     char *p;
1555     char *q;
1556
1557     static const char *hexdigit = "0123456789ABCDEF";
1558
1559     repr = PyString_FromStringAndSize(NULL, 6 * size);
1560     if (repr == NULL)
1561         return NULL;
1562     if (size == 0)
1563         return repr;
1564
1565     p = q = PyString_AS_STRING(repr);
1566     while (size-- > 0) {
1567         Py_UNICODE ch = *s++;
1568         /* Map 16-bit characters to '\uxxxx' */
1569         if (ch >= 256) {
1570             *p++ = '\\';
1571             *p++ = 'u';
1572             *p++ = hexdigit[(ch >> 12) & 0xf];
1573             *p++ = hexdigit[(ch >> 8) & 0xf];
1574             *p++ = hexdigit[(ch >> 4) & 0xf];
1575             *p++ = hexdigit[ch & 15];
1576         }
1577         /* Copy everything else as-is */
1578         else
1579             *p++ = (char) ch;
1580     }
1581     *p = '\0';
1582     if (_PyString_Resize(&repr, p - q))
1583         goto onError;
1584
1585     return repr;
1586
1587  onError:
1588     Py_DECREF(repr);
1589     return NULL;
1590 }
1591
1592 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1593 {
1594     if (!PyUnicode_Check(unicode)) {
1595         PyErr_BadArgument();
1596         return NULL;
1597     }
1598     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1599                                             PyUnicode_GET_SIZE(unicode));
1600 }
1601
1602 /* --- Latin-1 Codec ------------------------------------------------------ */
1603
1604 PyObject *PyUnicode_DecodeLatin1(const char *s,
1605                                  int size,
1606                                  const char *errors)
1607 {
1608     PyUnicodeObject *v;
1609     Py_UNICODE *p;
1610
1611     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1612     v = _PyUnicode_New(size);
1613     if (v == NULL)
1614         goto onError;
1615     if (size == 0)
1616         return (PyObject *)v;
1617     p = PyUnicode_AS_UNICODE(v);
1618     while (size-- > 0)
1619         *p++ = (unsigned char)*s++;
1620     return (PyObject *)v;
1621
1622  onError:
1623     Py_XDECREF(v);
1624     return NULL;
1625 }
1626
1627 static
1628 int latin1_encoding_error(const Py_UNICODE **source,
1629                           char **dest,
1630                           const char *errors,
1631                           const char *details)
1632 {
1633     if ((errors == NULL) ||
1634         (strcmp(errors,"strict") == 0)) {
1635         PyErr_Format(PyExc_UnicodeError,
1636                      "Latin-1 encoding error: %.400s",
1637                      details);
1638         return -1;
1639     }
1640     else if (strcmp(errors,"ignore") == 0) {
1641         return 0;
1642     }
1643     else if (strcmp(errors,"replace") == 0) {
1644         **dest = '?';
1645         (*dest)++;
1646         return 0;
1647     }
1648     else {
1649         PyErr_Format(PyExc_ValueError,
1650                      "Latin-1 encoding error; "
1651                      "unknown error handling code: %.400s",
1652                      errors);
1653         return -1;
1654     }
1655 }
1656
1657 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1658                                  int size,
1659                                  const char *errors)
1660 {
1661     PyObject *repr;
1662     char *s, *start;
1663
1664     repr = PyString_FromStringAndSize(NULL, size);
1665     if (repr == NULL)
1666         return NULL;
1667     if (size == 0)
1668         return repr;
1669
1670     s = PyString_AS_STRING(repr);
1671     start = s;
1672     while (size-- > 0) {
1673         Py_UNICODE ch = *p++;
1674         if (ch >= 256) {
1675             if (latin1_encoding_error(&p, &s, errors,
1676                                       "ordinal not in range(256)"))
1677                 goto onError;
1678         }
1679         else
1680             *s++ = (char)ch;
1681     }
1682     /* Resize if error handling skipped some characters */
1683     if (s - start < PyString_GET_SIZE(repr))
1684         if (_PyString_Resize(&repr, s - start))
1685             goto onError;
1686     return repr;
1687
1688  onError:
1689     Py_DECREF(repr);
1690     return NULL;
1691 }
1692
1693 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1694 {
1695     if (!PyUnicode_Check(unicode)) {
1696         PyErr_BadArgument();
1697         return NULL;
1698     }
1699     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1700                                   PyUnicode_GET_SIZE(unicode),
1701                                   NULL);
1702 }
1703
1704 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1705
1706 static
1707 int ascii_decoding_error(const char **source,
1708                          Py_UNICODE **dest,
1709                          const char *errors,
1710                          const char *details)
1711 {
1712     if ((errors == NULL) ||
1713         (strcmp(errors,"strict") == 0)) {
1714         PyErr_Format(PyExc_UnicodeError,
1715                      "ASCII decoding error: %.400s",
1716                      details);
1717         return -1;
1718     }
1719     else if (strcmp(errors,"ignore") == 0) {
1720         return 0;
1721     }
1722     else if (strcmp(errors,"replace") == 0) {
1723         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1724         (*dest)++;
1725         return 0;
1726     }
1727     else {
1728         PyErr_Format(PyExc_ValueError,
1729                      "ASCII decoding error; "
1730                      "unknown error handling code: %.400s",
1731                      errors);
1732         return -1;
1733     }
1734 }
1735
1736 PyObject *PyUnicode_DecodeASCII(const char *s,
1737                                 int size,
1738                                 const char *errors)
1739 {
1740     PyUnicodeObject *v;
1741     Py_UNICODE *p;
1742
1743     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1744     v = _PyUnicode_New(size);
1745     if (v == NULL)
1746         goto onError;
1747     if (size == 0)
1748         return (PyObject *)v;
1749     p = PyUnicode_AS_UNICODE(v);
1750     while (size-- > 0) {
1751         register unsigned char c;
1752
1753         c = (unsigned char)*s++;
1754         if (c < 128)
1755             *p++ = c;
1756         else if (ascii_decoding_error(&s, &p, errors,
1757                                       "ordinal not in range(128)"))
1758                 goto onError;
1759     }
1760     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1761         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1762             goto onError;
1763     return (PyObject *)v;
1764
1765  onError:
1766     Py_XDECREF(v);
1767     return NULL;
1768 }
1769
1770 static
1771 int ascii_encoding_error(const Py_UNICODE **source,
1772                          char **dest,
1773                          const char *errors,
1774                          const char *details)
1775 {
1776     if ((errors == NULL) ||
1777         (strcmp(errors,"strict") == 0)) {
1778         PyErr_Format(PyExc_UnicodeError,
1779                      "ASCII encoding error: %.400s",
1780                      details);
1781         return -1;
1782     }
1783     else if (strcmp(errors,"ignore") == 0) {
1784         return 0;
1785     }
1786     else if (strcmp(errors,"replace") == 0) {
1787         **dest = '?';
1788         (*dest)++;
1789         return 0;
1790     }
1791     else {
1792         PyErr_Format(PyExc_ValueError,
1793                      "ASCII encoding error; "
1794                      "unknown error handling code: %.400s",
1795                      errors);
1796         return -1;
1797     }
1798 }
1799
1800 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1801                                 int size,
1802                                 const char *errors)
1803 {
1804     PyObject *repr;
1805     char *s, *start;
1806
1807     repr = PyString_FromStringAndSize(NULL, size);
1808     if (repr == NULL)
1809         return NULL;
1810     if (size == 0)
1811         return repr;
1812
1813     s = PyString_AS_STRING(repr);
1814     start = s;
1815     while (size-- > 0) {
1816         Py_UNICODE ch = *p++;
1817         if (ch >= 128) {
1818             if (ascii_encoding_error(&p, &s, errors,
1819                                       "ordinal not in range(128)"))
1820                 goto onError;
1821         }
1822         else
1823             *s++ = (char)ch;
1824     }
1825     /* Resize if error handling skipped some characters */
1826     if (s - start < PyString_GET_SIZE(repr))
1827         if (_PyString_Resize(&repr, s - start))
1828             goto onError;
1829     return repr;
1830
1831  onError:
1832     Py_DECREF(repr);
1833     return NULL;
1834 }
1835
1836 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1837 {
1838     if (!PyUnicode_Check(unicode)) {
1839         PyErr_BadArgument();
1840         return NULL;
1841     }
1842     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1843                                  PyUnicode_GET_SIZE(unicode),
1844                                  NULL);
1845 }
1846
1847 #ifdef MS_WIN32
1848
1849 /* --- MBCS codecs for Windows -------------------------------------------- */
1850
1851 PyObject *PyUnicode_DecodeMBCS(const char *s,
1852                                 int size,
1853                                 const char *errors)
1854 {
1855     PyUnicodeObject *v;
1856     Py_UNICODE *p;
1857
1858     /* First get the size of the result */
1859     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1860     if (size > 0 && usize==0)
1861         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862
1863     v = _PyUnicode_New(usize);
1864     if (v == NULL)
1865         return NULL;
1866     if (usize == 0)
1867         return (PyObject *)v;
1868     p = PyUnicode_AS_UNICODE(v);
1869     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1870         Py_DECREF(v);
1871         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872     }
1873
1874     return (PyObject *)v;
1875 }
1876
1877 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1878                                 int size,
1879                                 const char *errors)
1880 {
1881     PyObject *repr;
1882     char *s;
1883     DWORD mbcssize;
1884
1885     /* If there are no characters, bail now! */
1886     if (size==0)
1887             return PyString_FromString("");
1888
1889     /* First get the size of the result */
1890     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1891     if (mbcssize==0)
1892         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1893
1894     repr = PyString_FromStringAndSize(NULL, mbcssize);
1895     if (repr == NULL)
1896         return NULL;
1897     if (mbcssize == 0)
1898         return repr;
1899
1900     /* Do the conversion */
1901     s = PyString_AS_STRING(repr);
1902     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1903         Py_DECREF(repr);
1904         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1905     }
1906     return repr;
1907 }
1908
1909 #endif /* MS_WIN32 */
1910
1911 /* --- Character Mapping Codec -------------------------------------------- */
1912
1913 static
1914 int charmap_decoding_error(const char **source,
1915                          Py_UNICODE **dest,
1916                          const char *errors,
1917                          const char *details)
1918 {
1919     if ((errors == NULL) ||
1920         (strcmp(errors,"strict") == 0)) {
1921         PyErr_Format(PyExc_UnicodeError,
1922                      "charmap decoding error: %.400s",
1923                      details);
1924         return -1;
1925     }
1926     else if (strcmp(errors,"ignore") == 0) {
1927         return 0;
1928     }
1929     else if (strcmp(errors,"replace") == 0) {
1930         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1931         (*dest)++;
1932         return 0;
1933     }
1934     else {
1935         PyErr_Format(PyExc_ValueError,
1936                      "charmap decoding error; "
1937                      "unknown error handling code: %.400s",
1938                      errors);
1939         return -1;
1940     }
1941 }
1942
1943 PyObject *PyUnicode_DecodeCharmap(const char *s,
1944                                   int size,
1945                                   PyObject *mapping,
1946                                   const char *errors)
1947 {
1948     PyUnicodeObject *v;
1949     Py_UNICODE *p;
1950
1951     /* Default to Latin-1 */
1952     if (mapping == NULL)
1953         return PyUnicode_DecodeLatin1(s, size, errors);
1954
1955     v = _PyUnicode_New(size);
1956     if (v == NULL)
1957         goto onError;
1958     if (size == 0)
1959         return (PyObject *)v;
1960     p = PyUnicode_AS_UNICODE(v);
1961     while (size-- > 0) {
1962         unsigned char ch = *s++;
1963         PyObject *w, *x;
1964
1965         /* Get mapping (char ordinal -> integer, Unicode char or None) */
1966         w = PyInt_FromLong((long)ch);
1967         if (w == NULL)
1968             goto onError;
1969         x = PyObject_GetItem(mapping, w);
1970         Py_DECREF(w);
1971         if (x == NULL) {
1972             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1973                 /* No mapping found: default to Latin-1 mapping */
1974                 PyErr_Clear();
1975                 *p++ = (Py_UNICODE)ch;
1976                 continue;
1977             }
1978             goto onError;
1979         }
1980
1981         /* Apply mapping */
1982         if (PyInt_Check(x)) {
1983             long value = PyInt_AS_LONG(x);
1984             if (value < 0 || value > 65535) {
1985                 PyErr_SetString(PyExc_TypeError,
1986                                 "character mapping must be in range(65536)");
1987                 Py_DECREF(x);
1988                 goto onError;
1989             }
1990             *p++ = (Py_UNICODE)value;
1991         }
1992         else if (x == Py_None) {
1993             /* undefined mapping */
1994             if (charmap_decoding_error(&s, &p, errors,
1995                                        "character maps to <undefined>")) {
1996                 Py_DECREF(x);
1997                 goto onError;
1998             }
1999         }
2000         else if (PyUnicode_Check(x)) {
2001             if (PyUnicode_GET_SIZE(x) != 1) {
2002                 /* 1-n mapping */
2003                 PyErr_SetString(PyExc_NotImplementedError,
2004                                 "1-n mappings are currently not implemented");
2005                 Py_DECREF(x);
2006                 goto onError;
2007             }
2008             *p++ = *PyUnicode_AS_UNICODE(x);
2009         }
2010         else {
2011             /* wrong return value */
2012             PyErr_SetString(PyExc_TypeError,
2013                   "character mapping must return integer, None or unicode");
2014             Py_DECREF(x);
2015             goto onError;
2016         }
2017         Py_DECREF(x);
2018     }
2019     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2020         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2021             goto onError;
2022     return (PyObject *)v;
2023
2024  onError:
2025     Py_XDECREF(v);
2026     return NULL;
2027 }
2028
2029 static
2030 int charmap_encoding_error(const Py_UNICODE **source,
2031                            char **dest,
2032                            const char *errors,
2033                            const char *details)
2034 {
2035     if ((errors == NULL) ||
2036         (strcmp(errors,"strict") == 0)) {
2037         PyErr_Format(PyExc_UnicodeError,
2038                      "charmap encoding error: %.400s",
2039                      details);
2040         return -1;
2041     }
2042     else if (strcmp(errors,"ignore") == 0) {
2043         return 0;
2044     }
2045     else if (strcmp(errors,"replace") == 0) {
2046         **dest = '?';
2047         (*dest)++;
2048         return 0;
2049     }
2050     else {
2051         PyErr_Format(PyExc_ValueError,
2052                      "charmap encoding error; "
2053                      "unknown error handling code: %.400s",
2054                      errors);
2055         return -1;
2056     }
2057 }
2058
2059 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060                                   int size,
2061                                   PyObject *mapping,
2062                                   const char *errors)
2063 {
2064     PyObject *v;
2065     char *s;
2066
2067     /* Default to Latin-1 */
2068     if (mapping == NULL)
2069         return PyUnicode_EncodeLatin1(p, size, errors);
2070
2071     v = PyString_FromStringAndSize(NULL, size);
2072     if (v == NULL)
2073         return NULL;
2074     if (size == 0)
2075         return v;
2076     s = PyString_AS_STRING(v);
2077     while (size-- > 0) {
2078         Py_UNICODE ch = *p++;
2079         PyObject *w, *x;
2080
2081         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2082         w = PyInt_FromLong((long)ch);
2083         if (w == NULL)
2084             goto onError;
2085         x = PyObject_GetItem(mapping, w);
2086         Py_DECREF(w);
2087         if (x == NULL) {
2088             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2089                 /* No mapping found: default to Latin-1 mapping if possible */
2090                 PyErr_Clear();
2091                 if (ch < 256) {
2092                     *s++ = (char)ch;
2093                     continue;
2094                 }
2095                 else if (!charmap_encoding_error(&p, &s, errors,
2096                                      "missing character mapping"))
2097                     continue;
2098             }
2099             goto onError;
2100         }
2101
2102         /* Apply mapping */
2103         if (PyInt_Check(x)) {
2104             long value = PyInt_AS_LONG(x);
2105             if (value < 0 || value > 255) {
2106                 PyErr_SetString(PyExc_TypeError,
2107                                 "character mapping must be in range(256)");
2108                 Py_DECREF(x);
2109                 goto onError;
2110             }
2111             *s++ = (char)value;
2112         }
2113         else if (x == Py_None) {
2114             /* undefined mapping */
2115             if (charmap_encoding_error(&p, &s, errors,
2116                                        "character maps to <undefined>")) {
2117                 Py_DECREF(x);
2118                 goto onError;
2119             }
2120         }
2121         else if (PyString_Check(x)) {
2122             if (PyString_GET_SIZE(x) != 1) {
2123                 /* 1-n mapping */
2124                 PyErr_SetString(PyExc_NotImplementedError,
2125                       "1-n mappings are currently not implemented");
2126                 Py_DECREF(x);
2127                 goto onError;
2128             }
2129             *s++ = *PyString_AS_STRING(x);
2130         }
2131         else {
2132             /* wrong return value */
2133             PyErr_SetString(PyExc_TypeError,
2134                   "character mapping must return integer, None or unicode");
2135             Py_DECREF(x);
2136             goto onError;
2137         }
2138         Py_DECREF(x);
2139     }
2140     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2141         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2142             goto onError;
2143     return v;
2144
2145  onError:
2146     Py_DECREF(v);
2147     return NULL;
2148 }
2149
2150 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2151                                     PyObject *mapping)
2152 {
2153     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2154         PyErr_BadArgument();
2155         return NULL;
2156     }
2157     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2158                                    PyUnicode_GET_SIZE(unicode),
2159                                    mapping,
2160                                    NULL);
2161 }
2162
2163 static
2164 int translate_error(const Py_UNICODE **source,
2165                     Py_UNICODE **dest,
2166                     const char *errors,
2167                     const char *details)
2168 {
2169     if ((errors == NULL) ||
2170         (strcmp(errors,"strict") == 0)) {
2171         PyErr_Format(PyExc_UnicodeError,
2172                      "translate error: %.400s",
2173                      details);
2174         return -1;
2175     }
2176     else if (strcmp(errors,"ignore") == 0) {
2177         return 0;
2178     }
2179     else if (strcmp(errors,"replace") == 0) {
2180         **dest = '?';
2181         (*dest)++;
2182         return 0;
2183     }
2184     else {
2185         PyErr_Format(PyExc_ValueError,
2186                      "translate error; "
2187                      "unknown error handling code: %.400s",
2188                      errors);
2189         return -1;
2190     }
2191 }
2192
2193 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2194                                      int size,
2195                                      PyObject *mapping,
2196                                      const char *errors)
2197 {
2198     PyUnicodeObject *v;
2199     Py_UNICODE *p;
2200
2201     if (mapping == NULL) {
2202         PyErr_BadArgument();
2203         return NULL;
2204     }
2205
2206     /* Output will never be longer than input */
2207     v = _PyUnicode_New(size);
2208     if (v == NULL)
2209         goto onError;
2210     if (size == 0)
2211         goto done;
2212     p = PyUnicode_AS_UNICODE(v);
2213     while (size-- > 0) {
2214         Py_UNICODE ch = *s++;
2215         PyObject *w, *x;
2216
2217         /* Get mapping */
2218         w = PyInt_FromLong(ch);
2219         if (w == NULL)
2220             goto onError;
2221         x = PyObject_GetItem(mapping, w);
2222         Py_DECREF(w);
2223         if (x == NULL) {
2224             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2225                 /* No mapping found: default to 1-1 mapping */
2226                 PyErr_Clear();
2227                 *p++ = ch;
2228                 continue;
2229             }
2230             goto onError;
2231         }
2232
2233         /* Apply mapping */
2234         if (PyInt_Check(x))
2235             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2236         else if (x == Py_None) {
2237             /* undefined mapping */
2238             if (translate_error(&s, &p, errors,
2239                                 "character maps to <undefined>")) {
2240                 Py_DECREF(x);
2241                 goto onError;
2242             }
2243         }
2244         else if (PyUnicode_Check(x)) {
2245             if (PyUnicode_GET_SIZE(x) != 1) {
2246                 /* 1-n mapping */
2247                 PyErr_SetString(PyExc_NotImplementedError,
2248                                 "1-n mappings are currently not implemented");
2249                 Py_DECREF(x);
2250                 goto onError;
2251             }
2252             *p++ = *PyUnicode_AS_UNICODE(x);
2253         }
2254         else {
2255             /* wrong return value */
2256             PyErr_SetString(PyExc_TypeError,
2257                   "translate mapping must return integer, None or unicode");
2258             Py_DECREF(x);
2259             goto onError;
2260         }
2261         Py_DECREF(x);
2262     }
2263     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2264         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2265             goto onError;
2266
2267  done:
2268     return (PyObject *)v;
2269
2270  onError:
2271     Py_XDECREF(v);
2272     return NULL;
2273 }
2274
2275 PyObject *PyUnicode_Translate(PyObject *str,
2276                               PyObject *mapping,
2277                               const char *errors)
2278 {
2279     PyObject *result;
2280
2281     str = PyUnicode_FromObject(str);
2282     if (str == NULL)
2283         goto onError;
2284     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2285                                         PyUnicode_GET_SIZE(str),
2286                                         mapping,
2287                                         errors);
2288     Py_DECREF(str);
2289     return result;
2290
2291  onError:
2292     Py_XDECREF(str);
2293     return NULL;
2294 }
2295
2296 /* --- Decimal Encoder ---------------------------------------------------- */
2297
2298 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2299                             int length,
2300                             char *output,
2301                             const char *errors)
2302 {
2303     Py_UNICODE *p, *end;
2304
2305     if (output == NULL) {
2306         PyErr_BadArgument();
2307         return -1;
2308     }
2309
2310     p = s;
2311     end = s + length;
2312     while (p < end) {
2313         register Py_UNICODE ch = *p++;
2314         int decimal;
2315
2316         if (Py_UNICODE_ISSPACE(ch)) {
2317             *output++ = ' ';
2318             continue;
2319         }
2320         decimal = Py_UNICODE_TODECIMAL(ch);
2321         if (decimal >= 0) {
2322             *output++ = '0' + decimal;
2323             continue;
2324         }
2325         if (0 < ch && ch < 256) {
2326             *output++ = (char)ch;
2327             continue;
2328         }
2329         /* All other characters are considered invalid */
2330         if (errors == NULL || strcmp(errors, "strict") == 0) {
2331             PyErr_SetString(PyExc_ValueError,
2332                             "invalid decimal Unicode string");
2333             goto onError;
2334         }
2335         else if (strcmp(errors, "ignore") == 0)
2336             continue;
2337         else if (strcmp(errors, "replace") == 0) {
2338             *output++ = '?';
2339             continue;
2340         }
2341     }
2342     /* 0-terminate the output string */
2343     *output++ = '\0';
2344     return 0;
2345
2346  onError:
2347     return -1;
2348 }
2349
2350 /* --- Helpers ------------------------------------------------------------ */
2351
2352 static
2353 int count(PyUnicodeObject *self,
2354           int start,
2355           int end,
2356           PyUnicodeObject *substring)
2357 {
2358     int count = 0;
2359
2360     if (substring->length == 0)
2361         return (end - start + 1);
2362
2363     end -= substring->length;
2364
2365     while (start <= end)
2366         if (Py_UNICODE_MATCH(self, start, substring)) {
2367             count++;
2368             start += substring->length;
2369         } else
2370             start++;
2371
2372     return count;
2373 }
2374
2375 int PyUnicode_Count(PyObject *str,
2376                     PyObject *substr,
2377                     int start,
2378                     int end)
2379 {
2380     int result;
2381
2382     str = PyUnicode_FromObject(str);
2383     if (str == NULL)
2384         return -1;
2385     substr = PyUnicode_FromObject(substr);
2386     if (substr == NULL) {
2387         Py_DECREF(str);
2388         return -1;
2389     }
2390
2391     result = count((PyUnicodeObject *)str,
2392                    start, end,
2393                    (PyUnicodeObject *)substr);
2394
2395     Py_DECREF(str);
2396     Py_DECREF(substr);
2397     return result;
2398 }
2399
2400 static
2401 int findstring(PyUnicodeObject *self,
2402                PyUnicodeObject *substring,
2403                int start,
2404                int end,
2405                int direction)
2406 {
2407     if (start < 0)
2408         start += self->length;
2409     if (start < 0)
2410         start = 0;
2411
2412     if (substring->length == 0)
2413         return start;
2414
2415     if (end > self->length)
2416         end = self->length;
2417     if (end < 0)
2418         end += self->length;
2419     if (end < 0)
2420         end = 0;
2421
2422     end -= substring->length;
2423
2424     if (direction < 0) {
2425         for (; end >= start; end--)
2426             if (Py_UNICODE_MATCH(self, end, substring))
2427                 return end;
2428     } else {
2429         for (; start <= end; start++)
2430             if (Py_UNICODE_MATCH(self, start, substring))
2431                 return start;
2432     }
2433
2434     return -1;
2435 }
2436
2437 int PyUnicode_Find(PyObject *str,
2438                    PyObject *substr,
2439                    int start,
2440                    int end,
2441                    int direction)
2442 {
2443     int result;
2444
2445     str = PyUnicode_FromObject(str);
2446     if (str == NULL)
2447         return -1;
2448     substr = PyUnicode_FromObject(substr);
2449     if (substr == NULL) {
2450         Py_DECREF(substr);
2451         return -1;
2452     }
2453
2454     result = findstring((PyUnicodeObject *)str,
2455                         (PyUnicodeObject *)substr,
2456                         start, end, direction);
2457     Py_DECREF(str);
2458     Py_DECREF(substr);
2459     return result;
2460 }
2461
2462 static
2463 int tailmatch(PyUnicodeObject *self,
2464               PyUnicodeObject *substring,
2465               int start,
2466               int end,
2467               int direction)
2468 {
2469     if (start < 0)
2470         start += self->length;
2471     if (start < 0)
2472         start = 0;
2473
2474     if (substring->length == 0)
2475         return 1;
2476
2477     if (end > self->length)
2478         end = self->length;
2479     if (end < 0)
2480         end += self->length;
2481     if (end < 0)
2482         end = 0;
2483
2484     end -= substring->length;
2485     if (end < start)
2486         return 0;
2487
2488     if (direction > 0) {
2489         if (Py_UNICODE_MATCH(self, end, substring))
2490             return 1;
2491     } else {
2492         if (Py_UNICODE_MATCH(self, start, substring))
2493             return 1;
2494     }
2495
2496     return 0;
2497 }
2498
2499 int PyUnicode_Tailmatch(PyObject *str,
2500                         PyObject *substr,
2501                         int start,
2502                         int end,
2503                         int direction)
2504 {
2505     int result;
2506
2507     str = PyUnicode_FromObject(str);
2508     if (str == NULL)
2509         return -1;
2510     substr = PyUnicode_FromObject(substr);
2511     if (substr == NULL) {
2512         Py_DECREF(substr);
2513         return -1;
2514     }
2515
2516     result = tailmatch((PyUnicodeObject *)str,
2517                        (PyUnicodeObject *)substr,
2518                        start, end, direction);
2519     Py_DECREF(str);
2520     Py_DECREF(substr);
2521     return result;
2522 }
2523
2524 static
2525 const Py_UNICODE *findchar(const Py_UNICODE *s,
2526                      int size,
2527                      Py_UNICODE ch)
2528 {
2529     /* like wcschr, but doesn't stop at NULL characters */
2530
2531     while (size-- > 0) {
2532         if (*s == ch)
2533             return s;
2534         s++;
2535     }
2536
2537     return NULL;
2538 }
2539
2540 /* Apply fixfct filter to the Unicode object self and return a
2541    reference to the modified object */
2542
2543 static
2544 PyObject *fixup(PyUnicodeObject *self,
2545                 int (*fixfct)(PyUnicodeObject *s))
2546 {
2547
2548     PyUnicodeObject *u;
2549
2550     u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2551                                                  self->length);
2552     if (u == NULL)
2553         return NULL;
2554     if (!fixfct(u)) {
2555         /* fixfct should return TRUE if it modified the buffer. If
2556            FALSE, return a reference to the original buffer instead
2557            (to save space, not time) */
2558         Py_INCREF(self);
2559         Py_DECREF(u);
2560         return (PyObject*) self;
2561     }
2562     return (PyObject*) u;
2563 }
2564
2565 static
2566 int fixupper(PyUnicodeObject *self)
2567 {
2568     int len = self->length;
2569     Py_UNICODE *s = self->str;
2570     int status = 0;
2571
2572     while (len-- > 0) {
2573         register Py_UNICODE ch;
2574
2575         ch = Py_UNICODE_TOUPPER(*s);
2576         if (ch != *s) {
2577             status = 1;
2578             *s = ch;
2579         }
2580         s++;
2581     }
2582
2583     return status;
2584 }
2585
2586 static
2587 int fixlower(PyUnicodeObject *self)
2588 {
2589     int len = self->length;
2590     Py_UNICODE *s = self->str;
2591     int status = 0;
2592
2593     while (len-- > 0) {
2594         register Py_UNICODE ch;
2595
2596         ch = Py_UNICODE_TOLOWER(*s);
2597         if (ch != *s) {
2598             status = 1;
2599             *s = ch;
2600         }
2601         s++;
2602     }
2603
2604     return status;
2605 }
2606
2607 static
2608 int fixswapcase(PyUnicodeObject *self)
2609 {
2610     int len = self->length;
2611     Py_UNICODE *s = self->str;
2612     int status = 0;
2613
2614     while (len-- > 0) {
2615         if (Py_UNICODE_ISUPPER(*s)) {
2616             *s = Py_UNICODE_TOLOWER(*s);
2617             status = 1;
2618         } else if (Py_UNICODE_ISLOWER(*s)) {
2619             *s = Py_UNICODE_TOUPPER(*s);
2620             status = 1;
2621         }
2622         s++;
2623     }
2624
2625     return status;
2626 }
2627
2628 static
2629 int fixcapitalize(PyUnicodeObject *self)
2630 {
2631     if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2632         self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2633         return 1;
2634     }
2635     return 0;
2636 }
2637
2638 static
2639 int fixtitle(PyUnicodeObject *self)
2640 {
2641     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2642     register Py_UNICODE *e;
2643     int previous_is_cased;
2644
2645     /* Shortcut for single character strings */
2646     if (PyUnicode_GET_SIZE(self) == 1) {
2647         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2648         if (*p != ch) {
2649             *p = ch;
2650             return 1;
2651         }
2652         else
2653             return 0;
2654     }
2655
2656     e = p + PyUnicode_GET_SIZE(self);
2657     previous_is_cased = 0;
2658     for (; p < e; p++) {
2659         register const Py_UNICODE ch = *p;
2660
2661         if (previous_is_cased)
2662             *p = Py_UNICODE_TOLOWER(ch);
2663         else
2664             *p = Py_UNICODE_TOTITLE(ch);
2665
2666         if (Py_UNICODE_ISLOWER(ch) ||
2667             Py_UNICODE_ISUPPER(ch) ||
2668             Py_UNICODE_ISTITLE(ch))
2669             previous_is_cased = 1;
2670         else
2671             previous_is_cased = 0;
2672     }
2673     return 1;
2674 }
2675
2676 PyObject *PyUnicode_Join(PyObject *separator,
2677                          PyObject *seq)
2678 {
2679     Py_UNICODE *sep;
2680     int seplen;
2681     PyUnicodeObject *res = NULL;
2682     int reslen = 0;
2683     Py_UNICODE *p;
2684     int seqlen = 0;
2685     int sz = 100;
2686     int i;
2687
2688     seqlen = PySequence_Size(seq);
2689     if (seqlen < 0 && PyErr_Occurred())
2690         return NULL;
2691
2692     if (separator == NULL) {
2693         Py_UNICODE blank = ' ';
2694         sep = &blank;
2695         seplen = 1;
2696     }
2697     else {
2698         separator = PyUnicode_FromObject(separator);
2699         if (separator == NULL)
2700             return NULL;
2701         sep = PyUnicode_AS_UNICODE(separator);
2702         seplen = PyUnicode_GET_SIZE(separator);
2703     }
2704
2705     res = _PyUnicode_New(sz);
2706     if (res == NULL)
2707         goto onError;
2708     p = PyUnicode_AS_UNICODE(res);
2709     reslen = 0;
2710
2711     for (i = 0; i < seqlen; i++) {
2712         int itemlen;
2713         PyObject *item;
2714
2715         item = PySequence_GetItem(seq, i);
2716         if (item == NULL)
2717             goto onError;
2718         if (!PyUnicode_Check(item)) {
2719             PyObject *v;
2720             v = PyUnicode_FromObject(item);
2721             Py_DECREF(item);
2722             item = v;
2723             if (item == NULL)
2724                 goto onError;
2725         }
2726         itemlen = PyUnicode_GET_SIZE(item);
2727         while (reslen + itemlen + seplen >= sz) {
2728             if (_PyUnicode_Resize(res, sz*2))
2729                 goto onError;
2730             sz *= 2;
2731             p = PyUnicode_AS_UNICODE(res) + reslen;
2732         }
2733         if (i > 0) {
2734             memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2735             p += seplen;
2736             reslen += seplen;
2737         }
2738         memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2739         p += itemlen;
2740         reslen += itemlen;
2741         Py_DECREF(item);
2742     }
2743     if (_PyUnicode_Resize(res, reslen))
2744         goto onError;
2745
2746     Py_XDECREF(separator);
2747     return (PyObject *)res;
2748
2749  onError:
2750     Py_XDECREF(separator);
2751     Py_DECREF(res);
2752     return NULL;
2753 }
2754
2755 static
2756 PyUnicodeObject *pad(PyUnicodeObject *self,
2757                      int left,
2758                      int right,
2759                      Py_UNICODE fill)
2760 {
2761     PyUnicodeObject *u;
2762
2763     if (left < 0)
2764         left = 0;
2765     if (right < 0)
2766         right = 0;
2767
2768     if (left == 0 && right == 0) {
2769         Py_INCREF(self);
2770         return self;
2771     }
2772
2773     u = _PyUnicode_New(left + self->length + right);
2774     if (u) {
2775         if (left)
2776             Py_UNICODE_FILL(u->str, fill, left);
2777         Py_UNICODE_COPY(u->str + left, self->str, self->length);
2778         if (right)
2779             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2780     }
2781
2782     return u;
2783 }
2784
/* Append the slice data[left:right] to list as a new Unicode object.

   Expects the enclosing function to provide a PyObject *str scratch
   variable, a PyObject *list and an onError label; control jumps to
   onError when the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that it behaves as a single statement; invoke
   it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2795
2796 static
2797 PyObject *split_whitespace(PyUnicodeObject *self,
2798                            PyObject *list,
2799                            int maxcount)
2800 {
2801     register int i;
2802     register int j;
2803     int len = self->length;
2804     PyObject *str;
2805
2806     for (i = j = 0; i < len; ) {
2807         /* find a token */
2808         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2809             i++;
2810         j = i;
2811         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2812             i++;
2813         if (j < i) {
2814             if (maxcount-- <= 0)
2815                 break;
2816             SPLIT_APPEND(self->str, j, i);
2817             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2818                 i++;
2819             j = i;
2820         }
2821     }
2822     if (j < len) {
2823         SPLIT_APPEND(self->str, j, len);
2824     }
2825     return list;
2826
2827  onError:
2828     Py_DECREF(list);
2829     return NULL;
2830 }
2831
2832 PyObject *PyUnicode_Splitlines(PyObject *string,
2833                                int keepends)
2834 {
2835     register int i;
2836     register int j;
2837     int len;
2838     PyObject *list;
2839     PyObject *str;
2840     Py_UNICODE *data;
2841
2842     string = PyUnicode_FromObject(string);
2843     if (string == NULL)
2844         return NULL;
2845     data = PyUnicode_AS_UNICODE(string);
2846     len = PyUnicode_GET_SIZE(string);
2847
2848     list = PyList_New(0);
2849     if (!list)
2850         goto onError;
2851
2852     for (i = j = 0; i < len; ) {
2853         int eol;
2854
2855         /* Find a line and append it */
2856         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2857             i++;
2858
2859         /* Skip the line break reading CRLF as one line break */
2860         eol = i;
2861         if (i < len) {
2862             if (data[i] == '\r' && i + 1 < len &&
2863                 data[i+1] == '\n')
2864                 i += 2;
2865             else
2866                 i++;
2867             if (keepends)
2868                 eol = i;
2869         }
2870         SPLIT_APPEND(data, j, eol);
2871         j = i;
2872     }
2873     if (j < len) {
2874         SPLIT_APPEND(data, j, len);
2875     }
2876
2877     Py_DECREF(string);
2878     return list;
2879
2880  onError:
2881     Py_DECREF(list);
2882     Py_DECREF(string);
2883     return NULL;
2884 }
2885
2886 static
2887 PyObject *split_char(PyUnicodeObject *self,
2888                      PyObject *list,
2889                      Py_UNICODE ch,
2890                      int maxcount)
2891 {
2892     register int i;
2893     register int j;
2894     int len = self->length;
2895     PyObject *str;
2896
2897     for (i = j = 0; i < len; ) {
2898         if (self->str[i] == ch) {
2899             if (maxcount-- <= 0)
2900                 break;
2901             SPLIT_APPEND(self->str, j, i);
2902             i = j = i + 1;
2903         } else
2904             i++;
2905     }
2906     if (j <= len) {
2907         SPLIT_APPEND(self->str, j, len);
2908     }
2909     return list;
2910
2911  onError:
2912     Py_DECREF(list);
2913     return NULL;
2914 }
2915
2916 static
2917 PyObject *split_substring(PyUnicodeObject *self,
2918                           PyObject *list,
2919                           PyUnicodeObject *substring,
2920                           int maxcount)
2921 {
2922     register int i;
2923     register int j;
2924     int len = self->length;
2925     int sublen = substring->length;
2926     PyObject *str;
2927
2928     for (i = j = 0; i < len - sublen; ) {
2929         if (Py_UNICODE_MATCH(self, i, substring)) {
2930             if (maxcount-- <= 0)
2931                 break;
2932             SPLIT_APPEND(self->str, j, i);
2933             i = j = i + sublen;
2934         } else
2935             i++;
2936     }
2937     if (j <= len) {
2938         SPLIT_APPEND(self->str, j, len);
2939     }
2940     return list;
2941
2942  onError:
2943     Py_DECREF(list);
2944     return NULL;
2945 }
2946
2947 #undef SPLIT_APPEND
2948
2949 static
2950 PyObject *split(PyUnicodeObject *self,
2951                 PyUnicodeObject *substring,
2952                 int maxcount)
2953 {
2954     PyObject *list;
2955
2956     if (maxcount < 0)
2957         maxcount = INT_MAX;
2958
2959     list = PyList_New(0);
2960     if (!list)
2961         return NULL;
2962
2963     if (substring == NULL)
2964         return split_whitespace(self,list,maxcount);
2965
2966     else if (substring->length == 1)
2967         return split_char(self,list,substring->str[0],maxcount);
2968
2969     else if (substring->length == 0) {
2970         Py_DECREF(list);
2971         PyErr_SetString(PyExc_ValueError, "empty separator");
2972         return NULL;
2973     }
2974     else
2975         return split_substring(self,list,substring,maxcount);
2976 }
2977
2978 static
2979 PyObject *strip(PyUnicodeObject *self,
2980                 int left,
2981                 int right)
2982 {
2983     Py_UNICODE *p = self->str;
2984     int start = 0;
2985     int end = self->length;
2986
2987     if (left)
2988         while (start < end && Py_UNICODE_ISSPACE(p[start]))
2989             start++;
2990
2991     if (right)
2992         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2993             end--;
2994
2995     if (start == 0 && end == self->length) {
2996         /* couldn't strip anything off, return original string */
2997         Py_INCREF(self);
2998         return (PyObject*) self;
2999     }
3000
3001     return (PyObject*) PyUnicode_FromUnicode(
3002         self->str + start,
3003         end - start
3004         );
3005 }
3006
3007 static
3008 PyObject *replace(PyUnicodeObject *self,
3009                   PyUnicodeObject *str1,
3010                   PyUnicodeObject *str2,
3011                   int maxcount)
3012 {
3013     PyUnicodeObject *u;
3014
3015     if (maxcount < 0)
3016         maxcount = INT_MAX;
3017
3018     if (str1->length == 1 && str2->length == 1) {
3019         int i;
3020
3021         /* replace characters */
3022         if (!findchar(self->str, self->length, str1->str[0])) {
3023             /* nothing to replace, return original string */
3024             Py_INCREF(self);
3025             u = self;
3026         } else {
3027             Py_UNICODE u1 = str1->str[0];
3028             Py_UNICODE u2 = str2->str[0];
3029
3030             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3031                 self->str,
3032                 self->length
3033                 );
3034             if (u)
3035                 for (i = 0; i < u->length; i++)
3036                     if (u->str[i] == u1) {
3037                         if (--maxcount < 0)
3038                             break;
3039                         u->str[i] = u2;
3040                     }
3041         }
3042
3043     } else {
3044         int n, i;
3045         Py_UNICODE *p;
3046
3047         /* replace strings */
3048         n = count(self, 0, self->length, str1);
3049         if (n > maxcount)
3050             n = maxcount;
3051         if (n == 0) {
3052             /* nothing to replace, return original string */
3053             Py_INCREF(self);
3054             u = self;
3055         } else {
3056             u = _PyUnicode_New(
3057                 self->length + n * (str2->length - str1->length));
3058             if (u) {
3059                 i = 0;
3060                 p = u->str;
3061                 while (i <= self->length - str1->length)
3062                     if (Py_UNICODE_MATCH(self, i, str1)) {
3063                         /* replace string segment */
3064                         Py_UNICODE_COPY(p, str2->str, str2->length);
3065                         p += str2->length;
3066                         i += str1->length;
3067                         if (--n <= 0) {
3068                             /* copy remaining part */
3069                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3070                             break;
3071                         }
3072                     } else
3073                         *p++ = self->str[i++];
3074             }
3075         }
3076     }
3077
3078     return (PyObject *) u;
3079 }
3080
3081 /* --- Unicode Object Methods --------------------------------------------- */
3082
3083 static char title__doc__[] =
3084 "S.title() -> unicode\n\
3085 \n\
3086 Return a titlecased version of S, i.e. words start with title case\n\
3087 characters, all remaining cased characters have lower case.";
3088
3089 static PyObject*
3090 unicode_title(PyUnicodeObject *self, PyObject *args)
3091 {
3092     if (!PyArg_NoArgs(args))
3093         return NULL;
3094     return fixup(self, fixtitle);
3095 }
3096
3097 static char capitalize__doc__[] =
3098 "S.capitalize() -> unicode\n\
3099 \n\
3100 Return a capitalized version of S, i.e. make the first character\n\
3101 have upper case.";
3102
3103 static PyObject*
3104 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3105 {
3106     if (!PyArg_NoArgs(args))
3107         return NULL;
3108     return fixup(self, fixcapitalize);
3109 }
3110
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split self with split(self,
   NULL, -1), run every piece through fixup(..., fixcapitalize) and join
   the pieces again with PyUnicode_Join(NULL, list). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *word;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    k = 0;
    while (k < PyList_GET_SIZE(words)) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                     fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the capitalized word in for the original list item */
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, word);
        k++;
    }

    word = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
3151
3152 static char center__doc__[] =
3153 "S.center(width) -> unicode\n\
3154 \n\
3155 Return S centered in a Unicode string of length width. Padding is done\n\
3156 using spaces.";
3157
3158 static PyObject *
3159 unicode_center(PyUnicodeObject *self, PyObject *args)
3160 {
3161     int marg, left;
3162     int width;
3163
3164     if (!PyArg_ParseTuple(args, "i:center", &width))
3165         return NULL;
3166
3167     if (self->length >= width) {
3168         Py_INCREF(self);
3169         return (PyObject*) self;
3170     }
3171
3172     marg = width - self->length;
3173     left = marg / 2 + (marg & width & 1);
3174
3175     return (PyObject*) pad(self, left, marg - left, ' ');
3176 }
3177
3178 #if 0
3179
3180 /* This code should go into some future Unicode collation support
3181    module. The basic comparison should compare ordinals on a naive
3182    basis (this is what Java does and thus JPython too). */
3183
3184 /* speedy UTF-16 code point order comparison */
3185 /* gleaned from: */
3186 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3187
3188 static short utf16Fixup[32] =
3189 {
3190     0, 0, 0, 0, 0, 0, 0, 0,
3191     0, 0, 0, 0, 0, 0, 0, 0,
3192     0, 0, 0, 0, 0, 0, 0, 0,
3193     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3194 };
3195
3196 static int
3197 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3198 {
3199     int len1, len2;
3200
3201     Py_UNICODE *s1 = str1->str;
3202     Py_UNICODE *s2 = str2->str;
3203
3204     len1 = str1->length;
3205     len2 = str2->length;
3206
3207     while (len1 > 0 && len2 > 0) {
3208         Py_UNICODE c1, c2;
3209         long diff;
3210
3211         c1 = *s1++;
3212         c2 = *s2++;
3213         if (c1 > (1<<11) * 26)
3214             c1 += utf16Fixup[c1>>11];
3215         if (c2 > (1<<11) * 26)
3216             c2 += utf16Fixup[c2>>11];
3217
3218         /* now c1 and c2 are in UTF-32-compatible order */
3219         diff = (long)c1 - (long)c2;
3220         if (diff)
3221             return (diff < 0) ? -1 : (diff != 0);
3222         len1--; len2--;
3223     }
3224
3225     return (len1 < len2) ? -1 : (len1 != len2);
3226 }
3227
3228 #else
3229
3230 static int
3231 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3232 {
3233     register int len1, len2;
3234
3235     Py_UNICODE *s1 = str1->str;
3236     Py_UNICODE *s2 = str2->str;
3237
3238     len1 = str1->length;
3239     len2 = str2->length;
3240
3241     while (len1 > 0 && len2 > 0) {
3242         register long diff;
3243
3244         diff = (long)*s1++ - (long)*s2++;
3245         if (diff)
3246             return (diff < 0) ? -1 : (diff != 0);
3247         len1--; len2--;
3248     }
3249
3250     return (len1 < len2) ? -1 : (len1 != len2);
3251 }
3252
3253 #endif
3254
3255 int PyUnicode_Compare(PyObject *left,
3256                       PyObject *right)
3257 {
3258     PyUnicodeObject *u = NULL, *v = NULL;
3259     int result;
3260
3261     /* Coerce the two arguments */
3262     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3263     if (u == NULL)
3264         goto onError;
3265     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3266     if (v == NULL)
3267         goto onError;
3268
3269     /* Shortcut for empty or interned objects */
3270     if (v == u) {
3271         Py_DECREF(u);
3272         Py_DECREF(v);
3273         return 0;
3274     }
3275
3276     result = unicode_compare(u, v);
3277
3278     Py_DECREF(u);
3279     Py_DECREF(v);
3280     return result;
3281
3282 onError:
3283     Py_XDECREF(u);
3284     Py_XDECREF(v);
3285     return -1;
3286 }
3287
3288 int PyUnicode_Contains(PyObject *container,
3289                        PyObject *element)
3290 {
3291     PyUnicodeObject *u = NULL, *v = NULL;
3292     int result;
3293     register const Py_UNICODE *p, *e;
3294     register Py_UNICODE ch;
3295
3296     /* Coerce the two arguments */
3297     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3298     if (v == NULL) {
3299         PyErr_SetString(PyExc_TypeError,
3300             "'in <string>' requires character as left operand");
3301         goto onError;
3302     }
3303     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3304     if (u == NULL) {
3305         Py_DECREF(v);
3306         goto onError;
3307     }
3308
3309     /* Check v in u */
3310     if (PyUnicode_GET_SIZE(v) != 1) {
3311         PyErr_SetString(PyExc_TypeError,
3312             "'in <string>' requires character as left operand");
3313         goto onError;
3314     }
3315     ch = *PyUnicode_AS_UNICODE(v);
3316     p = PyUnicode_AS_UNICODE(u);
3317     e = p + PyUnicode_GET_SIZE(u);
3318     result = 0;
3319     while (p < e) {
3320         if (*p++ == ch) {
3321             result = 1;
3322             break;
3323         }
3324     }
3325
3326     Py_DECREF(u);
3327     Py_DECREF(v);
3328     return result;
3329
3330 onError:
3331     Py_XDECREF(u);
3332     Py_XDECREF(v);
3333     return -1;
3334 }
3335
3336 /* Concat to string or Unicode object giving a new Unicode object. */
3337
3338 PyObject *PyUnicode_Concat(PyObject *left,
3339                            PyObject *right)
3340 {
3341     PyUnicodeObject *u = NULL, *v = NULL, *w;
3342
3343     /* Coerce the two arguments */
3344     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3345     if (u == NULL)
3346         goto onError;
3347     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3348     if (v == NULL)
3349         goto onError;
3350
3351     /* Shortcuts */
3352     if (v == unicode_empty) {
3353         Py_DECREF(v);
3354         return (PyObject *)u;
3355     }
3356     if (u == unicode_empty) {
3357         Py_DECREF(u);
3358         return (PyObject *)v;
3359     }
3360
3361     /* Concat the two Unicode strings */
3362     w = _PyUnicode_New(u->length + v->length);
3363     if (w == NULL)
3364         goto onError;
3365     Py_UNICODE_COPY(w->str, u->str, u->length);
3366     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3367
3368     Py_DECREF(u);
3369     Py_DECREF(v);
3370     return (PyObject *)w;
3371
3372 onError:
3373     Py_XDECREF(u);
3374     Py_XDECREF(v);
3375     return NULL;
3376 }
3377
3378 static char count__doc__[] =
3379 "S.count(sub[, start[, end]]) -> int\n\
3380 \n\
3381 Return the number of occurrences of substring sub in Unicode string\n\
3382 S[start:end].  Optional arguments start and end are\n\
3383 interpreted as in slice notation.";
3384
3385 static PyObject *
3386 unicode_count(PyUnicodeObject *self, PyObject *args)
3387 {
3388     PyUnicodeObject *substring;
3389     int start = 0;
3390     int end = INT_MAX;
3391     PyObject *result;
3392
3393     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3394                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3395         return NULL;
3396
3397     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3398                                                 (PyObject *)substring);
3399     if (substring == NULL)
3400         return NULL;
3401
3402     if (start < 0)
3403         start += self->length;
3404     if (start < 0)
3405         start = 0;
3406     if (end > self->length)
3407         end = self->length;
3408     if (end < 0)
3409         end += self->length;
3410     if (end < 0)
3411         end = 0;
3412
3413     result = PyInt_FromLong((long) count(self, start, end, substring));
3414
3415     Py_DECREF(substring);
3416     return result;
3417 }
3418
3419 static char encode__doc__[] =
3420 "S.encode([encoding[,errors]]) -> string\n\
3421 \n\
3422 Return an encoded string version of S. Default encoding is the current\n\
3423 default string encoding. errors may be given to set a different error\n\
3424 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3425 a ValueError. Other possible values are 'ignore' and 'replace'.";
3426
3427 static PyObject *
3428 unicode_encode(PyUnicodeObject *self, PyObject *args)
3429 {
3430     char *encoding = NULL;
3431     char *errors = NULL;
3432     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3433         return NULL;
3434     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3435 }
3436
3437 static char expandtabs__doc__[] =
3438 "S.expandtabs([tabsize]) -> unicode\n\
3439 \n\
3440 Return a copy of S where all tab characters are expanded using spaces.\n\
3441 If tabsize is not given, a tab size of 8 characters is assumed.";
3442
3443 static PyObject*
3444 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3445 {
3446     Py_UNICODE *e;
3447     Py_UNICODE *p;
3448     Py_UNICODE *q;
3449     int i, j;
3450     PyUnicodeObject *u;
3451     int tabsize = 8;
3452
3453     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3454         return NULL;
3455
3456     /* First pass: determine size of output string */
3457     i = j = 0;
3458     e = self->str + self->length;
3459     for (p = self->str; p < e; p++)
3460         if (*p == '\t') {
3461             if (tabsize > 0)
3462                 j += tabsize - (j % tabsize);
3463         }
3464         else {
3465             j++;
3466             if (*p == '\n' || *p == '\r') {
3467                 i += j;
3468                 j = 0;
3469             }
3470         }
3471
3472     /* Second pass: create output string and fill it */
3473     u = _PyUnicode_New(i + j);
3474     if (!u)
3475         return NULL;
3476
3477     j = 0;
3478     q = u->str;
3479
3480     for (p = self->str; p < e; p++)
3481         if (*p == '\t') {
3482             if (tabsize > 0) {
3483                 i = tabsize - (j % tabsize);
3484                 j += i;
3485                 while (i--)
3486                     *q++ = ' ';
3487             }
3488         }
3489         else {
3490             j++;
3491             *q++ = *p;
3492             if (*p == '\n' || *p == '\r')
3493                 j = 0;
3494         }
3495
3496     return (PyObject*) u;
3497 }
3498
3499 static char find__doc__[] =
3500 "S.find(sub [,start [,end]]) -> int\n\
3501 \n\
3502 Return the lowest index in S where substring sub is found,\n\
3503 such that sub is contained within s[start,end].  Optional\n\
3504 arguments start and end are interpreted as in slice notation.\n\
3505 \n\
3506 Return -1 on failure.";
3507
3508 static PyObject *
3509 unicode_find(PyUnicodeObject *self, PyObject *args)
3510 {
3511     PyUnicodeObject *substring;
3512     int start = 0;
3513     int end = INT_MAX;
3514     PyObject *result;
3515
3516     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3517                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3518         return NULL;
3519     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3520                                                 (PyObject *)substring);
3521     if (substring == NULL)
3522         return NULL;
3523
3524     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3525
3526     Py_DECREF(substring);
3527     return result;
3528 }
3529
3530 static PyObject *
3531 unicode_getitem(PyUnicodeObject *self, int index)
3532 {
3533     if (index < 0 || index >= self->length) {
3534         PyErr_SetString(PyExc_IndexError, "string index out of range");
3535         return NULL;
3536     }
3537
3538     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3539 }
3540
3541 static long
3542 unicode_hash(PyUnicodeObject *self)
3543 {
3544     /* Since Unicode objects compare equal to their ASCII string
3545        counterparts, they should use the individual character values
3546        as basis for their hash value.  This is needed to assure that
3547        strings and Unicode objects behave in the same way as
3548        dictionary keys. */
3549
3550     register int len;
3551     register Py_UNICODE *p;
3552     register long x;
3553
3554     if (self->hash != -1)
3555         return self->hash;
3556     len = PyUnicode_GET_SIZE(self);
3557     p = PyUnicode_AS_UNICODE(self);
3558     x = *p << 7;
3559     while (--len >= 0)
3560         x = (1000003*x) ^ *p++;
3561     x ^= PyUnicode_GET_SIZE(self);
3562     if (x == -1)
3563         x = -2;
3564     self->hash = x;
3565     return x;
3566 }
3567
3568 static char index__doc__[] =
3569 "S.index(sub [,start [,end]]) -> int\n\
3570 \n\
3571 Like S.find() but raise ValueError when the substring is not found.";
3572
3573 static PyObject *
3574 unicode_index(PyUnicodeObject *self, PyObject *args)
3575 {
3576     int result;
3577     PyUnicodeObject *substring;
3578     int start = 0;
3579     int end = INT_MAX;
3580
3581     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3582                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3583         return NULL;
3584
3585     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3586                                                 (PyObject *)substring);
3587     if (substring == NULL)
3588         return NULL;
3589
3590     result = findstring(self, substring, start, end, 1);
3591
3592     Py_DECREF(substring);
3593     if (result < 0) {
3594         PyErr_SetString(PyExc_ValueError, "substring not found");
3595         return NULL;
3596     }
3597     return PyInt_FromLong(result);
3598 }
3599
3600 static char islower__doc__[] =
3601 "S.islower() -> int\n\
3602 \n\
3603 Return 1 if  all cased characters in S are lowercase and there is\n\
3604 at least one cased character in S, 0 otherwise.";
3605
3606 static PyObject*
3607 unicode_islower(PyUnicodeObject *self, PyObject *args)
3608 {
3609     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3610     register const Py_UNICODE *e;
3611     int cased;
3612
3613     if (!PyArg_NoArgs(args))
3614         return NULL;
3615
3616     /* Shortcut for single character strings */
3617     if (PyUnicode_GET_SIZE(self) == 1)
3618         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3619
3620     /* Special case for empty strings */
3621     if (PyString_GET_SIZE(self) == 0)
3622         return PyInt_FromLong(0);
3623
3624     e = p + PyUnicode_GET_SIZE(self);
3625     cased = 0;
3626     for (; p < e; p++) {
3627         register const Py_UNICODE ch = *p;
3628
3629         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3630             return PyInt_FromLong(0);
3631         else if (!cased && Py_UNICODE_ISLOWER(ch))
3632             cased = 1;
3633     }
3634     return PyInt_FromLong(cased);
3635 }
3636
3637 static char isupper__doc__[] =
3638 "S.isupper() -> int\n\
3639 \n\
3640 Return 1 if  all cased characters in S are uppercase and there is\n\
3641 at least one cased character in S, 0 otherwise.";
3642
3643 static PyObject*
3644 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3645 {
3646     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3647     register const Py_UNICODE *e;
3648     int cased;
3649
3650     if (!PyArg_NoArgs(args))
3651         return NULL;
3652
3653     /* Shortcut for single character strings */
3654     if (PyUnicode_GET_SIZE(self) == 1)
3655         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3656
3657     /* Special case for empty strings */
3658     if (PyString_GET_SIZE(self) == 0)
3659         return PyInt_FromLong(0);
3660
3661     e = p + PyUnicode_GET_SIZE(self);
3662     cased = 0;
3663     for (; p < e; p++) {
3664         register const Py_UNICODE ch = *p;
3665
3666         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3667             return PyInt_FromLong(0);
3668         else if (!cased && Py_UNICODE_ISUPPER(ch))
3669             cased = 1;
3670     }
3671     return PyInt_FromLong(cased);
3672 }
3673
3674 static char istitle__doc__[] =
3675 "S.istitle() -> int\n\
3676 \n\
3677 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3678 may only follow uncased characters and lowercase characters only cased\n\
3679 ones. Return 0 otherwise.";
3680
3681 static PyObject*
3682 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3683 {
3684     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3685     register const Py_UNICODE *e;
3686     int cased, previous_is_cased;
3687
3688     if (!PyArg_NoArgs(args))
3689         return NULL;
3690
3691     /* Shortcut for single character strings */
3692     if (PyUnicode_GET_SIZE(self) == 1)
3693         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3694                               (Py_UNICODE_ISUPPER(*p) != 0));
3695
3696     /* Special case for empty strings */
3697     if (PyString_GET_SIZE(self) == 0)
3698         return PyInt_FromLong(0);
3699
3700     e = p + PyUnicode_GET_SIZE(self);
3701     cased = 0;
3702     previous_is_cased = 0;
3703     for (; p < e; p++) {
3704         register const Py_UNICODE ch = *p;
3705
3706         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3707             if (previous_is_cased)
3708                 return PyInt_FromLong(0);
3709             previous_is_cased = 1;
3710             cased = 1;
3711         }
3712         else if (Py_UNICODE_ISLOWER(ch)) {
3713             if (!previous_is_cased)
3714                 return PyInt_FromLong(0);
3715             previous_is_cased = 1;
3716             cased = 1;
3717         }
3718         else
3719             previous_is_cased = 0;
3720     }
3721     return PyInt_FromLong(cased);
3722 }
3723
3724 static char isspace__doc__[] =
3725 "S.isspace() -> int\n\
3726 \n\
3727 Return 1 if there are only whitespace characters in S,\n\
3728 0 otherwise.";
3729
3730 static PyObject*
3731 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3732 {
3733     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3734     register const Py_UNICODE *e;
3735
3736     if (!PyArg_NoArgs(args))
3737         return NULL;
3738
3739     /* Shortcut for single character strings */
3740     if (PyUnicode_GET_SIZE(self) == 1 &&
3741         Py_UNICODE_ISSPACE(*p))
3742         return PyInt_FromLong(1);
3743
3744     /* Special case for empty strings */
3745     if (PyString_GET_SIZE(self) == 0)
3746         return PyInt_FromLong(0);
3747
3748     e = p + PyUnicode_GET_SIZE(self);
3749     for (; p < e; p++) {
3750         if (!Py_UNICODE_ISSPACE(*p))
3751             return PyInt_FromLong(0);
3752     }
3753     return PyInt_FromLong(1);
3754 }
3755
3756 static char isalpha__doc__[] =
3757 "S.isalpha() -> int\n\
3758 \n\
3759 Return 1 if  all characters in S are alphabetic\n\
3760 and there is at least one character in S, 0 otherwise.";
3761
3762 static PyObject*
3763 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3764 {
3765     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3766     register const Py_UNICODE *e;
3767
3768     if (!PyArg_NoArgs(args))
3769         return NULL;
3770
3771     /* Shortcut for single character strings */
3772     if (PyUnicode_GET_SIZE(self) == 1 &&
3773         Py_UNICODE_ISALPHA(*p))
3774         return PyInt_FromLong(1);
3775
3776     /* Special case for empty strings */
3777     if (PyString_GET_SIZE(self) == 0)
3778         return PyInt_FromLong(0);
3779
3780     e = p + PyUnicode_GET_SIZE(self);
3781     for (; p < e; p++) {
3782         if (!Py_UNICODE_ISALPHA(*p))
3783             return PyInt_FromLong(0);
3784     }
3785     return PyInt_FromLong(1);
3786 }
3787
3788 static char isalnum__doc__[] =
3789 "S.isalnum() -> int\n\
3790 \n\
3791 Return 1 if  all characters in S are alphanumeric\n\
3792 and there is at least one character in S, 0 otherwise.";
3793
3794 static PyObject*
3795 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3796 {
3797     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3798     register const Py_UNICODE *e;
3799
3800     if (!PyArg_NoArgs(args))
3801         return NULL;
3802
3803     /* Shortcut for single character strings */
3804     if (PyUnicode_GET_SIZE(self) == 1 &&
3805         Py_UNICODE_ISALNUM(*p))
3806         return PyInt_FromLong(1);
3807
3808     /* Special case for empty strings */
3809     if (PyString_GET_SIZE(self) == 0)
3810         return PyInt_FromLong(0);
3811
3812     e = p + PyUnicode_GET_SIZE(self);
3813     for (; p < e; p++) {
3814         if (!Py_UNICODE_ISALNUM(*p))
3815             return PyInt_FromLong(0);
3816     }
3817     return PyInt_FromLong(1);
3818 }
3819
3820 static char isdecimal__doc__[] =
3821 "S.isdecimal() -> int\n\
3822 \n\
3823 Return 1 if there are only decimal characters in S,\n\
3824 0 otherwise.";
3825
3826 static PyObject*
3827 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3828 {
3829     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3830     register const Py_UNICODE *e;
3831
3832     if (!PyArg_NoArgs(args))
3833         return NULL;
3834
3835     /* Shortcut for single character strings */
3836     if (PyUnicode_GET_SIZE(self) == 1 &&
3837         Py_UNICODE_ISDECIMAL(*p))
3838         return PyInt_FromLong(1);
3839
3840     /* Special case for empty strings */
3841     if (PyString_GET_SIZE(self) == 0)
3842         return PyInt_FromLong(0);
3843
3844     e = p + PyUnicode_GET_SIZE(self);
3845     for (; p < e; p++) {
3846         if (!Py_UNICODE_ISDECIMAL(*p))
3847             return PyInt_FromLong(0);
3848     }
3849     return PyInt_FromLong(1);
3850 }
3851
3852 static char isdigit__doc__[] =
3853 "S.isdigit() -> int\n\
3854 \n\
3855 Return 1 if there are only digit characters in S,\n\
3856 0 otherwise.";
3857
3858 static PyObject*
3859 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3860 {
3861     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862     register const Py_UNICODE *e;
3863
3864     if (!PyArg_NoArgs(args))
3865         return NULL;
3866
3867     /* Shortcut for single character strings */
3868     if (PyUnicode_GET_SIZE(self) == 1 &&
3869         Py_UNICODE_ISDIGIT(*p))
3870         return PyInt_FromLong(1);
3871
3872     /* Special case for empty strings */
3873     if (PyString_GET_SIZE(self) == 0)
3874         return PyInt_FromLong(0);
3875
3876     e = p + PyUnicode_GET_SIZE(self);
3877     for (; p < e; p++) {
3878         if (!Py_UNICODE_ISDIGIT(*p))
3879             return PyInt_FromLong(0);
3880     }
3881     return PyInt_FromLong(1);
3882 }
3883
3884 static char isnumeric__doc__[] =
3885 "S.isnumeric() -> int\n\
3886 \n\
3887 Return 1 if there are only numeric characters in S,\n\
3888 0 otherwise.";
3889
3890 static PyObject*
3891 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3892 {
3893     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894     register const Py_UNICODE *e;
3895
3896     if (!PyArg_NoArgs(args))
3897         return NULL;
3898
3899     /* Shortcut for single character strings */
3900     if (PyUnicode_GET_SIZE(self) == 1 &&
3901         Py_UNICODE_ISNUMERIC(*p))
3902         return PyInt_FromLong(1);
3903
3904     /* Special case for empty strings */
3905     if (PyString_GET_SIZE(self) == 0)
3906         return PyInt_FromLong(0);
3907
3908     e = p + PyUnicode_GET_SIZE(self);
3909     for (; p < e; p++) {
3910         if (!Py_UNICODE_ISNUMERIC(*p))
3911             return PyInt_FromLong(0);
3912     }
3913     return PyInt_FromLong(1);
3914 }
3915
3916 static char join__doc__[] =
3917 "S.join(sequence) -> unicode\n\
3918 \n\
3919 Return a string which is the concatenation of the strings in the\n\
3920 sequence.  The separator between elements is S.";
3921
3922 static PyObject*
3923 unicode_join(PyUnicodeObject *self, PyObject *args)
3924 {
3925     PyObject *data;
3926     if (!PyArg_ParseTuple(args, "O:join", &data))
3927         return NULL;
3928
3929     return PyUnicode_Join((PyObject *)self, data);
3930 }
3931
3932 static int
3933 unicode_length(PyUnicodeObject *self)
3934 {
3935     return self->length;
3936 }
3937
3938 static char ljust__doc__[] =
3939 "S.ljust(width) -> unicode\n\
3940 \n\
3941 Return S left justified in a Unicode string of length width. Padding is\n\
3942 done using spaces.";
3943
3944 static PyObject *
3945 unicode_ljust(PyUnicodeObject *self, PyObject *args)
3946 {
3947     int width;
3948     if (!PyArg_ParseTuple(args, "i:ljust", &width))
3949         return NULL;
3950
3951     if (self->length >= width) {
3952         Py_INCREF(self);
3953         return (PyObject*) self;
3954     }
3955
3956     return (PyObject*) pad(self, 0, width - self->length, ' ');
3957 }
3958
3959 static char lower__doc__[] =
3960 "S.lower() -> unicode\n\
3961 \n\
3962 Return a copy of the string S converted to lowercase.";
3963
3964 static PyObject*
3965 unicode_lower(PyUnicodeObject *self, PyObject *args)
3966 {
3967     if (!PyArg_NoArgs(args))
3968         return NULL;
3969     return fixup(self, fixlower);
3970 }
3971
3972 static char lstrip__doc__[] =
3973 "S.lstrip() -> unicode\n\
3974 \n\
3975 Return a copy of the string S with leading whitespace removed.";
3976
3977 static PyObject *
3978 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3979 {
3980     if (!PyArg_NoArgs(args))
3981         return NULL;
3982     return strip(self, 1, 0);
3983 }
3984
3985 static PyObject*
3986 unicode_repeat(PyUnicodeObject *str, int len)
3987 {
3988     PyUnicodeObject *u;
3989     Py_UNICODE *p;
3990     int nchars;
3991     size_t nbytes;
3992
3993     if (len < 0)
3994         len = 0;
3995
3996     if (len == 1) {
3997         /* no repeat, return original string */
3998         Py_INCREF(str);
3999         return (PyObject*) str;
4000     }
4001
4002     /* ensure # of chars needed doesn't overflow int and # of bytes
4003      * needed doesn't overflow size_t
4004      */
4005     nchars = len * str->length;
4006     if (len && nchars / len != str->length) {
4007         PyErr_SetString(PyExc_OverflowError,
4008                         "repeated string is too long");
4009         return NULL;
4010     }
4011     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4012     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4013         PyErr_SetString(PyExc_OverflowError,
4014                         "repeated string is too long");
4015         return NULL;
4016     }
4017     u = _PyUnicode_New(nchars);
4018     if (!u)
4019         return NULL;
4020
4021     p = u->str;
4022
4023     while (len-- > 0) {
4024         Py_UNICODE_COPY(p, str->str, str->length);
4025         p += str->length;
4026     }
4027
4028     return (PyObject*) u;
4029 }
4030
4031 PyObject *PyUnicode_Replace(PyObject *obj,
4032                             PyObject *subobj,
4033                             PyObject *replobj,
4034                             int maxcount)
4035 {
4036     PyObject *self;
4037     PyObject *str1;
4038     PyObject *str2;
4039     PyObject *result;
4040
4041     self = PyUnicode_FromObject(obj);
4042     if (self == NULL)
4043         return NULL;
4044     str1 = PyUnicode_FromObject(subobj);
4045     if (str1 == NULL) {
4046         Py_DECREF(self);
4047         return NULL;
4048     }
4049     str2 = PyUnicode_FromObject(replobj);
4050     if (str2 == NULL) {
4051         Py_DECREF(self);
4052         Py_DECREF(str1);
4053         return NULL;
4054     }
4055     result = replace((PyUnicodeObject *)self,
4056                      (PyUnicodeObject *)str1,
4057                      (PyUnicodeObject *)str2,
4058                      maxcount);
4059     Py_DECREF(self);
4060     Py_DECREF(str1);
4061     Py_DECREF(str2);
4062     return result;
4063 }
4064
/* Docstring for unicode.replace().  The optional third argument is
   described under the name the implementation uses for it (maxcount);
   it limits how many occurrences are replaced. */
static char replace__doc__[] =
"S.replace(old, new[, maxcount]) -> unicode\n\
\n\
Return a copy of S with all occurrences of substring\n\
old replaced by new.  If the optional argument maxcount is\n\
given, only the first maxcount occurrences are replaced.";
4071
4072 static PyObject*
4073 unicode_replace(PyUnicodeObject *self, PyObject *args)
4074 {
4075     PyUnicodeObject *str1;
4076     PyUnicodeObject *str2;
4077     int maxcount = -1;
4078     PyObject *result;
4079
4080     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4081         return NULL;
4082     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4083     if (str1 == NULL)
4084         return NULL;
4085     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4086     if (str2 == NULL)
4087         return NULL;
4088
4089     result = replace(self, str1, str2, maxcount);
4090
4091     Py_DECREF(str1);
4092     Py_DECREF(str2);
4093     return result;
4094 }
4095
4096 static
4097 PyObject *unicode_repr(PyObject *unicode)
4098 {
4099     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4100                                 PyUnicode_GET_SIZE(unicode),
4101                                 1);
4102 }
4103
/* Docstring for unicode.rfind().  The searched range is written in
   slice notation, S[start:end], matching the sentence that follows it. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end].  Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4112
4113 static PyObject *
4114 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4115 {
4116     PyUnicodeObject *substring;
4117     int start = 0;
4118     int end = INT_MAX;
4119     PyObject *result;
4120
4121     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4122                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4123         return NULL;
4124     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4125                                                 (PyObject *)substring);
4126     if (substring == NULL)
4127         return NULL;
4128
4129     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4130
4131     Py_DECREF(substring);
4132     return result;
4133 }
4134
/* Docstring for unicode.rindex(). */
static char rindex__doc__[] =
    "S.rindex(sub [,start [,end]]) -> int\n"
    "\n"
    "Like S.rfind() but raise ValueError when the substring is not found.";
4139
4140 static PyObject *
4141 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4142 {
4143     int result;
4144     PyUnicodeObject *substring;
4145     int start = 0;
4146     int end = INT_MAX;
4147
4148     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4149                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4150         return NULL;
4151     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4152                                                 (PyObject *)substring);
4153     if (substring == NULL)
4154         return NULL;
4155
4156     result = findstring(self, substring, start, end, -1);
4157
4158     Py_DECREF(substring);
4159     if (result < 0) {
4160         PyErr_SetString(PyExc_ValueError, "substring not found");
4161         return NULL;
4162     }
4163     return PyInt_FromLong(result);
4164 }
4165
/* Docstring for unicode.rjust(). */
static char rjust__doc__[] =
    "S.rjust(width) -> unicode\n"
    "\n"
    "Return S right justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4171
4172 static PyObject *
4173 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4174 {
4175     int width;
4176     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4177         return NULL;
4178
4179     if (self->length >= width) {
4180         Py_INCREF(self);
4181         return (PyObject*) self;
4182     }
4183
4184     return (PyObject*) pad(self, width - self->length, 0, ' ');
4185 }
4186
/* Docstring for unicode.rstrip(). */
static char rstrip__doc__[] =
    "S.rstrip() -> unicode\n"
    "\n"
    "Return a copy of the string S with trailing whitespace removed.";
4191
4192 static PyObject *
4193 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4194 {
4195     if (!PyArg_NoArgs(args))
4196         return NULL;
4197     return strip(self, 0, 1);
4198 }
4199
4200 static PyObject*
4201 unicode_slice(PyUnicodeObject *self, int start, int end)
4202 {
4203     /* standard clamping */
4204     if (start < 0)
4205         start = 0;
4206     if (end < 0)
4207         end = 0;
4208     if (end > self->length)
4209         end = self->length;
4210     if (start == 0 && end == self->length) {
4211         /* full slice, return original string */
4212         Py_INCREF(self);
4213         return (PyObject*) self;
4214     }
4215     if (start > end)
4216         start = end;
4217     /* copy slice */
4218     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4219                                              end - start);
4220 }
4221
4222 PyObject *PyUnicode_Split(PyObject *s,
4223                           PyObject *sep,
4224                           int maxsplit)
4225 {
4226     PyObject *result;
4227
4228     s = PyUnicode_FromObject(s);
4229     if (s == NULL)
4230         return NULL;
4231     if (sep != NULL) {
4232         sep = PyUnicode_FromObject(sep);
4233         if (sep == NULL) {
4234             Py_DECREF(s);
4235             return NULL;
4236         }
4237     }
4238
4239     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4240
4241     Py_DECREF(s);
4242     Py_XDECREF(sep);
4243     return result;
4244 }
4245
/* Docstring for unicode.split(). */
static char split__doc__[] =
    "S.split([sep [,maxsplit]]) -> list of strings\n"
    "\n"
    "Return a list of the words in S, using sep as the\n"
    "delimiter string.  If maxsplit is given, at most maxsplit\n"
    "splits are done. If sep is not specified, any whitespace string\n"
    "is a separator.";
4253
4254 static PyObject*
4255 unicode_split(PyUnicodeObject *self, PyObject *args)
4256 {
4257     PyObject *substring = Py_None;
4258     int maxcount = -1;
4259
4260     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4261         return NULL;
4262
4263     if (substring == Py_None)
4264         return split(self, NULL, maxcount);
4265     else if (PyUnicode_Check(substring))
4266         return split(self, (PyUnicodeObject *)substring, maxcount);
4267     else
4268         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4269 }
4270
/* Docstring for unicode.splitlines().  The signature line shows the
   single optional keepends argument with balanced brackets. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
4277
4278 static PyObject*
4279 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4280 {
4281     int keepends = 0;
4282
4283     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4284         return NULL;
4285
4286     return PyUnicode_Splitlines((PyObject *)self, keepends);
4287 }
4288
4289 static
4290 PyObject *unicode_str(PyUnicodeObject *self)
4291 {
4292     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4293 }
4294
/* Docstring for unicode.strip(). */
static char strip__doc__[] =
    "S.strip() -> unicode\n"
    "\n"
    "Return a copy of S with leading and trailing whitespace removed.";
4299
4300 static PyObject *
4301 unicode_strip(PyUnicodeObject *self, PyObject *args)
4302 {
4303     if (!PyArg_NoArgs(args))
4304         return NULL;
4305     return strip(self, 1, 1);
4306 }
4307
/* Docstring for unicode.swapcase(). */
static char swapcase__doc__[] =
    "S.swapcase() -> unicode\n"
    "\n"
    "Return a copy of S with uppercase characters converted to lowercase\n"
    "and vice versa.";
4313
4314 static PyObject*
4315 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4316 {
4317     if (!PyArg_NoArgs(args))
4318         return NULL;
4319     return fixup(self, fixswapcase);
4320 }
4321
/* Docstring for unicode.translate(). */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4329
4330 static PyObject*
4331 unicode_translate(PyUnicodeObject *self, PyObject *args)
4332 {
4333     PyObject *table;
4334
4335     if (!PyArg_ParseTuple(args, "O:translate", &table))
4336         return NULL;
4337     return PyUnicode_TranslateCharmap(self->str,
4338                                       self->length,
4339                                       table,
4340                                       "ignore");
4341 }
4342
/* Docstring for unicode.upper(). */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4347
4348 static PyObject*
4349 unicode_upper(PyUnicodeObject *self, PyObject *args)
4350 {
4351     if (!PyArg_NoArgs(args))
4352         return NULL;
4353     return fixup(self, fixupper);
4354 }
4355
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* unicode.zfill(width) method implementation (currently compiled out).

   Returns self (new reference) when it is already at least width
   characters long.  Otherwise left-pads with '0' via pad() and, if the
   first original character is '+' or '-', moves that sign in front of
   the padding.  Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() builds a new object; do not touch u->str if it reported
       failure by returning NULL. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4391
#if 0
/* Debugging helper (compiled out): takes no arguments and returns
   unicode_freelist_size as a Python int, or NULL if arguments were
   supplied. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4401
/* Docstring for unicode.startswith(). */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0.  With\n"
    "optional start, test S beginning at that position.  With optional end, stop\n"
    "comparing S at that position.";
4408
4409 static PyObject *
4410 unicode_startswith(PyUnicodeObject *self,
4411                    PyObject *args)
4412 {
4413     PyUnicodeObject *substring;
4414     int start = 0;
4415     int end = INT_MAX;
4416     PyObject *result;
4417
4418     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4419                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4420         return NULL;
4421     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4422                                                 (PyObject *)substring);
4423     if (substring == NULL)
4424         return NULL;
4425
4426     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4427
4428     Py_DECREF(substring);
4429     return result;
4430 }
4431
4432
/* Docstring for unicode.endswith(). */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0.  With\n"
    "optional start, test S beginning at that position.  With optional end, stop\n"
    "comparing S at that position.";
4439
4440 static PyObject *
4441 unicode_endswith(PyUnicodeObject *self,
4442                  PyObject *args)
4443 {
4444     PyUnicodeObject *substring;
4445     int start = 0;
4446     int end = INT_MAX;
4447     PyObject *result;
4448
4449     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4450                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4451         return NULL;
4452     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4453                                                 (PyObject *)substring);
4454     if (substring == NULL)
4455         return NULL;
4456
4457     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4458
4459     Py_DECREF(substring);
4460     return result;
4461 }
4462
4463
/* Method table for Unicode objects; unicode_getattr() hands it to
   Py_FindMethod() for attribute lookup.  Each entry is
   {name, C function, flag, docstring}.  Among the handlers defined
   nearby, those registered with flag 1 parse their arguments with
   PyArg_ParseTuple() and those registered with flag 0 use
   PyArg_NoArgs().  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled together with their (compiled-out) implementations. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
4516
4517 static PyObject *
4518 unicode_getattr(PyUnicodeObject *self, char *name)
4519 {
4520     return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4521 }
4522
/* Sequence protocol slots for Unicode objects.  The item- and
   slice-assignment slots are left as 0, i.e. no handler is installed
   for them. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4533
4534 static int
4535 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4536                           int index,
4537                           const void **ptr)
4538 {
4539     if (index != 0) {
4540         PyErr_SetString(PyExc_SystemError,
4541                         "accessing non-existent unicode segment");
4542         return -1;
4543     }
4544     *ptr = (void *) self->str;
4545     return PyUnicode_GET_DATA_SIZE(self);
4546 }
4547
4548 static int
4549 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4550                            const void **ptr)
4551 {
4552     PyErr_SetString(PyExc_TypeError,
4553                     "cannot use unicode as modifyable buffer");
4554     return -1;
4555 }
4556
4557 static int
4558 unicode_buffer_getsegcount(PyUnicodeObject *self,
4559                            int *lenp)
4560 {
4561     if (lenp)
4562         *lenp = PyUnicode_GET_DATA_SIZE(self);
4563     return 1;
4564 }
4565
4566 static int
4567 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4568                           int index,
4569                           const void **ptr)
4570 {
4571     PyObject *str;
4572
4573     if (index != 0) {
4574         PyErr_SetString(PyExc_SystemError,
4575                         "accessing non-existent unicode segment");
4576         return -1;
4577     }
4578     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4579     if (str == NULL)
4580         return -1;
4581     *ptr = (void *) PyString_AS_STRING(str);
4582     return PyString_GET_SIZE(str);
4583 }
4584
4585 /* Helpers for PyUnicode_Format() */
4586
4587 static PyObject *
4588 getnextarg(PyObject *args, int arglen, int *p_argidx)
4589 {
4590     int argidx = *p_argidx;
4591     if (argidx < arglen) {
4592         (*p_argidx)++;
4593         if (arglen < 0)
4594             return args;
4595         else
4596             return PyTuple_GetItem(args, argidx);
4597     }
4598     PyErr_SetString(PyExc_TypeError,
4599                     "not enough arguments for format string");
4600     return NULL;
4601 }
4602
/* Conversion flag bits collected by PyUnicode_Format() from the
   characters that follow '%' in a format string. */
#define F_LJUST (1<<0)  /* '-' flag; also set for a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag; formatfloat()/formatint() forward it as "#" */
#define F_ZERO  (1<<4)  /* '0' flag */
4608
4609 static
4610 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4611 {
4612     register int i;
4613     int len;
4614     va_list va;
4615     char *charbuffer;
4616     va_start(va, format);
4617
4618     /* First, format the string as char array, then expand to Py_UNICODE
4619        array. */
4620     charbuffer = (char *)buffer;
4621     len = vsprintf(charbuffer, format, va);
4622     for (i = len - 1; i >= 0; i--)
4623         buffer[i] = (Py_UNICODE) charbuffer[i];
4624
4625     va_end(va);
4626     return len;
4627 }
4628
4629 static int
4630 formatfloat(Py_UNICODE *buf,
4631             size_t buflen,
4632             int flags,
4633             int prec,
4634             int type,
4635             PyObject *v)
4636 {
4637     /* fmt = '%#.' + `prec` + `type`
4638        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4639     char fmt[20];
4640     double x;
4641
4642     x = PyFloat_AsDouble(v);
4643     if (x == -1.0 && PyErr_Occurred())
4644         return -1;
4645     if (prec < 0)
4646         prec = 6;
4647     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4648         type = 'g';
4649     sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4650     /* worst case length calc to ensure no buffer overrun:
4651          fmt = %#.<prec>g
4652          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4653             for any double rep.)
4654          len = 1 + prec + 1 + 2 + 5 = 9 + prec
4655        If prec=0 the effective precision is 1 (the leading digit is
4656        always given), therefore increase by one to 10+prec. */
4657     if (buflen <= (size_t)10 + (size_t)prec) {
4658         PyErr_SetString(PyExc_OverflowError,
4659             "formatted float is too long (precision too long?)");
4660         return -1;
4661     }
4662     return usprintf(buf, fmt, x);
4663 }
4664
4665 static PyObject*
4666 formatlong(PyObject *val, int flags, int prec, int type)
4667 {
4668         char *buf;
4669         int i, len;
4670         PyObject *str; /* temporary string object. */
4671         PyUnicodeObject *result;
4672
4673         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4674         if (!str)
4675                 return NULL;
4676         result = _PyUnicode_New(len);
4677         for (i = 0; i < len; i++)
4678                 result->str[i] = buf[i];
4679         result->str[len] = 0;
4680         Py_DECREF(str);
4681         return (PyObject*)result;
4682 }
4683
4684 static int
4685 formatint(Py_UNICODE *buf,
4686           size_t buflen,
4687           int flags,
4688           int prec,
4689           int type,
4690           PyObject *v)
4691 {
4692     /* fmt = '%#.' + `prec` + 'l' + `type`
4693        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4694        + 1 + 1 = 24*/
4695     char fmt[64]; /* plenty big enough! */
4696     long x;
4697
4698     x = PyInt_AsLong(v);
4699     if (x == -1 && PyErr_Occurred())
4700         return -1;
4701     if (prec < 0)
4702         prec = 1;
4703     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4704        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4705     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4706         PyErr_SetString(PyExc_OverflowError,
4707             "formatted integer is too long (precision too long?)");
4708         return -1;
4709     }
4710     sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4711     return usprintf(buf, fmt, x);
4712 }
4713
4714 static int
4715 formatchar(Py_UNICODE *buf,
4716            size_t buflen,
4717            PyObject *v)
4718 {
4719     /* presume that the buffer is at least 2 characters long */
4720     if (PyUnicode_Check(v)) {
4721         if (PyUnicode_GET_SIZE(v) != 1)
4722             goto onError;
4723         buf[0] = PyUnicode_AS_UNICODE(v)[0];
4724     }
4725
4726     else if (PyString_Check(v)) {
4727         if (PyString_GET_SIZE(v) != 1)
4728             goto onError;
4729         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4730     }
4731
4732     else {
4733         /* Integer input truncated to a character */
4734         long x;
4735         x = PyInt_AsLong(v);
4736         if (x == -1 && PyErr_Occurred())
4737             goto onError;
4738         buf[0] = (char) x;
4739     }
4740     buf[1] = '\0';
4741     return 1;
4742
4743  onError:
4744     PyErr_SetString(PyExc_TypeError,
4745                     "%c requires int or char");
4746     return -1;
4747 }
4748
4749 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4750
4751    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4752    chars are formatted. XXX This is a magic number. Each formatting
4753    routine does bounds checking to ensure no overflow, but a better
4754    solution may be to malloc a buffer of appropriate size for each
4755    format. For now, the current solution is sufficient.
4756 */
/* Fully parenthesized so the cast stays attached to the constant
   wherever the macro is expanded. */
#define FORMATBUFLEN ((size_t)120)
4758
4759 PyObject *PyUnicode_Format(PyObject *format,
4760                            PyObject *args)
4761 {
4762     Py_UNICODE *fmt, *res;
4763     int fmtcnt, rescnt, reslen, arglen, argidx;
4764     int args_owned = 0;
4765     PyUnicodeObject *result = NULL;
4766     PyObject *dict = NULL;
4767     PyObject *uformat;
4768
4769     if (format == NULL || args == NULL) {
4770         PyErr_BadInternalCall();
4771         return NULL;
4772     }
4773     uformat = PyUnicode_FromObject(format);
4774     if (uformat == NULL)
4775         return NULL;
4776     fmt = PyUnicode_AS_UNICODE(uformat);
4777     fmtcnt = PyUnicode_GET_SIZE(uformat);
4778
4779     reslen = rescnt = fmtcnt + 100;
4780     result = _PyUnicode_New(reslen);
4781     if (result == NULL)
4782         goto onError;
4783     res = PyUnicode_AS_UNICODE(result);
4784
4785     if (PyTuple_Check(args)) {
4786         arglen = PyTuple_Size(args);
4787         argidx = 0;
4788     }
4789     else {
4790         arglen = -1;
4791         argidx = -2;
4792     }
4793     if (args->ob_type->tp_as_mapping)
4794         dict = args;
4795
4796     while (--fmtcnt >= 0) {
4797         if (*fmt != '%') {
4798             if (--rescnt < 0) {
4799                 rescnt = fmtcnt + 100;
4800                 reslen += rescnt;
4801                 if (_PyUnicode_Resize(result, reslen) < 0)
4802                     return NULL;
4803                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4804                 --rescnt;
4805             }
4806             *res++ = *fmt++;
4807         }
4808         else {
4809             /* Got a format specifier */
4810             int flags = 0;
4811             int width = -1;
4812             int prec = -1;
4813             int size = 0;
4814             Py_UNICODE c = '\0';
4815             Py_UNICODE fill;
4816             PyObject *v = NULL;
4817             PyObject *temp = NULL;
4818             Py_UNICODE *pbuf;
4819             Py_UNICODE sign;
4820             int len;
4821             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4822
4823             fmt++;
4824             if (*fmt == '(') {
4825                 Py_UNICODE *keystart;
4826                 int keylen;
4827                 PyObject *key;
4828                 int pcount = 1;
4829
4830                 if (dict == NULL) {
4831                     PyErr_SetString(PyExc_TypeError,
4832                                     "format requires a mapping");
4833                     goto onError;
4834                 }
4835                 ++fmt;
4836                 --fmtcnt;
4837                 keystart = fmt;
4838                 /* Skip over balanced parentheses */
4839                 while (pcount > 0 && --fmtcnt >= 0) {
4840                     if (*fmt == ')')
4841                         --pcount;
4842                     else if (*fmt == '(')
4843                         ++pcount;
4844                     fmt++;
4845                 }
4846                 keylen = fmt - keystart - 1;
4847                 if (fmtcnt < 0 || pcount > 0) {
4848                     PyErr_SetString(PyExc_ValueError,
4849                                     "incomplete format key");
4850                     goto onError;
4851                 }
4852                 /* keys are converted to strings using UTF-8 and
4853                    then looked up since Python uses strings to hold
4854                    variables names etc. in its namespaces and we
4855                    wouldn't want to break common idioms. */
4856                 key = PyUnicode_EncodeUTF8(keystart,
4857                                            keylen,
4858                                            NULL);
4859                 if (key == NULL)
4860                     goto onError;
4861                 if (args_owned) {
4862                     Py_DECREF(args);
4863                     args_owned = 0;
4864                 }
4865                 args = PyObject_GetItem(dict, key);
4866                 Py_DECREF(key);
4867                 if (args == NULL) {
4868                     goto onError;
4869                 }
4870                 args_owned = 1;
4871                 arglen = -1;
4872                 argidx = -2;
4873             }
4874             while (--fmtcnt >= 0) {
4875                 switch (c = *fmt++) {
4876                 case '-': flags |= F_LJUST; continue;
4877                 case '+': flags |= F_SIGN; continue;
4878                 case ' ': flags |= F_BLANK; continue;
4879                 case '#': flags |= F_ALT; continue;
4880                 case '0': flags |= F_ZERO; continue;
4881                 }
4882                 break;
4883             }
4884             if (c == '*') {
4885                 v = getnextarg(args, arglen, &argidx);
4886                 if (v == NULL)
4887                     goto onError;
4888                 if (!PyInt_Check(v)) {
4889                     PyErr_SetString(PyExc_TypeError,
4890                                     "* wants int");
4891                     goto onError;
4892                 }
4893                 width = PyInt_AsLong(v);
4894                 if (width < 0) {
4895                     flags |= F_LJUST;
4896                     width = -width;
4897                 }
4898                 if (--fmtcnt >= 0)
4899                     c = *fmt++;
4900             }
4901             else if (c >= '0' && c <= '9') {
4902                 width = c - '0';
4903                 while (--fmtcnt >= 0) {
4904                     c = *fmt++;
4905                     if (c < '0' || c > '9')
4906                         break;
4907                     if ((width*10) / 10 != width) {
4908                         PyErr_SetString(PyExc_ValueError,
4909                                         "width too big");
4910                         goto onError;
4911                     }
4912                     width = width*10 + (c - '0');
4913                 }
4914             }
4915             if (c == '.') {
4916                 prec = 0;
4917                 if (--fmtcnt >= 0)
4918                     c = *fmt++;
4919                 if (c == '*') {
4920                     v = getnextarg(args, arglen, &argidx);
4921                     if (v == NULL)
4922                         goto onError;
4923                     if (!PyInt_Check(v)) {
4924                         PyErr_SetString(PyExc_TypeError,
4925                                         "* wants int");
4926                         goto onError;
4927                     }
4928                     prec = PyInt_AsLong(v);
4929                     if (prec < 0)
4930                         prec = 0;
4931                     if (--fmtcnt >= 0)
4932                         c = *fmt++;
4933                 }
4934                 else if (c >= '0' && c <= '9') {
4935                     prec = c - '0';
4936                     while (--fmtcnt >= 0) {
4937                         c = Py_CHARMASK(*fmt++);
4938                         if (c < '0' || c > '9')
4939                             break;
4940                         if ((prec*10) / 10 != prec) {
4941                             PyErr_SetString(PyExc_ValueError,
4942                                             "prec too big");
4943                             goto onError;
4944                         }
4945                         prec = prec*10 + (c - '0');
4946                     }
4947                 }
4948             } /* prec */
4949             if (fmtcnt >= 0) {
4950                 if (c == 'h' || c == 'l' || c == 'L') {
4951                     size = c;
4952                     if (--fmtcnt >= 0)
4953                         c = *fmt++;
4954                 }
4955             }
4956             if (fmtcnt < 0) {
4957                 PyErr_SetString(PyExc_ValueError,
4958                                 "incomplete format");
4959                 goto onError;
4960             }
4961             if (c != '%') {
4962                 v = getnextarg(args, arglen, &argidx);
4963                 if (v == NULL)
4964                     goto onError;
4965             }
4966             sign = 0;
4967             fill = ' ';
4968             switch (c) {
4969
4970             case '%':
4971                 pbuf = formatbuf;
4972                 /* presume that buffer length is at least 1 */
4973                 pbuf[0] = '%';
4974                 len = 1;
4975                 break;
4976
4977             case 's':
4978             case 'r':
4979                 if (PyUnicode_Check(v) && c == 's') {
4980                     temp = v;
4981                     Py_INCREF(temp);
4982                 }
4983                 else {
4984                     PyObject *unicode;
4985                     if (c == 's')
4986                         temp = PyObject_Str(v);
4987                     else
4988                         temp = PyObject_Repr(v);
4989                     if (temp == NULL)
4990                         goto onError;
4991                     if (!PyString_Check(temp)) {
4992                         /* XXX Note: this should never happen, since
4993                                PyObject_Repr() and PyObject_Str() assure
4994                                this */
4995                         Py_DECREF(temp);
4996                         PyErr_SetString(PyExc_TypeError,
4997                                         "%s argument has non-string str()");
4998                         goto onError;
4999                     }
5000                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5001                                                    PyString_GET_SIZE(temp),
5002                                                NULL,
5003                                                    "strict");
5004                     Py_DECREF(temp);
5005                     temp = unicode;
5006                     if (temp == NULL)
5007                         goto onError;
5008                 }
5009                 pbuf = PyUnicode_AS_UNICODE(temp);
5010                 len = PyUnicode_GET_SIZE(temp);
5011                 if (prec >= 0 && len > prec)
5012                     len = prec;
5013                 break;
5014
5015             case 'i':
5016             case 'd':
5017             case 'u':
5018             case 'o':
5019             case 'x':
5020             case 'X':
5021                 if (c == 'i')
5022                     c = 'd';
5023                 if (PyLong_Check(v) && PyLong_AsLong(v) == -1
5024                     && PyErr_Occurred()) {
5025                     PyErr_Clear();
5026                     temp = formatlong(v, flags, prec, c);
5027                     if (!temp)
5028                         goto onError;
5029                     pbuf = PyUnicode_AS_UNICODE(temp);
5030                     len = PyUnicode_GET_SIZE(temp);
5031                     /* unbounded ints can always produce
5032                        a sign character! */
5033                     sign = 1;
5034                 }
5035                 else {
5036                     pbuf = formatbuf;
5037                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5038                                     flags, prec, c, v);
5039                     if (len < 0)
5040                         goto onError;
5041                     /* only d conversion is signed */
5042                     sign = c == 'd';
5043                 }
5044                 if (flags & F_ZERO)
5045                     fill = '0';
5046                 break;
5047
5048             case 'e':
5049             case 'E':
5050             case 'f':
5051             case 'g':
5052             case 'G':
5053                 pbuf = formatbuf;
5054                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5055                         flags, prec, c, v);
5056                 if (len < 0)
5057                     goto onError;
5058                 sign = 1;
5059                 if (flags & F_ZERO)
5060                     fill = '0';
5061                 break;
5062
5063             case 'c':
5064                 pbuf = formatbuf;
5065                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5066                 if (len < 0)
5067                     goto onError;
5068                 break;
5069
5070             default:
5071                 PyErr_Format(PyExc_ValueError,
5072                              "unsupported format character '%c' (0x%x)",
5073                              c, c);
5074                 goto onError;
5075             }
5076             if (sign) {
5077                 if (*pbuf == '-' || *pbuf == '+') {
5078                     sign = *pbuf++;
5079                     len--;
5080                 }
5081                 else if (flags & F_SIGN)
5082                     sign = '+';
5083                 else if (flags & F_BLANK)
5084                     sign = ' ';
5085                 else
5086                     sign = 0;
5087             }
5088             if (width < len)
5089                 width = len;
5090             if (rescnt < width + (sign != 0)) {
5091                 reslen -= rescnt;
5092                 rescnt = width + fmtcnt + 100;
5093                 reslen += rescnt;
5094                 if (_PyUnicode_Resize(result, reslen) < 0)
5095                     return NULL;
5096                 res = PyUnicode_AS_UNICODE(result)
5097                     + reslen - rescnt;
5098             }
5099             if (sign) {
5100                 if (fill != ' ')
5101                     *res++ = sign;
5102                 rescnt--;
5103                 if (width > len)
5104                     width--;
5105             }
5106             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5107                 assert(pbuf[0] == '0');
5108                 assert(pbuf[1] == c);
5109                 if (fill != ' ') {
5110                     *res++ = *pbuf++;
5111                     *res++ = *pbuf++;
5112                 }
5113                 rescnt -= 2;
5114                 width -= 2;
5115                 if (width < 0)
5116                     width = 0;
5117                 len -= 2;
5118             }
5119             if (width > len && !(flags & F_LJUST)) {
5120                 do {
5121                     --rescnt;
5122                     *res++ = fill;
5123                 } while (--width > len);
5124             }
5125             if (fill == ' ') {
5126                 if (sign)
5127                     *res++ = sign;
5128                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5129                     assert(pbuf[0] == '0');
5130                     assert(pbuf[1] == c);
5131                     *res++ = *pbuf++;
5132                     *res++ = *pbuf++;
5133                 }
5134             }
5135             memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5136             res += len;
5137             rescnt -= len;
5138             while (--width >= len) {
5139                 --rescnt;
5140                 *res++ = ' ';
5141             }
5142             if (dict && (argidx < arglen) && c != '%') {
5143                 PyErr_SetString(PyExc_TypeError,
5144                                 "not all arguments converted");
5145                 goto onError;
5146             }
5147             Py_XDECREF(temp);
5148         } /* '%' */
5149     } /* until end */
5150     if (argidx < arglen && !dict) {
5151         PyErr_SetString(PyExc_TypeError,
5152                         "not all arguments converted");
5153         goto onError;
5154     }
5155
5156     if (args_owned) {
5157         Py_DECREF(args);
5158     }
5159     Py_DECREF(uformat);
5160     if (_PyUnicode_Resize(result, reslen - rescnt))
5161         goto onError;
5162     return (PyObject *)result;
5163
5164  onError:
5165     Py_XDECREF(result);
5166     Py_DECREF(uformat);
5167     if (args_owned) {
5168         Py_DECREF(args);
5169     }
5170     return NULL;
5171 }
5172
5173 static PyBufferProcs unicode_as_buffer = {
5174     (getreadbufferproc) unicode_buffer_getreadbuf,
5175     (getwritebufferproc) unicode_buffer_getwritebuf,
5176     (getsegcountproc) unicode_buffer_getsegcount,
5177     (getcharbufferproc) unicode_buffer_getcharbuf,
5178 };
5179
5180 PyTypeObject PyUnicode_Type = {
5181     PyObject_HEAD_INIT(&PyType_Type)
5182     0,                                  /* ob_size */
5183     "unicode",                          /* tp_name */
5184     sizeof(PyUnicodeObject),            /* tp_size */
5185     0,                                  /* tp_itemsize */
5186     /* Slots */
5187     (destructor)_PyUnicode_Free,        /* tp_dealloc */
5188     0,                                  /* tp_print */
5189     (getattrfunc)unicode_getattr,       /* tp_getattr */
5190     0,                                  /* tp_setattr */
5191     (cmpfunc) unicode_compare,          /* tp_compare */
5192     (reprfunc) unicode_repr,            /* tp_repr */
5193     0,                                  /* tp_as_number */
5194     &unicode_as_sequence,               /* tp_as_sequence */
5195     0,                                  /* tp_as_mapping */
5196     (hashfunc) unicode_hash,            /* tp_hash*/
5197     0,                                  /* tp_call*/
5198     (reprfunc) unicode_str,             /* tp_str */
5199     (getattrofunc) NULL,                /* tp_getattro */
5200     (setattrofunc) NULL,                /* tp_setattro */
5201     &unicode_as_buffer,                 /* tp_as_buffer */
5202     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
5203 };
5204
5205 /* Initialize the Unicode implementation */
5206
5207 void _PyUnicode_Init(void)
5208 {
5209     /* Doublecheck the configuration... */
5210     if (sizeof(Py_UNICODE) != 2)
5211         Py_FatalError("Unicode configuration error: "
5212                       "sizeof(Py_UNICODE) != 2 bytes");
5213
5214     /* Init the implementation */
5215     unicode_freelist = NULL;
5216     unicode_freelist_size = 0;
5217     unicode_empty = _PyUnicode_New(0);
5218     strcpy(unicode_default_encoding, "ascii");
5219 }
5220
5221 /* Finalize the Unicode implementation */
5222
5223 void
5224 _PyUnicode_Fini(void)
5225 {
5226     PyUnicodeObject *u = unicode_freelist;
5227
5228     while (u != NULL) {
5229         PyUnicodeObject *v = u;
5230         u = *(PyUnicodeObject **)u;
5231         if (v->str)
5232             PyMem_DEL(v->str);
5233         Py_XDECREF(v->defenc);
5234         PyObject_DEL(v);
5235     }
5236     unicode_freelist = NULL;
5237     unicode_freelist_size = 0;
5238     Py_XDECREF(unicode_empty);
5239     unicode_empty = NULL;
5240 }