Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WIN32
  45 #include <windows.h>
  46 #endif
  47
  48 /* Limit for the Unicode object free list */
  49
  50 #define MAX_UNICODE_FREELIST_SIZE       1024
  51
  52 /* Limit for the Unicode object free list stay alive optimization.
  53
  54    The implementation will keep allocated Unicode memory intact for
  55    all objects on the free list having a size less than this
  56    limit. This reduces malloc() overhead for small Unicode objects.
  57
  58    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  59    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  60    malloc()-overhead) bytes of unused garbage.
  61
  62    Setting the limit to 0 effectively turns the feature off.
  63
  64    Note: This is an experimental feature ! If you get core dumps when
  65    using Unicode objects, turn this feature off.
  66
  67 */
  68
  69 #define KEEPALIVE_SIZE_LIMIT       9
  70
  71 /* Endianness switches; defaults to little endian */
  72
  73 #ifdef WORDS_BIGENDIAN
  74 # define BYTEORDER_IS_BIG_ENDIAN
  75 #else
  76 # define BYTEORDER_IS_LITTLE_ENDIAN
  77 #endif
  78
  79 /* --- Globals ------------------------------------------------------------
  80
  81    The globals are initialized by the _PyUnicode_Init() API and should
  82    not be used before calling that API.
  83
  84 */
  85
  86 /* The empty Unicode object */
  87 static PyUnicodeObject *unicode_empty;
  88
  89 /* Free list for Unicode objects */
  90 static PyUnicodeObject *unicode_freelist;
  91 static int unicode_freelist_size;
  92
  93 /* Default encoding to use and assume when NULL is passed as encoding
  94    parameter; it is initialized by _PyUnicode_Init().
  95
  96    Always use the PyUnicode_SetDefaultEncoding() and
  97    PyUnicode_GetDefaultEncoding() APIs to access this global.
  98
  99 */
 100
 101 static char unicode_default_encoding[100];
 102
 103 /* --- Unicode Object ----------------------------------------------------- */
 104
 105 static
 106 int _PyUnicode_Resize(register PyUnicodeObject *unicode,
 107                       int length)
 108 {
 109     void *oldstr;
 110
 111     /* Shortcut if there's nothing much to do. */
 112     if (unicode->length == length)
 113         goto reset;
 114
 115     /* Resizing unicode_empty is not allowed. */
 116     if (unicode == unicode_empty) {
 117         PyErr_SetString(PyExc_SystemError,
 118                         "can't resize empty unicode object");
 119         return -1;
 120     }
 121
 122     /* We allocate one more byte to make sure the string is
 123        Ux0000 terminated -- XXX is this needed ? */
 124     oldstr = unicode->str;
 125     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 126     if (!unicode->str) {
 127         unicode->str = oldstr;
 128         PyErr_NoMemory();
 129         return -1;
 130     }
 131     unicode->str[length] = 0;
 132     unicode->length = length;
 133
 134  reset:
 135     /* Reset the object caches */
 136     if (unicode->defenc) {
 137         Py_DECREF(unicode->defenc);
 138         unicode->defenc = NULL;
 139     }
 140     unicode->hash = -1;
 141
 142     return 0;
 143 }
 144
 145 int PyUnicode_Resize(PyObject **unicode,
 146                      int length)
 147 {
 148     PyUnicodeObject *v;
 149
 150     if (unicode == NULL) {
 151         PyErr_BadInternalCall();
 152         return -1;
 153     }
 154     v = (PyUnicodeObject *)*unicode;
 155     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 156         PyErr_BadInternalCall();
 157         return -1;
 158     }
 159     return _PyUnicode_Resize(v, length);
 160 }
 161
 162 /* We allocate one more byte to make sure the string is
 163    Ux0000 terminated -- XXX is this needed ?
 164
 165    XXX This allocator could further be enhanced by assuring that the
 166        free list never reduces its size below 1.
 167
 168 */
 169
 170 static
 171 PyUnicodeObject *_PyUnicode_New(int length)
 172 {
 173     register PyUnicodeObject *unicode;
 174
 175     /* Optimization for empty strings */
 176     if (length == 0 && unicode_empty != NULL) {
 177         Py_INCREF(unicode_empty);
 178         return unicode_empty;
 179     }
 180
 181     /* Unicode freelist & memory allocation */
 182     if (unicode_freelist) {
 183         unicode = unicode_freelist;
 184         unicode_freelist = *(PyUnicodeObject **)unicode;
 185         unicode_freelist_size--;
 186         if (unicode->str) {
 187             /* Keep-Alive optimization: we only upsize the buffer,
 188                never downsize it. */
 189             if ((unicode->length < length) &&
 190                 _PyUnicode_Resize(unicode, length)) {
 191                 PyMem_DEL(unicode->str);
 192                 goto onError;
 193             }
 194         }
 195       else {
 196             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 197       }
 198       PyObject_INIT(unicode, &PyUnicode_Type);
 199     }
 200     else {
 201         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 202         if (unicode == NULL)
 203             return NULL;
 204         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 205     }
 206
 207     if (!unicode->str) {
 208         PyErr_NoMemory();
 209         goto onError;
 210     }
 211     unicode->str[length] = 0;
 212     unicode->length = length;
 213     unicode->hash = -1;
 214     unicode->defenc = NULL;
 215     return unicode;
 216
 217  onError:
 218     _Py_ForgetReference((PyObject *)unicode);
 219     PyObject_DEL(unicode);
 220     return NULL;
 221 }
 222
 223 static
 224 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 225 {
 226     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 227         /* Keep-Alive optimization */
 228         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 229             PyMem_DEL(unicode->str);
 230             unicode->str = NULL;
 231             unicode->length = 0;
 232         }
 233         if (unicode->defenc) {
 234             Py_DECREF(unicode->defenc);
 235             unicode->defenc = NULL;
 236         }
 237         /* Add to free list */
 238         *(PyUnicodeObject **)unicode = unicode_freelist;
 239         unicode_freelist = unicode;
 240         unicode_freelist_size++;
 241     }
 242     else {
 243         PyMem_DEL(unicode->str);
 244         Py_XDECREF(unicode->defenc);
 245         PyObject_DEL(unicode);
 246     }
 247 }
 248
 249 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 250                                 int size)
 251 {
 252     PyUnicodeObject *unicode;
 253
 254     unicode = _PyUnicode_New(size);
 255     if (!unicode)
 256         return NULL;
 257
 258     /* Copy the Unicode data into the new object */
 259     if (u != NULL)
 260         memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
 261
 262     return (PyObject *)unicode;
 263 }
 264
 265 #ifdef HAVE_WCHAR_H
 266
 267 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 268                                  int size)
 269 {
 270     PyUnicodeObject *unicode;
 271
 272     if (w == NULL) {
 273         PyErr_BadInternalCall();
 274         return NULL;
 275     }
 276
 277     unicode = _PyUnicode_New(size);
 278     if (!unicode)
 279         return NULL;
 280
 281     /* Copy the wchar_t data into the new object */
 282 #ifdef HAVE_USABLE_WCHAR_T
 283     memcpy(unicode->str, w, size * sizeof(wchar_t));
 284 #else
 285     {
 286         register Py_UNICODE *u;
 287         register int i;
 288         u = PyUnicode_AS_UNICODE(unicode);
 289         for (i = size; i >= 0; i--)
 290             *u++ = *w++;
 291     }
 292 #endif
 293
 294     return (PyObject *)unicode;
 295 }
 296
 297 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 298                          register wchar_t *w,
 299                          int size)
 300 {
 301     if (unicode == NULL) {
 302         PyErr_BadInternalCall();
 303         return -1;
 304     }
 305     if (size > PyUnicode_GET_SIZE(unicode))
 306         size = PyUnicode_GET_SIZE(unicode);
 307 #ifdef HAVE_USABLE_WCHAR_T
 308     memcpy(w, unicode->str, size * sizeof(wchar_t));
 309 #else
 310     {
 311         register Py_UNICODE *u;
 312         register int i;
 313         u = PyUnicode_AS_UNICODE(unicode);
 314         for (i = size; i >= 0; i--)
 315             *w++ = *u++;
 316     }
 317 #endif
 318
 319     return size;
 320 }
 321
 322 #endif
 323
 324 PyObject *PyUnicode_FromObject(register PyObject *obj)
 325 {
 326     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 327 }
 328
 329 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 330                                       const char *encoding,
 331                                       const char *errors)
 332 {
 333     const char *s;
 334     int len;
 335     int owned = 0;
 336     PyObject *v;
 337
 338     if (obj == NULL) {
 339         PyErr_BadInternalCall();
 340         return NULL;
 341     }
 342
 343     /* Coerce object */
 344     if (PyInstance_Check(obj)) {
 345         PyObject *func;
 346         func = PyObject_GetAttrString(obj, "__str__");
 347         if (func == NULL) {
 348             PyErr_SetString(PyExc_TypeError,
 349                   "coercing to Unicode: instance doesn't define __str__");
 350             return NULL;
 351         }
 352         obj = PyEval_CallObject(func, NULL);
 353         Py_DECREF(func);
 354         if (obj == NULL)
 355             return NULL;
 356         owned = 1;
 357     }
 358     if (PyUnicode_Check(obj)) {
 359         Py_INCREF(obj);
 360         v = obj;
 361         if (encoding) {
 362             PyErr_SetString(PyExc_TypeError,
 363                             "decoding Unicode is not supported");
 364             return NULL;
 365         }
 366         goto done;
 367     }
 368     else if (PyString_Check(obj)) {
 369         s = PyString_AS_STRING(obj);
 370         len = PyString_GET_SIZE(obj);
 371     }
 372     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 373         /* Overwrite the error message with something more useful in
 374            case of a TypeError. */
 375         if (PyErr_ExceptionMatches(PyExc_TypeError))
 376             PyErr_Format(PyExc_TypeError,
 377                          "coercing to Unicode: need string or buffer, "
 378                          "%.80s found",
 379                          obj->ob_type->tp_name);
 380         goto onError;
 381     }
 382
 383     /* Convert to Unicode */
 384     if (len == 0) {
 385         Py_INCREF(unicode_empty);
 386         v = (PyObject *)unicode_empty;
 387     }
 388     else
 389         v = PyUnicode_Decode(s, len, encoding, errors);
 390
 391  done:
 392     if (owned) {
 393         Py_DECREF(obj);
 394     }
 395     return v;
 396
 397  onError:
 398     if (owned) {
 399         Py_DECREF(obj);
 400     }
 401     return NULL;
 402 }
 403
 404 PyObject *PyUnicode_Decode(const char *s,
 405                            int size,
 406                            const char *encoding,
 407                            const char *errors)
 408 {
 409     PyObject *buffer = NULL, *unicode;
 410
 411     if (encoding == NULL)
 412         encoding = PyUnicode_GetDefaultEncoding();
 413
 414     /* Shortcuts for common default encodings */
 415     if (strcmp(encoding, "utf-8") == 0)
 416         return PyUnicode_DecodeUTF8(s, size, errors);
 417     else if (strcmp(encoding, "latin-1") == 0)
 418         return PyUnicode_DecodeLatin1(s, size, errors);
 419     else if (strcmp(encoding, "ascii") == 0)
 420         return PyUnicode_DecodeASCII(s, size, errors);
 421
 422     /* Decode via the codec registry */
 423     buffer = PyBuffer_FromMemory((void *)s, size);
 424     if (buffer == NULL)
 425         goto onError;
 426     unicode = PyCodec_Decode(buffer, encoding, errors);
 427     if (unicode == NULL)
 428         goto onError;
 429     if (!PyUnicode_Check(unicode)) {
 430         PyErr_Format(PyExc_TypeError,
 431                      "decoder did not return an unicode object (type=%.400s)",
 432                      unicode->ob_type->tp_name);
 433         Py_DECREF(unicode);
 434         goto onError;
 435     }
 436     Py_DECREF(buffer);
 437     return unicode;
 438
 439  onError:
 440     Py_XDECREF(buffer);
 441     return NULL;
 442 }
 443
 444 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 445                            int size,
 446                            const char *encoding,
 447                            const char *errors)
 448 {
 449     PyObject *v, *unicode;
 450
 451     unicode = PyUnicode_FromUnicode(s, size);
 452     if (unicode == NULL)
 453         return NULL;
 454     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 455     Py_DECREF(unicode);
 456     return v;
 457 }
 458
 459 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 460                                     const char *encoding,
 461                                     const char *errors)
 462 {
 463     PyObject *v;
 464
 465     if (!PyUnicode_Check(unicode)) {
 466         PyErr_BadArgument();
 467         goto onError;
 468     }
 469
 470     if (encoding == NULL)
 471         encoding = PyUnicode_GetDefaultEncoding();
 472
 473     /* Shortcuts for common default encodings */
 474     if (errors == NULL) {
 475         if (strcmp(encoding, "utf-8") == 0)
 476         return PyUnicode_AsUTF8String(unicode);
 477         else if (strcmp(encoding, "latin-1") == 0)
 478             return PyUnicode_AsLatin1String(unicode);
 479         else if (strcmp(encoding, "ascii") == 0)
 480             return PyUnicode_AsASCIIString(unicode);
 481     }
 482
 483     /* Encode via the codec registry */
 484     v = PyCodec_Encode(unicode, encoding, errors);
 485     if (v == NULL)
 486         goto onError;
 487     /* XXX Should we really enforce this ? */
 488     if (!PyString_Check(v)) {
 489         PyErr_Format(PyExc_TypeError,
 490                      "encoder did not return a string object (type=%.400s)",
 491                      v->ob_type->tp_name);
 492         Py_DECREF(v);
 493         goto onError;
 494     }
 495     return v;
 496
 497  onError:
 498     return NULL;
 499 }
 500
 501 /* Return a Python string holding the default encoded value of the
 502    Unicode object.
 503
 504    The resulting string is cached in the Unicode object for subsequent
 505    usage by this function. The cached version is needed to implement
 506    the character buffer interface and will live (at least) as long as
 507    the Unicode object itself.
 508
 509    The refcount of the string is *not* incremented.
 510
 511    *** Exported for internal use by the interpreter only !!! ***
 512
 513 */
 514
 515 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 516                                             const char *errors)
 517 {
 518     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 519
 520     if (v)
 521         return v;
 522     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 523     if (v && errors == NULL)
 524         ((PyUnicodeObject *)unicode)->defenc = v;
 525     return v;
 526 }
 527
 528 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 529 {
 530     if (!PyUnicode_Check(unicode)) {
 531         PyErr_BadArgument();
 532         goto onError;
 533     }
 534     return PyUnicode_AS_UNICODE(unicode);
 535
 536  onError:
 537     return NULL;
 538 }
 539
 540 int PyUnicode_GetSize(PyObject *unicode)
 541 {
 542     if (!PyUnicode_Check(unicode)) {
 543         PyErr_BadArgument();
 544         goto onError;
 545     }
 546     return PyUnicode_GET_SIZE(unicode);
 547
 548  onError:
 549     return -1;
 550 }
 551
 552 const char *PyUnicode_GetDefaultEncoding(void)
 553 {
 554     return unicode_default_encoding;
 555 }
 556
 557 int PyUnicode_SetDefaultEncoding(const char *encoding)
 558 {
 559     PyObject *v;
 560
 561     /* Make sure the encoding is valid. As side effect, this also
 562        loads the encoding into the codec registry cache. */
 563     v = _PyCodec_Lookup(encoding);
 564     if (v == NULL)
 565         goto onError;
 566     Py_DECREF(v);
 567     strncpy(unicode_default_encoding,
 568             encoding,
 569             sizeof(unicode_default_encoding));
 570     return 0;
 571
 572  onError:
 573     return -1;
 574 }
 575
 576 /* --- UTF-8 Codec -------------------------------------------------------- */
 577
 578 static
 579 char utf8_code_length[256] = {
 580     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 581        illegal prefix.  see RFC 2279 for details */
 582     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 583     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 584     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 585     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 586     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 587     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 588     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 589     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 590     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 591     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 592     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 593     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 594     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 595     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 596     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 597     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 598 };
 599
 600 static
 601 int utf8_decoding_error(const char **source,
 602                         Py_UNICODE **dest,
 603                         const char *errors,
 604                         const char *details)
 605 {
 606     if ((errors == NULL) ||
 607         (strcmp(errors,"strict") == 0)) {
 608         PyErr_Format(PyExc_UnicodeError,
 609                      "UTF-8 decoding error: %.400s",
 610                      details);
 611         return -1;
 612     }
 613     else if (strcmp(errors,"ignore") == 0) {
 614         (*source)++;
 615         return 0;
 616     }
 617     else if (strcmp(errors,"replace") == 0) {
 618         (*source)++;
 619         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 620         (*dest)++;
 621         return 0;
 622     }
 623     else {
 624         PyErr_Format(PyExc_ValueError,
 625                      "UTF-8 decoding error; unknown error handling code: %.400s",
 626                      errors);
 627         return -1;
 628     }
 629 }
 630
 631 PyObject *PyUnicode_DecodeUTF8(const char *s,
 632                                int size,
 633                                const char *errors)
 634 {
 635     int n;
 636     const char *e;
 637     PyUnicodeObject *unicode;
 638     Py_UNICODE *p;
 639     const char *errmsg = "";
 640
 641     /* Note: size will always be longer than the resulting Unicode
 642        character count */
 643     unicode = _PyUnicode_New(size);
 644     if (!unicode)
 645         return NULL;
 646     if (size == 0)
 647         return (PyObject *)unicode;
 648
 649     /* Unpack UTF-8 encoded data */
 650     p = unicode->str;
 651     e = s + size;
 652
 653     while (s < e) {
 654         Py_UCS4 ch = (unsigned char)*s;
 655
 656         if (ch < 0x80) {
 657             *p++ = (Py_UNICODE)ch;
 658             s++;
 659             continue;
 660         }
 661
 662         n = utf8_code_length[ch];
 663
 664         if (s + n > e) {
 665             errmsg = "unexpected end of data";
 666             goto utf8Error;
 667         }
 668
 669         switch (n) {
 670
 671         case 0:
 672             errmsg = "unexpected code byte";
 673             goto utf8Error;
 674             break;
 675
 676         case 1:
 677             errmsg = "internal error";
 678             goto utf8Error;
 679             break;
 680
 681         case 2:
 682             if ((s[1] & 0xc0) != 0x80) {
 683                 errmsg = "invalid data";
 684                 goto utf8Error;
 685             }
 686             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
 687             if (ch < 0x80) {
 688                 errmsg = "illegal encoding";
 689                 goto utf8Error;
 690             }
 691             else
 692                 *p++ = (Py_UNICODE)ch;
 693             break;
 694
 695         case 3:
 696             if ((s[1] & 0xc0) != 0x80 ||
 697                 (s[2] & 0xc0) != 0x80) {
 698                 errmsg = "invalid data";
 699                 goto utf8Error;
 700             }
 701             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
 702             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
 703                 errmsg = "illegal encoding";
 704                 goto utf8Error;
 705             }
 706             else
 707                                 *p++ = (Py_UNICODE)ch;
 708             break;
 709
 710         case 4:
 711             if ((s[1] & 0xc0) != 0x80 ||
 712                 (s[2] & 0xc0) != 0x80 ||
 713                 (s[3] & 0xc0) != 0x80) {
 714                 errmsg = "invalid data";
 715                 goto utf8Error;
 716             }
 717             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
 718                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
 719             /* validate and convert to UTF-16 */
 720             if ((ch < 0x10000) ||   /* minimum value allowed for 4
 721                                        byte encoding */
 722                 (ch > 0x10ffff)) {  /* maximum value allowed for
 723                                        UTF-16 */
 724                 errmsg = "illegal encoding";
 725                 goto utf8Error;
 726             }
 727             /*  compute and append the two surrogates: */
 728
 729             /*  translate from 10000..10FFFF to 0..FFFF */
 730             ch -= 0x10000;
 731
 732             /*  high surrogate = top 10 bits added to D800 */
 733             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
 734
 735             /*  low surrogate = bottom 10 bits added to DC00 */
 736             *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
 737             break;
 738
 739         default:
 740             /* Other sizes are only needed for UCS-4 */
 741             errmsg = "unsupported Unicode code range";
 742             goto utf8Error;
 743             break;
 744         }
 745         s += n;
 746         continue;
 747
 748     utf8Error:
 749       if (utf8_decoding_error(&s, &p, errors, errmsg))
 750           goto onError;
 751     }
 752
 753     /* Adjust length */
 754     if (_PyUnicode_Resize(unicode, p - unicode->str))
 755         goto onError;
 756
 757     return (PyObject *)unicode;
 758
 759 onError:
 760     Py_DECREF(unicode);
 761     return NULL;
 762 }
 763
 764 /* Not used anymore, now that the encoder supports UTF-16
 765    surrogates. */
 766 #if 0
 767 static
 768 int utf8_encoding_error(const Py_UNICODE **source,
 769                         char **dest,
 770                         const char *errors,
 771                         const char *details)
 772 {
 773     if ((errors == NULL) ||
 774         (strcmp(errors,"strict") == 0)) {
 775         PyErr_Format(PyExc_UnicodeError,
 776                      "UTF-8 encoding error: %.400s",
 777                      details);
 778         return -1;
 779     }
 780     else if (strcmp(errors,"ignore") == 0) {
 781         return 0;
 782     }
 783     else if (strcmp(errors,"replace") == 0) {
 784         **dest = '?';
 785         (*dest)++;
 786         return 0;
 787     }
 788     else {
 789         PyErr_Format(PyExc_ValueError,
 790                      "UTF-8 encoding error; "
 791                      "unknown error handling code: %.400s",
 792                      errors);
 793         return -1;
 794     }
 795 }
 796 #endif
 797
 798 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 799                                int size,
 800                                const char *errors)
 801 {
 802     PyObject *v;
 803     char *p;
 804     char *q;
 805     Py_UCS4 ch2;
 806     unsigned int cbAllocated = 3 * size;
 807     unsigned int cbWritten = 0;
 808     int i = 0;
 809
 810     v = PyString_FromStringAndSize(NULL, cbAllocated);
 811     if (v == NULL)
 812         return NULL;
 813     if (size == 0)
 814         return v;
 815
 816     p = q = PyString_AS_STRING(v);
 817     while (i < size) {
 818         Py_UCS4 ch = s[i++];
 819         if (ch < 0x80) {
 820             *p++ = (char) ch;
 821             cbWritten++;
 822         }
 823         else if (ch < 0x0800) {
 824             *p++ = 0xc0 | (ch >> 6);
 825             *p++ = 0x80 | (ch & 0x3f);
 826             cbWritten += 2;
 827         }
 828         else {
 829             /* Check for high surrogate */
 830             if (0xD800 <= ch && ch <= 0xDBFF) {
 831                 if (i != size) {
 832                     ch2 = s[i];
 833                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 834
 835                         if (cbWritten >= (cbAllocated - 4)) {
 836                             /* Provide enough room for some more
 837                                surrogates */
 838                             cbAllocated += 4*10;
 839                             if (_PyString_Resize(&v, cbAllocated))
 840                                 goto onError;
 841                         }
 842
 843                         /* combine the two values */
 844                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
 845
 846                         *p++ = (char)((ch >> 18) | 0xf0);
 847                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
 848                         i++;
 849                         cbWritten += 4;
 850                     }
 851                 }
 852             }
 853             else {
 854                 *p++ = (char)(0xe0 | (ch >> 12));
 855                 cbWritten += 3;
 856             }
 857             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
 858             *p++ = (char)(0x80 | (ch & 0x3f));
 859         }
 860     }
 861     *p = '\0';
 862     if (_PyString_Resize(&v, p - q))
 863         goto onError;
 864     return v;
 865
 866  onError:
 867     Py_DECREF(v);
 868     return NULL;
 869 }
 870
 871 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 872 {
 873     if (!PyUnicode_Check(unicode)) {
 874         PyErr_BadArgument();
 875         return NULL;
 876     }
 877     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
 878                                 PyUnicode_GET_SIZE(unicode),
 879                                 NULL);
 880 }
 881
 882 /* --- UTF-16 Codec ------------------------------------------------------- */
 883
 884 static
 885 int utf16_decoding_error(const Py_UNICODE **source,
 886                          Py_UNICODE **dest,
 887                          const char *errors,
 888                          const char *details)
 889 {
 890     if ((errors == NULL) ||
 891         (strcmp(errors,"strict") == 0)) {
 892         PyErr_Format(PyExc_UnicodeError,
 893                      "UTF-16 decoding error: %.400s",
 894                      details);
 895         return -1;
 896     }
 897     else if (strcmp(errors,"ignore") == 0) {
 898         return 0;
 899     }
 900     else if (strcmp(errors,"replace") == 0) {
 901         if (dest) {
 902             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 903             (*dest)++;
 904         }
 905         return 0;
 906     }
 907     else {
 908         PyErr_Format(PyExc_ValueError,
 909                      "UTF-16 decoding error; "
 910                      "unknown error handling code: %.400s",
 911                      errors);
 912         return -1;
 913     }
 914 }
 915
 916 PyObject *PyUnicode_DecodeUTF16(const char *s,
 917                                 int size,
 918                                 const char *errors,
 919                                 int *byteorder)
 920 {
 921     PyUnicodeObject *unicode;
 922     Py_UNICODE *p;
 923     const Py_UNICODE *q, *e;
 924     int bo = 0;
 925     const char *errmsg = "";
 926
 927     /* size should be an even number */
 928     if (size % sizeof(Py_UNICODE) != 0) {
 929         if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
 930             return NULL;
 931         /* The remaining input chars are ignored if we fall through
 932            here... */
 933     }
 934
 935     /* Note: size will always be longer than the resulting Unicode
 936        character count */
 937     unicode = _PyUnicode_New(size);
 938     if (!unicode)
 939         return NULL;
 940     if (size == 0)
 941         return (PyObject *)unicode;
 942
 943     /* Unpack UTF-16 encoded data */
 944     p = unicode->str;
 945     q = (Py_UNICODE *)s;
 946     e = q + (size / sizeof(Py_UNICODE));
 947
 948     if (byteorder)
 949         bo = *byteorder;
 950
 951     while (q < e) {
 952         register Py_UNICODE ch = *q++;
 953
 954         /* Check for BOM marks (U+FEFF) in the input and adjust
 955            current byte order setting accordingly. Swap input
 956            bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
 957            !) */
 958 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
 959         if (ch == 0xFEFF) {
 960             bo = -1;
 961             continue;
 962         } else if (ch == 0xFFFE) {
 963             bo = 1;
 964             continue;
 965         }
 966         if (bo == 1)
 967             ch = (ch >> 8) | (ch << 8);
 968 #else
 969         if (ch == 0xFEFF) {
 970             bo = 1;
 971             continue;
 972         } else if (ch == 0xFFFE) {
 973             bo = -1;
 974             continue;
 975         }
 976         if (bo == -1)
 977             ch = (ch >> 8) | (ch << 8);
 978 #endif
 979         if (ch < 0xD800 || ch > 0xDFFF) {
 980             *p++ = ch;
 981             continue;
 982         }
 983
 984         /* UTF-16 code pair: */
 985         if (q >= e) {
 986             errmsg = "unexpected end of data";
 987             goto utf16Error;
 988         }
 989         if (0xDC00 <= *q && *q <= 0xDFFF) {
 990             q++;
 991             if (0xD800 <= *q && *q <= 0xDBFF) {
 992                 /* This is valid data (a UTF-16 surrogate pair), but
 993                    we are not able to store this information since our
 994                    Py_UNICODE type only has 16 bits... this might
 995                    change someday, even though it's unlikely. */
 996                 errmsg = "code pairs are not supported";
 997                 goto utf16Error;
 998             }
 999             else
1000                 continue;
1001         }
1002         errmsg = "illegal encoding";
1003         /* Fall through to report the error */
1004
1005     utf16Error:
1006         if (utf16_decoding_error(&q, &p, errors, errmsg))
1007             goto onError;
1008     }
1009
1010     if (byteorder)
1011         *byteorder = bo;
1012
1013     /* Adjust length */
1014     if (_PyUnicode_Resize(unicode, p - unicode->str))
1015         goto onError;
1016
1017     return (PyObject *)unicode;
1018
1019 onError:
1020     Py_DECREF(unicode);
1021     return NULL;
1022 }
1023
1024 #undef UTF16_ERROR
1025
1026 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027                                 int size,
1028                                 const char *errors,
1029                                 int byteorder)
1030 {
1031     PyObject *v;
1032     Py_UNICODE *p;
1033     char *q;
1034
1035     /* We don't create UTF-16 pairs... */
1036     v = PyString_FromStringAndSize(NULL,
1037                         sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038     if (v == NULL)
1039         return NULL;
1040
1041     q = PyString_AS_STRING(v);
1042     p = (Py_UNICODE *)q;
1043     if (byteorder == 0)
1044         *p++ = 0xFEFF;
1045     if (size == 0)
1046         return v;
1047     if (byteorder == 0 ||
1048 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049         byteorder == -1
1050 #else
1051         byteorder == 1
1052 #endif
1053         )
1054         memcpy(p, s, size * sizeof(Py_UNICODE));
1055     else
1056         while (size-- > 0) {
1057             Py_UNICODE ch = *s++;
1058             *p++ = (ch >> 8) | (ch << 8);
1059         }
1060     return v;
1061 }
1062
1063 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064 {
1065     if (!PyUnicode_Check(unicode)) {
1066         PyErr_BadArgument();
1067         return NULL;
1068     }
1069     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070                                  PyUnicode_GET_SIZE(unicode),
1071                                  NULL,
1072                                  0);
1073 }
1074
1075 /* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077 static
1078 int unicodeescape_decoding_error(const char **source,
1079                                  Py_UNICODE *x,
1080                                  const char *errors,
1081                                  const char *details)
1082 {
1083     if ((errors == NULL) ||
1084         (strcmp(errors,"strict") == 0)) {
1085         PyErr_Format(PyExc_UnicodeError,
1086                      "Unicode-Escape decoding error: %.400s",
1087                      details);
1088         return -1;
1089     }
1090     else if (strcmp(errors,"ignore") == 0) {
1091         return 0;
1092     }
1093     else if (strcmp(errors,"replace") == 0) {
1094         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1095         return 0;
1096     }
1097     else {
1098         PyErr_Format(PyExc_ValueError,
1099                      "Unicode-Escape decoding error; "
1100                      "unknown error handling code: %.400s",
1101                      errors);
1102         return -1;
1103     }
1104 }
1105
1106 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1107
1108 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109                                         int size,
1110                                         const char *errors)
1111 {
1112     PyUnicodeObject *v;
1113     Py_UNICODE *p, *buf;
1114     const char *end;
1115     char* message;
1116     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1117
1118     /* Escaped strings will always be longer than the resulting
1119        Unicode string, so we start with size here and then reduce the
1120        length after conversion to the true value. */
1121     v = _PyUnicode_New(size);
1122     if (v == NULL)
1123         goto onError;
1124     if (size == 0)
1125         return (PyObject *)v;
1126
1127     p = buf = PyUnicode_AS_UNICODE(v);
1128     end = s + size;
1129
1130     while (s < end) {
1131         unsigned char c;
1132         Py_UNICODE x;
1133         int i, digits;
1134
1135         /* Non-escape characters are interpreted as Unicode ordinals */
1136         if (*s != '\\') {
1137             *p++ = (unsigned char) *s++;
1138             continue;
1139         }
1140
1141         /* \ - Escapes */
1142         s++;
1143         switch (*s++) {
1144
1145         /* \x escapes */
1146         case '\n': break;
1147         case '\\': *p++ = '\\'; break;
1148         case '\'': *p++ = '\''; break;
1149         case '\"': *p++ = '\"'; break;
1150         case 'b': *p++ = '\b'; break;
1151         case 'f': *p++ = '\014'; break; /* FF */
1152         case 't': *p++ = '\t'; break;
1153         case 'n': *p++ = '\n'; break;
1154         case 'r': *p++ = '\r'; break;
1155         case 'v': *p++ = '\013'; break; /* VT */
1156         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1157
1158         /* \OOO (octal) escapes */
1159         case '0': case '1': case '2': case '3':
1160         case '4': case '5': case '6': case '7':
1161             x = s[-1] - '0';
1162             if ('0' <= *s && *s <= '7') {
1163                 x = (x<<3) + *s++ - '0';
1164                 if ('0' <= *s && *s <= '7')
1165                     x = (x<<3) + *s++ - '0';
1166             }
1167             *p++ = x;
1168             break;
1169
1170         /* hex escapes */
1171         /* \xXX */
1172         case 'x':
1173             digits = 2;
1174             message = "truncated \\xXX escape";
1175             goto hexescape;
1176
1177         /* \uXXXX */
1178         case 'u':
1179             digits = 4;
1180             message = "truncated \\uXXXX escape";
1181             goto hexescape;
1182
1183         /* \UXXXXXXXX */
1184         case 'U':
1185             digits = 8;
1186             message = "truncated \\UXXXXXXXX escape";
1187         hexescape:
1188             chr = 0;
1189             for (i = 0; i < digits; i++) {
1190                 c = (unsigned char) s[i];
1191                 if (!isxdigit(c)) {
1192                     if (unicodeescape_decoding_error(&s, &x, errors, message))
1193                         goto onError;
1194                     chr = x;
1195                     i++;
1196                     break;
1197                 }
1198                 chr = (chr<<4) & ~0xF;
1199                 if (c >= '0' && c <= '9')
1200                     chr += c - '0';
1201                 else if (c >= 'a' && c <= 'f')
1202                     chr += 10 + c - 'a';
1203                 else
1204                     chr += 10 + c - 'A';
1205             }
1206             s += i;
1207         store:
1208             /* when we get here, chr is a 32-bit unicode character */
1209             if (chr <= 0xffff)
1210                 /* UCS-2 character */
1211                 *p++ = (Py_UNICODE) chr;
1212             else if (chr <= 0x10ffff) {
1213                 /* UCS-4 character.  store as two surrogate characters */
1214                 chr -= 0x10000L;
1215                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1216                 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1217             } else {
1218                 if (unicodeescape_decoding_error(
1219                     &s, &x, errors,
1220                     "illegal Unicode character")
1221                     )
1222                     goto onError;
1223                 *p++ = x; /* store replacement character */
1224             }
1225             break;
1226
1227         /* \N{name} */
1228         case 'N':
1229             message = "malformed \\N character escape";
1230             if (ucnhash_CAPI == NULL) {
1231                 /* load the unicode data module */
1232                 PyObject *m, *v;
1233                 m = PyImport_ImportModule("unicodedata");
1234                 if (m == NULL)
1235                     goto ucnhashError;
1236                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1237                 Py_DECREF(m);
1238                 if (v == NULL)
1239                     goto ucnhashError;
1240                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1241                 Py_DECREF(v);
1242                 if (ucnhash_CAPI == NULL)
1243                     goto ucnhashError;
1244             }
1245             if (*s == '{') {
1246                 const char *start = s+1;
1247                 /* look for the closing brace */
1248                 while (*s != '}' && s < end)
1249                     s++;
1250                 if (s > start && s < end && *s == '}') {
1251                     /* found a name.  look it up in the unicode database */
1252                     message = "unknown Unicode character name";
1253                     s++;
1254                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1255                         goto store;
1256                 }
1257             }
1258             if (unicodeescape_decoding_error(&s, &x, errors, message))
1259                 goto onError;
1260             *p++ = x;
1261             break;
1262
1263         default:
1264             *p++ = '\\';
1265             *p++ = (unsigned char)s[-1];
1266             break;
1267         }
1268     }
1269     if (_PyUnicode_Resize(v, (int)(p - buf)))
1270                 goto onError;
1271     return (PyObject *)v;
1272
1273 ucnhashError:
1274     PyErr_SetString(
1275         PyExc_UnicodeError,
1276         "\\N escapes not supported (can't load unicodedata module)"
1277         );
1278     return NULL;
1279
1280 onError:
1281     Py_XDECREF(v);
1282     return NULL;
1283 }
1284
1285 /* Return a Unicode-Escape string version of the Unicode object.
1286
1287    If quotes is true, the string is enclosed in u"" or u'' quotes as
1288    appropriate.
1289
1290 */
1291
1292 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1293                                   int size,
1294                                   Py_UNICODE ch);
1295
1296 static
1297 PyObject *unicodeescape_string(const Py_UNICODE *s,
1298                                int size,
1299                                int quotes)
1300 {
1301     PyObject *repr;
1302     char *p;
1303     char *q;
1304
1305     static const char *hexdigit = "0123456789abcdef";
1306
1307     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1308     if (repr == NULL)
1309         return NULL;
1310
1311     p = q = PyString_AS_STRING(repr);
1312
1313     if (quotes) {
1314         *p++ = 'u';
1315         *p++ = (findchar(s, size, '\'') &&
1316                 !findchar(s, size, '"')) ? '"' : '\'';
1317     }
1318     while (size-- > 0) {
1319         Py_UNICODE ch = *s++;
1320         /* Escape quotes */
1321         if (quotes && (ch == q[1] || ch == '\\')) {
1322             *p++ = '\\';
1323             *p++ = (char) ch;
1324         }
1325         /* Map 16-bit characters to '\uxxxx' */
1326         else if (ch >= 256) {
1327             *p++ = '\\';
1328             *p++ = 'u';
1329             *p++ = hexdigit[(ch >> 12) & 0xf];
1330             *p++ = hexdigit[(ch >> 8) & 0xf];
1331             *p++ = hexdigit[(ch >> 4) & 0xf];
1332             *p++ = hexdigit[ch & 15];
1333         }
1334         /* Map special whitespace to '\t', \n', '\r' */
1335         else if (ch == '\t') {
1336             *p++ = '\\';
1337             *p++ = 't';
1338         }
1339         else if (ch == '\n') {
1340             *p++ = '\\';
1341             *p++ = 'n';
1342         }
1343         else if (ch == '\r') {
1344             *p++ = '\\';
1345             *p++ = 'r';
1346         }
1347         /* Map non-printable US ASCII to '\xhh' */
1348         else if (ch < ' ' || ch >= 128) {
1349             *p++ = '\\';
1350             *p++ = 'x';
1351             *p++ = hexdigit[(ch >> 4) & 0xf];
1352             *p++ = hexdigit[ch & 15];
1353         }
1354         /* Copy everything else as-is */
1355         else
1356             *p++ = (char) ch;
1357     }
1358     if (quotes)
1359         *p++ = q[1];
1360
1361     *p = '\0';
1362     if (_PyString_Resize(&repr, p - q))
1363         goto onError;
1364
1365     return repr;
1366
1367  onError:
1368     Py_DECREF(repr);
1369     return NULL;
1370 }
1371
1372 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1373                                         int size)
1374 {
1375     return unicodeescape_string(s, size, 0);
1376 }
1377
1378 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1379 {
1380     if (!PyUnicode_Check(unicode)) {
1381         PyErr_BadArgument();
1382         return NULL;
1383     }
1384     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1385                                          PyUnicode_GET_SIZE(unicode));
1386 }
1387
1388 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1389
1390 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1391                                            int size,
1392                                            const char *errors)
1393 {
1394     PyUnicodeObject *v;
1395     Py_UNICODE *p, *buf;
1396     const char *end;
1397     const char *bs;
1398
1399     /* Escaped strings will always be longer than the resulting
1400        Unicode string, so we start with size here and then reduce the
1401        length after conversion to the true value. */
1402     v = _PyUnicode_New(size);
1403     if (v == NULL)
1404         goto onError;
1405     if (size == 0)
1406         return (PyObject *)v;
1407     p = buf = PyUnicode_AS_UNICODE(v);
1408     end = s + size;
1409     while (s < end) {
1410         unsigned char c;
1411         Py_UNICODE x;
1412         int i;
1413
1414         /* Non-escape characters are interpreted as Unicode ordinals */
1415         if (*s != '\\') {
1416             *p++ = (unsigned char)*s++;
1417             continue;
1418         }
1419
1420         /* \u-escapes are only interpreted iff the number of leading
1421            backslashes if odd */
1422         bs = s;
1423         for (;s < end;) {
1424             if (*s != '\\')
1425                 break;
1426             *p++ = (unsigned char)*s++;
1427         }
1428         if (((s - bs) & 1) == 0 ||
1429             s >= end ||
1430             *s != 'u') {
1431             continue;
1432         }
1433         p--;
1434         s++;
1435
1436         /* \uXXXX with 4 hex digits */
1437         for (x = 0, i = 0; i < 4; i++) {
1438             c = (unsigned char)s[i];
1439             if (!isxdigit(c)) {
1440                 if (unicodeescape_decoding_error(&s, &x, errors,
1441                                                  "truncated \\uXXXX"))
1442                     goto onError;
1443                 i++;
1444                 break;
1445             }
1446             x = (x<<4) & ~0xF;
1447             if (c >= '0' && c <= '9')
1448                 x += c - '0';
1449             else if (c >= 'a' && c <= 'f')
1450                 x += 10 + c - 'a';
1451             else
1452                 x += 10 + c - 'A';
1453         }
1454         s += i;
1455         *p++ = x;
1456     }
1457     if (_PyUnicode_Resize(v, (int)(p - buf)))
1458         goto onError;
1459     return (PyObject *)v;
1460
1461  onError:
1462     Py_XDECREF(v);
1463     return NULL;
1464 }
1465
1466 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1467                                            int size)
1468 {
1469     PyObject *repr;
1470     char *p;
1471     char *q;
1472
1473     static const char *hexdigit = "0123456789abcdef";
1474
1475     repr = PyString_FromStringAndSize(NULL, 6 * size);
1476     if (repr == NULL)
1477         return NULL;
1478     if (size == 0)
1479         return repr;
1480
1481     p = q = PyString_AS_STRING(repr);
1482     while (size-- > 0) {
1483         Py_UNICODE ch = *s++;
1484         /* Map 16-bit characters to '\uxxxx' */
1485         if (ch >= 256) {
1486             *p++ = '\\';
1487             *p++ = 'u';
1488             *p++ = hexdigit[(ch >> 12) & 0xf];
1489             *p++ = hexdigit[(ch >> 8) & 0xf];
1490             *p++ = hexdigit[(ch >> 4) & 0xf];
1491             *p++ = hexdigit[ch & 15];
1492         }
1493         /* Copy everything else as-is */
1494         else
1495             *p++ = (char) ch;
1496     }
1497     *p = '\0';
1498     if (_PyString_Resize(&repr, p - q))
1499         goto onError;
1500
1501     return repr;
1502
1503  onError:
1504     Py_DECREF(repr);
1505     return NULL;
1506 }
1507
1508 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1509 {
1510     if (!PyUnicode_Check(unicode)) {
1511         PyErr_BadArgument();
1512         return NULL;
1513     }
1514     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1515                                             PyUnicode_GET_SIZE(unicode));
1516 }
1517
1518 /* --- Latin-1 Codec ------------------------------------------------------ */
1519
1520 PyObject *PyUnicode_DecodeLatin1(const char *s,
1521                                  int size,
1522                                  const char *errors)
1523 {
1524     PyUnicodeObject *v;
1525     Py_UNICODE *p;
1526
1527     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1528     v = _PyUnicode_New(size);
1529     if (v == NULL)
1530         goto onError;
1531     if (size == 0)
1532         return (PyObject *)v;
1533     p = PyUnicode_AS_UNICODE(v);
1534     while (size-- > 0)
1535         *p++ = (unsigned char)*s++;
1536     return (PyObject *)v;
1537
1538  onError:
1539     Py_XDECREF(v);
1540     return NULL;
1541 }
1542
1543 static
1544 int latin1_encoding_error(const Py_UNICODE **source,
1545                           char **dest,
1546                           const char *errors,
1547                           const char *details)
1548 {
1549     if ((errors == NULL) ||
1550         (strcmp(errors,"strict") == 0)) {
1551         PyErr_Format(PyExc_UnicodeError,
1552                      "Latin-1 encoding error: %.400s",
1553                      details);
1554         return -1;
1555     }
1556     else if (strcmp(errors,"ignore") == 0) {
1557         return 0;
1558     }
1559     else if (strcmp(errors,"replace") == 0) {
1560         **dest = '?';
1561         (*dest)++;
1562         return 0;
1563     }
1564     else {
1565         PyErr_Format(PyExc_ValueError,
1566                      "Latin-1 encoding error; "
1567                      "unknown error handling code: %.400s",
1568                      errors);
1569         return -1;
1570     }
1571 }
1572
1573 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1574                                  int size,
1575                                  const char *errors)
1576 {
1577     PyObject *repr;
1578     char *s, *start;
1579
1580     repr = PyString_FromStringAndSize(NULL, size);
1581     if (repr == NULL)
1582         return NULL;
1583     if (size == 0)
1584         return repr;
1585
1586     s = PyString_AS_STRING(repr);
1587     start = s;
1588     while (size-- > 0) {
1589         Py_UNICODE ch = *p++;
1590         if (ch >= 256) {
1591             if (latin1_encoding_error(&p, &s, errors,
1592                                       "ordinal not in range(256)"))
1593                 goto onError;
1594         }
1595         else
1596             *s++ = (char)ch;
1597     }
1598     /* Resize if error handling skipped some characters */
1599     if (s - start < PyString_GET_SIZE(repr))
1600         if (_PyString_Resize(&repr, s - start))
1601             goto onError;
1602     return repr;
1603
1604  onError:
1605     Py_DECREF(repr);
1606     return NULL;
1607 }
1608
1609 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1610 {
1611     if (!PyUnicode_Check(unicode)) {
1612         PyErr_BadArgument();
1613         return NULL;
1614     }
1615     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1616                                   PyUnicode_GET_SIZE(unicode),
1617                                   NULL);
1618 }
1619
1620 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1621
1622 static
1623 int ascii_decoding_error(const char **source,
1624                          Py_UNICODE **dest,
1625                          const char *errors,
1626                          const char *details)
1627 {
1628     if ((errors == NULL) ||
1629         (strcmp(errors,"strict") == 0)) {
1630         PyErr_Format(PyExc_UnicodeError,
1631                      "ASCII decoding error: %.400s",
1632                      details);
1633         return -1;
1634     }
1635     else if (strcmp(errors,"ignore") == 0) {
1636         return 0;
1637     }
1638     else if (strcmp(errors,"replace") == 0) {
1639         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1640         (*dest)++;
1641         return 0;
1642     }
1643     else {
1644         PyErr_Format(PyExc_ValueError,
1645                      "ASCII decoding error; "
1646                      "unknown error handling code: %.400s",
1647                      errors);
1648         return -1;
1649     }
1650 }
1651
1652 PyObject *PyUnicode_DecodeASCII(const char *s,
1653                                 int size,
1654                                 const char *errors)
1655 {
1656     PyUnicodeObject *v;
1657     Py_UNICODE *p;
1658
1659     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1660     v = _PyUnicode_New(size);
1661     if (v == NULL)
1662         goto onError;
1663     if (size == 0)
1664         return (PyObject *)v;
1665     p = PyUnicode_AS_UNICODE(v);
1666     while (size-- > 0) {
1667         register unsigned char c;
1668
1669         c = (unsigned char)*s++;
1670         if (c < 128)
1671             *p++ = c;
1672         else if (ascii_decoding_error(&s, &p, errors,
1673                                       "ordinal not in range(128)"))
1674                 goto onError;
1675     }
1676     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1677         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1678             goto onError;
1679     return (PyObject *)v;
1680
1681  onError:
1682     Py_XDECREF(v);
1683     return NULL;
1684 }
1685
1686 static
1687 int ascii_encoding_error(const Py_UNICODE **source,
1688                          char **dest,
1689                          const char *errors,
1690                          const char *details)
1691 {
1692     if ((errors == NULL) ||
1693         (strcmp(errors,"strict") == 0)) {
1694         PyErr_Format(PyExc_UnicodeError,
1695                      "ASCII encoding error: %.400s",
1696                      details);
1697         return -1;
1698     }
1699     else if (strcmp(errors,"ignore") == 0) {
1700         return 0;
1701     }
1702     else if (strcmp(errors,"replace") == 0) {
1703         **dest = '?';
1704         (*dest)++;
1705         return 0;
1706     }
1707     else {
1708         PyErr_Format(PyExc_ValueError,
1709                      "ASCII encoding error; "
1710                      "unknown error handling code: %.400s",
1711                      errors);
1712         return -1;
1713     }
1714 }
1715
1716 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1717                                 int size,
1718                                 const char *errors)
1719 {
1720     PyObject *repr;
1721     char *s, *start;
1722
1723     repr = PyString_FromStringAndSize(NULL, size);
1724     if (repr == NULL)
1725         return NULL;
1726     if (size == 0)
1727         return repr;
1728
1729     s = PyString_AS_STRING(repr);
1730     start = s;
1731     while (size-- > 0) {
1732         Py_UNICODE ch = *p++;
1733         if (ch >= 128) {
1734             if (ascii_encoding_error(&p, &s, errors,
1735                                       "ordinal not in range(128)"))
1736                 goto onError;
1737         }
1738         else
1739             *s++ = (char)ch;
1740     }
1741     /* Resize if error handling skipped some characters */
1742     if (s - start < PyString_GET_SIZE(repr))
1743         if (_PyString_Resize(&repr, s - start))
1744             goto onError;
1745     return repr;
1746
1747  onError:
1748     Py_DECREF(repr);
1749     return NULL;
1750 }
1751
1752 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1753 {
1754     if (!PyUnicode_Check(unicode)) {
1755         PyErr_BadArgument();
1756         return NULL;
1757     }
1758     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1759                                  PyUnicode_GET_SIZE(unicode),
1760                                  NULL);
1761 }
1762
1763 #ifdef MS_WIN32
1764
1765 /* --- MBCS codecs for Windows -------------------------------------------- */
1766
1767 PyObject *PyUnicode_DecodeMBCS(const char *s,
1768                                 int size,
1769                                 const char *errors)
1770 {
1771     PyUnicodeObject *v;
1772     Py_UNICODE *p;
1773
1774     /* First get the size of the result */
1775     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1776     if (size > 0 && usize==0)
1777         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1778
1779     v = _PyUnicode_New(usize);
1780     if (v == NULL)
1781         return NULL;
1782     if (usize == 0)
1783         return (PyObject *)v;
1784     p = PyUnicode_AS_UNICODE(v);
1785     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1786         Py_DECREF(v);
1787         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1788     }
1789
1790     return (PyObject *)v;
1791 }
1792
1793 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1794                                 int size,
1795                                 const char *errors)
1796 {
1797     PyObject *repr;
1798     char *s;
1799     DWORD mbcssize;
1800
1801     /* If there are no characters, bail now! */
1802     if (size==0)
1803             return PyString_FromString("");
1804
1805     /* First get the size of the result */
1806     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1807     if (mbcssize==0)
1808         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1809
1810     repr = PyString_FromStringAndSize(NULL, mbcssize);
1811     if (repr == NULL)
1812         return NULL;
1813     if (mbcssize == 0)
1814         return repr;
1815
1816     /* Do the conversion */
1817     s = PyString_AS_STRING(repr);
1818     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1819         Py_DECREF(repr);
1820         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1821     }
1822     return repr;
1823 }
1824
1825 #endif /* MS_WIN32 */
1826
1827 /* --- Character Mapping Codec -------------------------------------------- */
1828
1829 static
1830 int charmap_decoding_error(const char **source,
1831                          Py_UNICODE **dest,
1832                          const char *errors,
1833                          const char *details)
1834 {
1835     if ((errors == NULL) ||
1836         (strcmp(errors,"strict") == 0)) {
1837         PyErr_Format(PyExc_UnicodeError,
1838                      "charmap decoding error: %.400s",
1839                      details);
1840         return -1;
1841     }
1842     else if (strcmp(errors,"ignore") == 0) {
1843         return 0;
1844     }
1845     else if (strcmp(errors,"replace") == 0) {
1846         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1847         (*dest)++;
1848         return 0;
1849     }
1850     else {
1851         PyErr_Format(PyExc_ValueError,
1852                      "charmap decoding error; "
1853                      "unknown error handling code: %.400s",
1854                      errors);
1855         return -1;
1856     }
1857 }
1858
1859 PyObject *PyUnicode_DecodeCharmap(const char *s,
1860                                   int size,
1861                                   PyObject *mapping,
1862                                   const char *errors)
1863 {
1864     PyUnicodeObject *v;
1865     Py_UNICODE *p;
1866     int extrachars = 0;
1867
1868     /* Default to Latin-1 */
1869     if (mapping == NULL)
1870         return PyUnicode_DecodeLatin1(s, size, errors);
1871
1872     v = _PyUnicode_New(size);
1873     if (v == NULL)
1874         goto onError;
1875     if (size == 0)
1876         return (PyObject *)v;
1877     p = PyUnicode_AS_UNICODE(v);
1878     while (size-- > 0) {
1879         unsigned char ch = *s++;
1880         PyObject *w, *x;
1881
1882         /* Get mapping (char ordinal -> integer, Unicode char or None) */
1883         w = PyInt_FromLong((long)ch);
1884         if (w == NULL)
1885             goto onError;
1886         x = PyObject_GetItem(mapping, w);
1887         Py_DECREF(w);
1888         if (x == NULL) {
1889             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1890                 /* No mapping found means: mapping is undefined. */
1891                 PyErr_Clear();
1892                 x = Py_None;
1893                 Py_INCREF(x);
1894             } else
1895                 goto onError;
1896         }
1897
1898         /* Apply mapping */
1899         if (PyInt_Check(x)) {
1900             long value = PyInt_AS_LONG(x);
1901             if (value < 0 || value > 65535) {
1902                 PyErr_SetString(PyExc_TypeError,
1903                                 "character mapping must be in range(65536)");
1904                 Py_DECREF(x);
1905                 goto onError;
1906             }
1907             *p++ = (Py_UNICODE)value;
1908         }
1909         else if (x == Py_None) {
1910             /* undefined mapping */
1911             if (charmap_decoding_error(&s, &p, errors,
1912                                        "character maps to <undefined>")) {
1913                 Py_DECREF(x);
1914                 goto onError;
1915             }
1916         }
1917         else if (PyUnicode_Check(x)) {
1918             int targetsize = PyUnicode_GET_SIZE(x);
1919
1920             if (targetsize == 1)
1921                 /* 1-1 mapping */
1922                 *p++ = *PyUnicode_AS_UNICODE(x);
1923
1924             else if (targetsize > 1) {
1925                 /* 1-n mapping */
1926                 if (targetsize > extrachars) {
1927                     /* resize first */
1928                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1929                     int needed = (targetsize - extrachars) + \
1930                                  (targetsize << 2);
1931                     extrachars += needed;
1932                     if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
1933                         Py_DECREF(x);
1934                         goto onError;
1935                     }
1936                     p = PyUnicode_AS_UNICODE(v) + oldpos;
1937                 }
1938                 Py_UNICODE_COPY(p,
1939                                 PyUnicode_AS_UNICODE(x),
1940                                 targetsize);
1941                 p += targetsize;
1942                 extrachars -= targetsize;
1943             }
1944             /* 1-0 mapping: skip the character */
1945         }
1946         else {
1947             /* wrong return value */
1948             PyErr_SetString(PyExc_TypeError,
1949                   "character mapping must return integer, None or unicode");
1950             Py_DECREF(x);
1951             goto onError;
1952         }
1953         Py_DECREF(x);
1954     }
1955     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1956         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1957             goto onError;
1958     return (PyObject *)v;
1959
1960  onError:
1961     Py_XDECREF(v);
1962     return NULL;
1963 }
1964
1965 static
1966 int charmap_encoding_error(const Py_UNICODE **source,
1967                            char **dest,
1968                            const char *errors,
1969                            const char *details)
1970 {
1971     if ((errors == NULL) ||
1972         (strcmp(errors,"strict") == 0)) {
1973         PyErr_Format(PyExc_UnicodeError,
1974                      "charmap encoding error: %.400s",
1975                      details);
1976         return -1;
1977     }
1978     else if (strcmp(errors,"ignore") == 0) {
1979         return 0;
1980     }
1981     else if (strcmp(errors,"replace") == 0) {
1982         **dest = '?';
1983         (*dest)++;
1984         return 0;
1985     }
1986     else {
1987         PyErr_Format(PyExc_ValueError,
1988                      "charmap encoding error; "
1989                      "unknown error handling code: %.400s",
1990                      errors);
1991         return -1;
1992     }
1993 }
1994
1995 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1996                                   int size,
1997                                   PyObject *mapping,
1998                                   const char *errors)
1999 {
2000     PyObject *v;
2001     char *s;
2002     int extrachars = 0;
2003
2004     /* Default to Latin-1 */
2005     if (mapping == NULL)
2006         return PyUnicode_EncodeLatin1(p, size, errors);
2007
2008     v = PyString_FromStringAndSize(NULL, size);
2009     if (v == NULL)
2010         return NULL;
2011     if (size == 0)
2012         return v;
2013     s = PyString_AS_STRING(v);
2014     while (size-- > 0) {
2015         Py_UNICODE ch = *p++;
2016         PyObject *w, *x;
2017
2018         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2019         w = PyInt_FromLong((long)ch);
2020         if (w == NULL)
2021             goto onError;
2022         x = PyObject_GetItem(mapping, w);
2023         Py_DECREF(w);
2024         if (x == NULL) {
2025             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2026                 /* No mapping found means: mapping is undefined. */
2027                 PyErr_Clear();
2028                 x = Py_None;
2029                 Py_INCREF(x);
2030             } else
2031                 goto onError;
2032         }
2033
2034         /* Apply mapping */
2035         if (PyInt_Check(x)) {
2036             long value = PyInt_AS_LONG(x);
2037             if (value < 0 || value > 255) {
2038                 PyErr_SetString(PyExc_TypeError,
2039                                 "character mapping must be in range(256)");
2040                 Py_DECREF(x);
2041                 goto onError;
2042             }
2043             *s++ = (char)value;
2044         }
2045         else if (x == Py_None) {
2046             /* undefined mapping */
2047             if (charmap_encoding_error(&p, &s, errors,
2048                                        "character maps to <undefined>")) {
2049                 Py_DECREF(x);
2050                 goto onError;
2051             }
2052         }
2053         else if (PyString_Check(x)) {
2054             int targetsize = PyString_GET_SIZE(x);
2055
2056             if (targetsize == 1)
2057                 /* 1-1 mapping */
2058                 *s++ = *PyString_AS_STRING(x);
2059
2060             else if (targetsize > 1) {
2061                 /* 1-n mapping */
2062                 if (targetsize > extrachars) {
2063                     /* resize first */
2064                     int oldpos = (int)(s - PyString_AS_STRING(v));
2065                     int needed = (targetsize - extrachars) + \
2066                                  (targetsize << 2);
2067                     extrachars += needed;
2068                     if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2069                         Py_DECREF(x);
2070                         goto onError;
2071                     }
2072                     s = PyString_AS_STRING(v) + oldpos;
2073                 }
2074                 memcpy(s,
2075                        PyString_AS_STRING(x),
2076                        targetsize);
2077                 s += targetsize;
2078                 extrachars -= targetsize;
2079             }
2080             /* 1-0 mapping: skip the character */
2081         }
2082         else {
2083             /* wrong return value */
2084             PyErr_SetString(PyExc_TypeError,
2085                   "character mapping must return integer, None or unicode");
2086             Py_DECREF(x);
2087             goto onError;
2088         }
2089         Py_DECREF(x);
2090     }
2091     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2092         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2093             goto onError;
2094     return v;
2095
2096  onError:
2097     Py_DECREF(v);
2098     return NULL;
2099 }
2100
2101 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2102                                     PyObject *mapping)
2103 {
2104     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2105         PyErr_BadArgument();
2106         return NULL;
2107     }
2108     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2109                                    PyUnicode_GET_SIZE(unicode),
2110                                    mapping,
2111                                    NULL);
2112 }
2113
2114 static
2115 int translate_error(const Py_UNICODE **source,
2116                     Py_UNICODE **dest,
2117                     const char *errors,
2118                     const char *details)
2119 {
2120     if ((errors == NULL) ||
2121         (strcmp(errors,"strict") == 0)) {
2122         PyErr_Format(PyExc_UnicodeError,
2123                      "translate error: %.400s",
2124                      details);
2125         return -1;
2126     }
2127     else if (strcmp(errors,"ignore") == 0) {
2128         return 0;
2129     }
2130     else if (strcmp(errors,"replace") == 0) {
2131         **dest = '?';
2132         (*dest)++;
2133         return 0;
2134     }
2135     else {
2136         PyErr_Format(PyExc_ValueError,
2137                      "translate error; "
2138                      "unknown error handling code: %.400s",
2139                      errors);
2140         return -1;
2141     }
2142 }
2143
2144 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2145                                      int size,
2146                                      PyObject *mapping,
2147                                      const char *errors)
2148 {
2149     PyUnicodeObject *v;
2150     Py_UNICODE *p;
2151
2152     if (mapping == NULL) {
2153         PyErr_BadArgument();
2154         return NULL;
2155     }
2156
2157     /* Output will never be longer than input */
2158     v = _PyUnicode_New(size);
2159     if (v == NULL)
2160         goto onError;
2161     if (size == 0)
2162         goto done;
2163     p = PyUnicode_AS_UNICODE(v);
2164     while (size-- > 0) {
2165         Py_UNICODE ch = *s++;
2166         PyObject *w, *x;
2167
2168         /* Get mapping */
2169         w = PyInt_FromLong(ch);
2170         if (w == NULL)
2171             goto onError;
2172         x = PyObject_GetItem(mapping, w);
2173         Py_DECREF(w);
2174         if (x == NULL) {
2175             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2176                 /* No mapping found: default to 1-1 mapping */
2177                 PyErr_Clear();
2178                 *p++ = ch;
2179                 continue;
2180             }
2181             goto onError;
2182         }
2183
2184         /* Apply mapping */
2185         if (PyInt_Check(x))
2186             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2187         else if (x == Py_None) {
2188             /* undefined mapping */
2189             if (translate_error(&s, &p, errors,
2190                                 "character maps to <undefined>")) {
2191                 Py_DECREF(x);
2192                 goto onError;
2193             }
2194         }
2195         else if (PyUnicode_Check(x)) {
2196             if (PyUnicode_GET_SIZE(x) != 1) {
2197                 /* 1-n mapping */
2198                 PyErr_SetString(PyExc_NotImplementedError,
2199                                 "1-n mappings are currently not implemented");
2200                 Py_DECREF(x);
2201                 goto onError;
2202             }
2203             *p++ = *PyUnicode_AS_UNICODE(x);
2204         }
2205         else {
2206             /* wrong return value */
2207             PyErr_SetString(PyExc_TypeError,
2208                   "translate mapping must return integer, None or unicode");
2209             Py_DECREF(x);
2210             goto onError;
2211         }
2212         Py_DECREF(x);
2213     }
2214     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2215         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2216             goto onError;
2217
2218  done:
2219     return (PyObject *)v;
2220
2221  onError:
2222     Py_XDECREF(v);
2223     return NULL;
2224 }
2225
2226 PyObject *PyUnicode_Translate(PyObject *str,
2227                               PyObject *mapping,
2228                               const char *errors)
2229 {
2230     PyObject *result;
2231
2232     str = PyUnicode_FromObject(str);
2233     if (str == NULL)
2234         goto onError;
2235     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2236                                         PyUnicode_GET_SIZE(str),
2237                                         mapping,
2238                                         errors);
2239     Py_DECREF(str);
2240     return result;
2241
2242  onError:
2243     Py_XDECREF(str);
2244     return NULL;
2245 }
2246
2247 /* --- Decimal Encoder ---------------------------------------------------- */
2248
2249 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2250                             int length,
2251                             char *output,
2252                             const char *errors)
2253 {
2254     Py_UNICODE *p, *end;
2255
2256     if (output == NULL) {
2257         PyErr_BadArgument();
2258         return -1;
2259     }
2260
2261     p = s;
2262     end = s + length;
2263     while (p < end) {
2264         register Py_UNICODE ch = *p++;
2265         int decimal;
2266
2267         if (Py_UNICODE_ISSPACE(ch)) {
2268             *output++ = ' ';
2269             continue;
2270         }
2271         decimal = Py_UNICODE_TODECIMAL(ch);
2272         if (decimal >= 0) {
2273             *output++ = '0' + decimal;
2274             continue;
2275         }
2276         if (0 < ch && ch < 256) {
2277             *output++ = (char)ch;
2278             continue;
2279         }
2280         /* All other characters are considered invalid */
2281         if (errors == NULL || strcmp(errors, "strict") == 0) {
2282             PyErr_SetString(PyExc_ValueError,
2283                             "invalid decimal Unicode string");
2284             goto onError;
2285         }
2286         else if (strcmp(errors, "ignore") == 0)
2287             continue;
2288         else if (strcmp(errors, "replace") == 0) {
2289             *output++ = '?';
2290             continue;
2291         }
2292     }
2293     /* 0-terminate the output string */
2294     *output++ = '\0';
2295     return 0;
2296
2297  onError:
2298     return -1;
2299 }
2300
2301 /* --- Helpers ------------------------------------------------------------ */
2302
2303 static
2304 int count(PyUnicodeObject *self,
2305           int start,
2306           int end,
2307           PyUnicodeObject *substring)
2308 {
2309     int count = 0;
2310
2311     if (start < 0)
2312         start += self->length;
2313     if (start < 0)
2314         start = 0;
2315     if (end > self->length)
2316         end = self->length;
2317     if (end < 0)
2318         end += self->length;
2319     if (end < 0)
2320         end = 0;
2321
2322     if (substring->length == 0)
2323         return (end - start + 1);
2324
2325     end -= substring->length;
2326
2327     while (start <= end)
2328         if (Py_UNICODE_MATCH(self, start, substring)) {
2329             count++;
2330             start += substring->length;
2331         } else
2332             start++;
2333
2334     return count;
2335 }
2336
2337 int PyUnicode_Count(PyObject *str,
2338                     PyObject *substr,
2339                     int start,
2340                     int end)
2341 {
2342     int result;
2343
2344     str = PyUnicode_FromObject(str);
2345     if (str == NULL)
2346         return -1;
2347     substr = PyUnicode_FromObject(substr);
2348     if (substr == NULL) {
2349         Py_DECREF(str);
2350         return -1;
2351     }
2352
2353     result = count((PyUnicodeObject *)str,
2354                    start, end,
2355                    (PyUnicodeObject *)substr);
2356
2357     Py_DECREF(str);
2358     Py_DECREF(substr);
2359     return result;
2360 }
2361
2362 static
2363 int findstring(PyUnicodeObject *self,
2364                PyUnicodeObject *substring,
2365                int start,
2366                int end,
2367                int direction)
2368 {
2369     if (start < 0)
2370         start += self->length;
2371     if (start < 0)
2372         start = 0;
2373
2374     if (substring->length == 0)
2375         return start;
2376
2377     if (end > self->length)
2378         end = self->length;
2379     if (end < 0)
2380         end += self->length;
2381     if (end < 0)
2382         end = 0;
2383
2384     end -= substring->length;
2385
2386     if (direction < 0) {
2387         for (; end >= start; end--)
2388             if (Py_UNICODE_MATCH(self, end, substring))
2389                 return end;
2390     } else {
2391         for (; start <= end; start++)
2392             if (Py_UNICODE_MATCH(self, start, substring))
2393                 return start;
2394     }
2395
2396     return -1;
2397 }
2398
2399 int PyUnicode_Find(PyObject *str,
2400                    PyObject *substr,
2401                    int start,
2402                    int end,
2403                    int direction)
2404 {
2405     int result;
2406
2407     str = PyUnicode_FromObject(str);
2408     if (str == NULL)
2409         return -1;
2410     substr = PyUnicode_FromObject(substr);
2411     if (substr == NULL) {
2412         Py_DECREF(substr);
2413         return -1;
2414     }
2415
2416     result = findstring((PyUnicodeObject *)str,
2417                         (PyUnicodeObject *)substr,
2418                         start, end, direction);
2419     Py_DECREF(str);
2420     Py_DECREF(substr);
2421     return result;
2422 }
2423
2424 static
2425 int tailmatch(PyUnicodeObject *self,
2426               PyUnicodeObject *substring,
2427               int start,
2428               int end,
2429               int direction)
2430 {
2431     if (start < 0)
2432         start += self->length;
2433     if (start < 0)
2434         start = 0;
2435
2436     if (substring->length == 0)
2437         return 1;
2438
2439     if (end > self->length)
2440         end = self->length;
2441     if (end < 0)
2442         end += self->length;
2443     if (end < 0)
2444         end = 0;
2445
2446     end -= substring->length;
2447     if (end < start)
2448         return 0;
2449
2450     if (direction > 0) {
2451         if (Py_UNICODE_MATCH(self, end, substring))
2452             return 1;
2453     } else {
2454         if (Py_UNICODE_MATCH(self, start, substring))
2455             return 1;
2456     }
2457
2458     return 0;
2459 }
2460
2461 int PyUnicode_Tailmatch(PyObject *str,
2462                         PyObject *substr,
2463                         int start,
2464                         int end,
2465                         int direction)
2466 {
2467     int result;
2468
2469     str = PyUnicode_FromObject(str);
2470     if (str == NULL)
2471         return -1;
2472     substr = PyUnicode_FromObject(substr);
2473     if (substr == NULL) {
2474         Py_DECREF(substr);
2475         return -1;
2476     }
2477
2478     result = tailmatch((PyUnicodeObject *)str,
2479                        (PyUnicodeObject *)substr,
2480                        start, end, direction);
2481     Py_DECREF(str);
2482     Py_DECREF(substr);
2483     return result;
2484 }
2485
2486 static
2487 const Py_UNICODE *findchar(const Py_UNICODE *s,
2488                      int size,
2489                      Py_UNICODE ch)
2490 {
2491     /* like wcschr, but doesn't stop at NULL characters */
2492
2493     while (size-- > 0) {
2494         if (*s == ch)
2495             return s;
2496         s++;
2497     }
2498
2499     return NULL;
2500 }
2501
2502 /* Apply fixfct filter to the Unicode object self and return a
2503    reference to the modified object */
2504
2505 static
2506 PyObject *fixup(PyUnicodeObject *self,
2507                 int (*fixfct)(PyUnicodeObject *s))
2508 {
2509
2510     PyUnicodeObject *u;
2511
2512     u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2513                                                  self->length);
2514     if (u == NULL)
2515         return NULL;
2516     if (!fixfct(u)) {
2517         /* fixfct should return TRUE if it modified the buffer. If
2518            FALSE, return a reference to the original buffer instead
2519            (to save space, not time) */
2520         Py_INCREF(self);
2521         Py_DECREF(u);
2522         return (PyObject*) self;
2523     }
2524     return (PyObject*) u;
2525 }
2526
2527 static
2528 int fixupper(PyUnicodeObject *self)
2529 {
2530     int len = self->length;
2531     Py_UNICODE *s = self->str;
2532     int status = 0;
2533
2534     while (len-- > 0) {
2535         register Py_UNICODE ch;
2536
2537         ch = Py_UNICODE_TOUPPER(*s);
2538         if (ch != *s) {
2539             status = 1;
2540             *s = ch;
2541         }
2542         s++;
2543     }
2544
2545     return status;
2546 }
2547
2548 static
2549 int fixlower(PyUnicodeObject *self)
2550 {
2551     int len = self->length;
2552     Py_UNICODE *s = self->str;
2553     int status = 0;
2554
2555     while (len-- > 0) {
2556         register Py_UNICODE ch;
2557
2558         ch = Py_UNICODE_TOLOWER(*s);
2559         if (ch != *s) {
2560             status = 1;
2561             *s = ch;
2562         }
2563         s++;
2564     }
2565
2566     return status;
2567 }
2568
2569 static
2570 int fixswapcase(PyUnicodeObject *self)
2571 {
2572     int len = self->length;
2573     Py_UNICODE *s = self->str;
2574     int status = 0;
2575
2576     while (len-- > 0) {
2577         if (Py_UNICODE_ISUPPER(*s)) {
2578             *s = Py_UNICODE_TOLOWER(*s);
2579             status = 1;
2580         } else if (Py_UNICODE_ISLOWER(*s)) {
2581             *s = Py_UNICODE_TOUPPER(*s);
2582             status = 1;
2583         }
2584         s++;
2585     }
2586
2587     return status;
2588 }
2589
2590 static
2591 int fixcapitalize(PyUnicodeObject *self)
2592 {
2593     int len = self->length;
2594     Py_UNICODE *s = self->str;
2595     int status = 0;
2596
2597     if (len == 0)
2598         return 0;
2599     if (Py_UNICODE_ISLOWER(*s)) {
2600         *s = Py_UNICODE_TOUPPER(*s);
2601         status = 1;
2602     }
2603     s++;
2604     while (--len > 0) {
2605         if (Py_UNICODE_ISUPPER(*s)) {
2606             *s = Py_UNICODE_TOLOWER(*s);
2607             status = 1;
2608         }
2609         s++;
2610     }
2611     return status;
2612 }
2613
2614 static
2615 int fixtitle(PyUnicodeObject *self)
2616 {
2617     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2618     register Py_UNICODE *e;
2619     int previous_is_cased;
2620
2621     /* Shortcut for single character strings */
2622     if (PyUnicode_GET_SIZE(self) == 1) {
2623         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2624         if (*p != ch) {
2625             *p = ch;
2626             return 1;
2627         }
2628         else
2629             return 0;
2630     }
2631
2632     e = p + PyUnicode_GET_SIZE(self);
2633     previous_is_cased = 0;
2634     for (; p < e; p++) {
2635         register const Py_UNICODE ch = *p;
2636
2637         if (previous_is_cased)
2638             *p = Py_UNICODE_TOLOWER(ch);
2639         else
2640             *p = Py_UNICODE_TOTITLE(ch);
2641
2642         if (Py_UNICODE_ISLOWER(ch) ||
2643             Py_UNICODE_ISUPPER(ch) ||
2644             Py_UNICODE_ISTITLE(ch))
2645             previous_is_cased = 1;
2646         else
2647             previous_is_cased = 0;
2648     }
2649     return 1;
2650 }
2651
2652 PyObject *PyUnicode_Join(PyObject *separator,
2653                          PyObject *seq)
2654 {
2655     Py_UNICODE *sep;
2656     int seplen;
2657     PyUnicodeObject *res = NULL;
2658     int reslen = 0;
2659     Py_UNICODE *p;
2660     int seqlen = 0;
2661     int sz = 100;
2662     int i;
2663
2664     seqlen = PySequence_Size(seq);
2665     if (seqlen < 0 && PyErr_Occurred())
2666         return NULL;
2667
2668     if (separator == NULL) {
2669         Py_UNICODE blank = ' ';
2670         sep = &blank;
2671         seplen = 1;
2672     }
2673     else {
2674         separator = PyUnicode_FromObject(separator);
2675         if (separator == NULL)
2676             return NULL;
2677         sep = PyUnicode_AS_UNICODE(separator);
2678         seplen = PyUnicode_GET_SIZE(separator);
2679     }
2680
2681     res = _PyUnicode_New(sz);
2682     if (res == NULL)
2683         goto onError;
2684     p = PyUnicode_AS_UNICODE(res);
2685     reslen = 0;
2686
2687     for (i = 0; i < seqlen; i++) {
2688         int itemlen;
2689         PyObject *item;
2690
2691         item = PySequence_GetItem(seq, i);
2692         if (item == NULL)
2693             goto onError;
2694         if (!PyUnicode_Check(item)) {
2695             PyObject *v;
2696             v = PyUnicode_FromObject(item);
2697             Py_DECREF(item);
2698             item = v;
2699             if (item == NULL)
2700                 goto onError;
2701         }
2702         itemlen = PyUnicode_GET_SIZE(item);
2703         while (reslen + itemlen + seplen >= sz) {
2704             if (_PyUnicode_Resize(res, sz*2))
2705                 goto onError;
2706             sz *= 2;
2707             p = PyUnicode_AS_UNICODE(res) + reslen;
2708         }
2709         if (i > 0) {
2710             memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2711             p += seplen;
2712             reslen += seplen;
2713         }
2714         memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2715         p += itemlen;
2716         reslen += itemlen;
2717         Py_DECREF(item);
2718     }
2719     if (_PyUnicode_Resize(res, reslen))
2720         goto onError;
2721
2722     Py_XDECREF(separator);
2723     return (PyObject *)res;
2724
2725  onError:
2726     Py_XDECREF(separator);
2727     Py_DECREF(res);
2728     return NULL;
2729 }
2730
2731 static
2732 PyUnicodeObject *pad(PyUnicodeObject *self,
2733                      int left,
2734                      int right,
2735                      Py_UNICODE fill)
2736 {
2737     PyUnicodeObject *u;
2738
2739     if (left < 0)
2740         left = 0;
2741     if (right < 0)
2742         right = 0;
2743
2744     if (left == 0 && right == 0) {
2745         Py_INCREF(self);
2746         return self;
2747     }
2748
2749     u = _PyUnicode_New(left + self->length + right);
2750     if (u) {
2751         if (left)
2752             Py_UNICODE_FILL(u->str, fill, left);
2753         Py_UNICODE_COPY(u->str + left, self->str, self->length);
2754         if (right)
2755             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2756     }
2757
2758     return u;
2759 }
2760
/* Append the slice data[left:right] to the result list as a new
   Unicode object.

   The macro relies on names supplied by the function using it: a
   `PyObject *str` scratch variable, the target `list`, and an `onError`
   label that is jumped to when creating the object or appending it
   fails.  The temporary reference in `str` is released in either case.

   The expansion consists of several statements and is not wrapped in
   do { } while (0), so it must not be used as the unbraced body of an
   if or loop. */
#define SPLIT_APPEND(data, left, right)                                 \
        str = PyUnicode_FromUnicode(data + left, right - left);         \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        else                                                            \
            Py_DECREF(str);
2771
2772 static
2773 PyObject *split_whitespace(PyUnicodeObject *self,
2774                            PyObject *list,
2775                            int maxcount)
2776 {
2777     register int i;
2778     register int j;
2779     int len = self->length;
2780     PyObject *str;
2781
2782     for (i = j = 0; i < len; ) {
2783         /* find a token */
2784         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2785             i++;
2786         j = i;
2787         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2788             i++;
2789         if (j < i) {
2790             if (maxcount-- <= 0)
2791                 break;
2792             SPLIT_APPEND(self->str, j, i);
2793             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2794                 i++;
2795             j = i;
2796         }
2797     }
2798     if (j < len) {
2799         SPLIT_APPEND(self->str, j, len);
2800     }
2801     return list;
2802
2803  onError:
2804     Py_DECREF(list);
2805     return NULL;
2806 }
2807
2808 PyObject *PyUnicode_Splitlines(PyObject *string,
2809                                int keepends)
2810 {
2811     register int i;
2812     register int j;
2813     int len;
2814     PyObject *list;
2815     PyObject *str;
2816     Py_UNICODE *data;
2817
2818     string = PyUnicode_FromObject(string);
2819     if (string == NULL)
2820         return NULL;
2821     data = PyUnicode_AS_UNICODE(string);
2822     len = PyUnicode_GET_SIZE(string);
2823
2824     list = PyList_New(0);
2825     if (!list)
2826         goto onError;
2827
2828     for (i = j = 0; i < len; ) {
2829         int eol;
2830
2831         /* Find a line and append it */
2832         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2833             i++;
2834
2835         /* Skip the line break reading CRLF as one line break */
2836         eol = i;
2837         if (i < len) {
2838             if (data[i] == '\r' && i + 1 < len &&
2839                 data[i+1] == '\n')
2840                 i += 2;
2841             else
2842                 i++;
2843             if (keepends)
2844                 eol = i;
2845         }
2846         SPLIT_APPEND(data, j, eol);
2847         j = i;
2848     }
2849     if (j < len) {
2850         SPLIT_APPEND(data, j, len);
2851     }
2852
2853     Py_DECREF(string);
2854     return list;
2855
2856  onError:
2857     Py_DECREF(list);
2858     Py_DECREF(string);
2859     return NULL;
2860 }
2861
2862 static
2863 PyObject *split_char(PyUnicodeObject *self,
2864                      PyObject *list,
2865                      Py_UNICODE ch,
2866                      int maxcount)
2867 {
2868     register int i;
2869     register int j;
2870     int len = self->length;
2871     PyObject *str;
2872
2873     for (i = j = 0; i < len; ) {
2874         if (self->str[i] == ch) {
2875             if (maxcount-- <= 0)
2876                 break;
2877             SPLIT_APPEND(self->str, j, i);
2878             i = j = i + 1;
2879         } else
2880             i++;
2881     }
2882     if (j <= len) {
2883         SPLIT_APPEND(self->str, j, len);
2884     }
2885     return list;
2886
2887  onError:
2888     Py_DECREF(list);
2889     return NULL;
2890 }
2891
2892 static
2893 PyObject *split_substring(PyUnicodeObject *self,
2894                           PyObject *list,
2895                           PyUnicodeObject *substring,
2896                           int maxcount)
2897 {
2898     register int i;
2899     register int j;
2900     int len = self->length;
2901     int sublen = substring->length;
2902     PyObject *str;
2903
2904     for (i = j = 0; i <= len - sublen; ) {
2905         if (Py_UNICODE_MATCH(self, i, substring)) {
2906             if (maxcount-- <= 0)
2907                 break;
2908             SPLIT_APPEND(self->str, j, i);
2909             i = j = i + sublen;
2910         } else
2911             i++;
2912     }
2913     if (j <= len) {
2914         SPLIT_APPEND(self->str, j, len);
2915     }
2916     return list;
2917
2918  onError:
2919     Py_DECREF(list);
2920     return NULL;
2921 }
2922
2923 #undef SPLIT_APPEND
2924
2925 static
2926 PyObject *split(PyUnicodeObject *self,
2927                 PyUnicodeObject *substring,
2928                 int maxcount)
2929 {
2930     PyObject *list;
2931
2932     if (maxcount < 0)
2933         maxcount = INT_MAX;
2934
2935     list = PyList_New(0);
2936     if (!list)
2937         return NULL;
2938
2939     if (substring == NULL)
2940         return split_whitespace(self,list,maxcount);
2941
2942     else if (substring->length == 1)
2943         return split_char(self,list,substring->str[0],maxcount);
2944
2945     else if (substring->length == 0) {
2946         Py_DECREF(list);
2947         PyErr_SetString(PyExc_ValueError, "empty separator");
2948         return NULL;
2949     }
2950     else
2951         return split_substring(self,list,substring,maxcount);
2952 }
2953
2954 static
2955 PyObject *strip(PyUnicodeObject *self,
2956                 int left,
2957                 int right)
2958 {
2959     Py_UNICODE *p = self->str;
2960     int start = 0;
2961     int end = self->length;
2962
2963     if (left)
2964         while (start < end && Py_UNICODE_ISSPACE(p[start]))
2965             start++;
2966
2967     if (right)
2968         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2969             end--;
2970
2971     if (start == 0 && end == self->length) {
2972         /* couldn't strip anything off, return original string */
2973         Py_INCREF(self);
2974         return (PyObject*) self;
2975     }
2976
2977     return (PyObject*) PyUnicode_FromUnicode(
2978         self->str + start,
2979         end - start
2980         );
2981 }
2982
2983 static
2984 PyObject *replace(PyUnicodeObject *self,
2985                   PyUnicodeObject *str1,
2986                   PyUnicodeObject *str2,
2987                   int maxcount)
2988 {
2989     PyUnicodeObject *u;
2990
2991     if (maxcount < 0)
2992         maxcount = INT_MAX;
2993
2994     if (str1->length == 1 && str2->length == 1) {
2995         int i;
2996
2997         /* replace characters */
2998         if (!findchar(self->str, self->length, str1->str[0])) {
2999             /* nothing to replace, return original string */
3000             Py_INCREF(self);
3001             u = self;
3002         } else {
3003             Py_UNICODE u1 = str1->str[0];
3004             Py_UNICODE u2 = str2->str[0];
3005
3006             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3007                 self->str,
3008                 self->length
3009                 );
3010             if (u)
3011                 for (i = 0; i < u->length; i++)
3012                     if (u->str[i] == u1) {
3013                         if (--maxcount < 0)
3014                             break;
3015                         u->str[i] = u2;
3016                     }
3017         }
3018
3019     } else {
3020         int n, i;
3021         Py_UNICODE *p;
3022
3023         /* replace strings */
3024         n = count(self, 0, self->length, str1);
3025         if (n > maxcount)
3026             n = maxcount;
3027         if (n == 0) {
3028             /* nothing to replace, return original string */
3029             Py_INCREF(self);
3030             u = self;
3031         } else {
3032             u = _PyUnicode_New(
3033                 self->length + n * (str2->length - str1->length));
3034             if (u) {
3035                 i = 0;
3036                 p = u->str;
3037                 while (i <= self->length - str1->length)
3038                     if (Py_UNICODE_MATCH(self, i, str1)) {
3039                         /* replace string segment */
3040                         Py_UNICODE_COPY(p, str2->str, str2->length);
3041                         p += str2->length;
3042                         i += str1->length;
3043                         if (--n <= 0) {
3044                             /* copy remaining part */
3045                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3046                             break;
3047                         }
3048                     } else
3049                         *p++ = self->str[i++];
3050             }
3051         }
3052     }
3053
3054     return (PyObject *) u;
3055 }
3056
3057 /* --- Unicode Object Methods --------------------------------------------- */
3058
3059 static char title__doc__[] =
3060 "S.title() -> unicode\n\
3061 \n\
3062 Return a titlecased version of S, i.e. words start with title case\n\
3063 characters, all remaining cased characters have lower case.";
3064
3065 static PyObject*
3066 unicode_title(PyUnicodeObject *self, PyObject *args)
3067 {
3068     if (!PyArg_NoArgs(args))
3069         return NULL;
3070     return fixup(self, fixtitle);
3071 }
3072
3073 static char capitalize__doc__[] =
3074 "S.capitalize() -> unicode\n\
3075 \n\
3076 Return a capitalized version of S, i.e. make the first character\n\
3077 have upper case.";
3078
3079 static PyObject*
3080 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3081 {
3082     if (!PyArg_NoArgs(args))
3083         return NULL;
3084     return fixup(self, fixcapitalize);
3085 }
3086
3087 #if 0
3088 static char capwords__doc__[] =
3089 "S.capwords() -> unicode\n\
3090 \n\
3091 Apply .capitalize() to all words in S and return the result with\n\
3092 normalized whitespace (all whitespace strings are replaced by ' ').";
3093
3094 static PyObject*
3095 unicode_capwords(PyUnicodeObject *self, PyObject *args)
3096 {
3097     PyObject *list;
3098     PyObject *item;
3099     int i;
3100
3101     if (!PyArg_NoArgs(args))
3102         return NULL;
3103
3104     /* Split into words */
3105     list = split(self, NULL, -1);
3106     if (!list)
3107         return NULL;
3108
3109     /* Capitalize each word */
3110     for (i = 0; i < PyList_GET_SIZE(list); i++) {
3111         item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3112                      fixcapitalize);
3113         if (item == NULL)
3114             goto onError;
3115         Py_DECREF(PyList_GET_ITEM(list, i));
3116         PyList_SET_ITEM(list, i, item);
3117     }
3118
3119     /* Join the words to form a new string */
3120     item = PyUnicode_Join(NULL, list);
3121
3122 onError:
3123     Py_DECREF(list);
3124     return (PyObject *)item;
3125 }
3126 #endif
3127
3128 static char center__doc__[] =
3129 "S.center(width) -> unicode\n\
3130 \n\
3131 Return S centered in a Unicode string of length width. Padding is done\n\
3132 using spaces.";
3133
3134 static PyObject *
3135 unicode_center(PyUnicodeObject *self, PyObject *args)
3136 {
3137     int marg, left;
3138     int width;
3139
3140     if (!PyArg_ParseTuple(args, "i:center", &width))
3141         return NULL;
3142
3143     if (self->length >= width) {
3144         Py_INCREF(self);
3145         return (PyObject*) self;
3146     }
3147
3148     marg = width - self->length;
3149     left = marg / 2 + (marg & width & 1);
3150
3151     return (PyObject*) pad(self, left, marg - left, ' ');
3152 }
3153
3154 #if 0
3155
3156 /* This code should go into some future Unicode collation support
3157    module. The basic comparison should compare ordinals on a naive
3158    basis (this is what Java does and thus JPython too). */
3159
3160 /* speedy UTF-16 code point order comparison */
3161 /* gleaned from: */
3162 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3163
/* Adjustment table for the UTF-16 comparison below, indexed by the
   top five bits of a code unit (c >> 11).  Slots 0..26 leave the value
   unchanged, slot 27 (0xD800..0xDFFF) adds 0x2000 and slots 28..31
   (0xE000..0xFFFF) subtract 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0,
    0, 0, 0, 0x2000,
    -0x800, -0x800, -0x800, -0x800
};
3171
3172 static int
3173 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3174 {
3175     int len1, len2;
3176
3177     Py_UNICODE *s1 = str1->str;
3178     Py_UNICODE *s2 = str2->str;
3179
3180     len1 = str1->length;
3181     len2 = str2->length;
3182
3183     while (len1 > 0 && len2 > 0) {
3184         Py_UNICODE c1, c2;
3185         long diff;
3186
3187         c1 = *s1++;
3188         c2 = *s2++;
3189         if (c1 > (1<<11) * 26)
3190             c1 += utf16Fixup[c1>>11];
3191         if (c2 > (1<<11) * 26)
3192             c2 += utf16Fixup[c2>>11];
3193
3194         /* now c1 and c2 are in UTF-32-compatible order */
3195         diff = (long)c1 - (long)c2;
3196         if (diff)
3197             return (diff < 0) ? -1 : (diff != 0);
3198         len1--; len2--;
3199     }
3200
3201     return (len1 < len2) ? -1 : (len1 != len2);
3202 }
3203
3204 #else
3205
3206 static int
3207 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3208 {
3209     register int len1, len2;
3210
3211     Py_UNICODE *s1 = str1->str;
3212     Py_UNICODE *s2 = str2->str;
3213
3214     len1 = str1->length;
3215     len2 = str2->length;
3216
3217     while (len1 > 0 && len2 > 0) {
3218         register long diff;
3219
3220         diff = (long)*s1++ - (long)*s2++;
3221         if (diff)
3222             return (diff < 0) ? -1 : (diff != 0);
3223         len1--; len2--;
3224     }
3225
3226     return (len1 < len2) ? -1 : (len1 != len2);
3227 }
3228
3229 #endif
3230
3231 int PyUnicode_Compare(PyObject *left,
3232                       PyObject *right)
3233 {
3234     PyUnicodeObject *u = NULL, *v = NULL;
3235     int result;
3236
3237     /* Coerce the two arguments */
3238     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3239     if (u == NULL)
3240         goto onError;
3241     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3242     if (v == NULL)
3243         goto onError;
3244
3245     /* Shortcut for empty or interned objects */
3246     if (v == u) {
3247         Py_DECREF(u);
3248         Py_DECREF(v);
3249         return 0;
3250     }
3251
3252     result = unicode_compare(u, v);
3253
3254     Py_DECREF(u);
3255     Py_DECREF(v);
3256     return result;
3257
3258 onError:
3259     Py_XDECREF(u);
3260     Py_XDECREF(v);
3261     return -1;
3262 }
3263
3264 int PyUnicode_Contains(PyObject *container,
3265                        PyObject *element)
3266 {
3267     PyUnicodeObject *u = NULL, *v = NULL;
3268     int result;
3269     register const Py_UNICODE *p, *e;
3270     register Py_UNICODE ch;
3271
3272     /* Coerce the two arguments */
3273     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3274     if (v == NULL) {
3275         PyErr_SetString(PyExc_TypeError,
3276             "'in <string>' requires character as left operand");
3277         goto onError;
3278     }
3279     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3280     if (u == NULL) {
3281         Py_DECREF(v);
3282         goto onError;
3283     }
3284
3285     /* Check v in u */
3286     if (PyUnicode_GET_SIZE(v) != 1) {
3287         PyErr_SetString(PyExc_TypeError,
3288             "'in <string>' requires character as left operand");
3289         goto onError;
3290     }
3291     ch = *PyUnicode_AS_UNICODE(v);
3292     p = PyUnicode_AS_UNICODE(u);
3293     e = p + PyUnicode_GET_SIZE(u);
3294     result = 0;
3295     while (p < e) {
3296         if (*p++ == ch) {
3297             result = 1;
3298             break;
3299         }
3300     }
3301
3302     Py_DECREF(u);
3303     Py_DECREF(v);
3304     return result;
3305
3306 onError:
3307     Py_XDECREF(u);
3308     Py_XDECREF(v);
3309     return -1;
3310 }
3311
3312 /* Concat to string or Unicode object giving a new Unicode object. */
3313
3314 PyObject *PyUnicode_Concat(PyObject *left,
3315                            PyObject *right)
3316 {
3317     PyUnicodeObject *u = NULL, *v = NULL, *w;
3318
3319     /* Coerce the two arguments */
3320     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3321     if (u == NULL)
3322         goto onError;
3323     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3324     if (v == NULL)
3325         goto onError;
3326
3327     /* Shortcuts */
3328     if (v == unicode_empty) {
3329         Py_DECREF(v);
3330         return (PyObject *)u;
3331     }
3332     if (u == unicode_empty) {
3333         Py_DECREF(u);
3334         return (PyObject *)v;
3335     }
3336
3337     /* Concat the two Unicode strings */
3338     w = _PyUnicode_New(u->length + v->length);
3339     if (w == NULL)
3340         goto onError;
3341     Py_UNICODE_COPY(w->str, u->str, u->length);
3342     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3343
3344     Py_DECREF(u);
3345     Py_DECREF(v);
3346     return (PyObject *)w;
3347
3348 onError:
3349     Py_XDECREF(u);
3350     Py_XDECREF(v);
3351     return NULL;
3352 }
3353
3354 static char count__doc__[] =
3355 "S.count(sub[, start[, end]]) -> int\n\
3356 \n\
3357 Return the number of occurrences of substring sub in Unicode string\n\
3358 S[start:end].  Optional arguments start and end are\n\
3359 interpreted as in slice notation.";
3360
3361 static PyObject *
3362 unicode_count(PyUnicodeObject *self, PyObject *args)
3363 {
3364     PyUnicodeObject *substring;
3365     int start = 0;
3366     int end = INT_MAX;
3367     PyObject *result;
3368
3369     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3370                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3371         return NULL;
3372
3373     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3374                                                 (PyObject *)substring);
3375     if (substring == NULL)
3376         return NULL;
3377
3378     if (start < 0)
3379         start += self->length;
3380     if (start < 0)
3381         start = 0;
3382     if (end > self->length)
3383         end = self->length;
3384     if (end < 0)
3385         end += self->length;
3386     if (end < 0)
3387         end = 0;
3388
3389     result = PyInt_FromLong((long) count(self, start, end, substring));
3390
3391     Py_DECREF(substring);
3392     return result;
3393 }
3394
3395 static char encode__doc__[] =
3396 "S.encode([encoding[,errors]]) -> string\n\
3397 \n\
3398 Return an encoded string version of S. Default encoding is the current\n\
3399 default string encoding. errors may be given to set a different error\n\
3400 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3401 a ValueError. Other possible values are 'ignore' and 'replace'.";
3402
3403 static PyObject *
3404 unicode_encode(PyUnicodeObject *self, PyObject *args)
3405 {
3406     char *encoding = NULL;
3407     char *errors = NULL;
3408     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3409         return NULL;
3410     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3411 }
3412
3413 static char expandtabs__doc__[] =
3414 "S.expandtabs([tabsize]) -> unicode\n\
3415 \n\
3416 Return a copy of S where all tab characters are expanded using spaces.\n\
3417 If tabsize is not given, a tab size of 8 characters is assumed.";
3418
3419 static PyObject*
3420 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3421 {
3422     Py_UNICODE *e;
3423     Py_UNICODE *p;
3424     Py_UNICODE *q;
3425     int i, j;
3426     PyUnicodeObject *u;
3427     int tabsize = 8;
3428
3429     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3430         return NULL;
3431
3432     /* First pass: determine size of output string */
3433     i = j = 0;
3434     e = self->str + self->length;
3435     for (p = self->str; p < e; p++)
3436         if (*p == '\t') {
3437             if (tabsize > 0)
3438                 j += tabsize - (j % tabsize);
3439         }
3440         else {
3441             j++;
3442             if (*p == '\n' || *p == '\r') {
3443                 i += j;
3444                 j = 0;
3445             }
3446         }
3447
3448     /* Second pass: create output string and fill it */
3449     u = _PyUnicode_New(i + j);
3450     if (!u)
3451         return NULL;
3452
3453     j = 0;
3454     q = u->str;
3455
3456     for (p = self->str; p < e; p++)
3457         if (*p == '\t') {
3458             if (tabsize > 0) {
3459                 i = tabsize - (j % tabsize);
3460                 j += i;
3461                 while (i--)
3462                     *q++ = ' ';
3463             }
3464         }
3465         else {
3466             j++;
3467             *q++ = *p;
3468             if (*p == '\n' || *p == '\r')
3469                 j = 0;
3470         }
3471
3472     return (PyObject*) u;
3473 }
3474
3475 static char find__doc__[] =
3476 "S.find(sub [,start [,end]]) -> int\n\
3477 \n\
3478 Return the lowest index in S where substring sub is found,\n\
3479 such that sub is contained within s[start,end].  Optional\n\
3480 arguments start and end are interpreted as in slice notation.\n\
3481 \n\
3482 Return -1 on failure.";
3483
3484 static PyObject *
3485 unicode_find(PyUnicodeObject *self, PyObject *args)
3486 {
3487     PyUnicodeObject *substring;
3488     int start = 0;
3489     int end = INT_MAX;
3490     PyObject *result;
3491
3492     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3493                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3494         return NULL;
3495     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3496                                                 (PyObject *)substring);
3497     if (substring == NULL)
3498         return NULL;
3499
3500     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3501
3502     Py_DECREF(substring);
3503     return result;
3504 }
3505
3506 static PyObject *
3507 unicode_getitem(PyUnicodeObject *self, int index)
3508 {
3509     if (index < 0 || index >= self->length) {
3510         PyErr_SetString(PyExc_IndexError, "string index out of range");
3511         return NULL;
3512     }
3513
3514     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3515 }
3516
3517 static long
3518 unicode_hash(PyUnicodeObject *self)
3519 {
3520     /* Since Unicode objects compare equal to their ASCII string
3521        counterparts, they should use the individual character values
3522        as basis for their hash value.  This is needed to assure that
3523        strings and Unicode objects behave in the same way as
3524        dictionary keys. */
3525
3526     register int len;
3527     register Py_UNICODE *p;
3528     register long x;
3529
3530     if (self->hash != -1)
3531         return self->hash;
3532     len = PyUnicode_GET_SIZE(self);
3533     p = PyUnicode_AS_UNICODE(self);
3534     x = *p << 7;
3535     while (--len >= 0)
3536         x = (1000003*x) ^ *p++;
3537     x ^= PyUnicode_GET_SIZE(self);
3538     if (x == -1)
3539         x = -2;
3540     self->hash = x;
3541     return x;
3542 }
3543
3544 static char index__doc__[] =
3545 "S.index(sub [,start [,end]]) -> int\n\
3546 \n\
3547 Like S.find() but raise ValueError when the substring is not found.";
3548
3549 static PyObject *
3550 unicode_index(PyUnicodeObject *self, PyObject *args)
3551 {
3552     int result;
3553     PyUnicodeObject *substring;
3554     int start = 0;
3555     int end = INT_MAX;
3556
3557     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3558                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3559         return NULL;
3560
3561     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3562                                                 (PyObject *)substring);
3563     if (substring == NULL)
3564         return NULL;
3565
3566     result = findstring(self, substring, start, end, 1);
3567
3568     Py_DECREF(substring);
3569     if (result < 0) {
3570         PyErr_SetString(PyExc_ValueError, "substring not found");
3571         return NULL;
3572     }
3573     return PyInt_FromLong(result);
3574 }
3575
3576 static char islower__doc__[] =
3577 "S.islower() -> int\n\
3578 \n\
3579 Return 1 if  all cased characters in S are lowercase and there is\n\
3580 at least one cased character in S, 0 otherwise.";
3581
3582 static PyObject*
3583 unicode_islower(PyUnicodeObject *self, PyObject *args)
3584 {
3585     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3586     register const Py_UNICODE *e;
3587     int cased;
3588
3589     if (!PyArg_NoArgs(args))
3590         return NULL;
3591
3592     /* Shortcut for single character strings */
3593     if (PyUnicode_GET_SIZE(self) == 1)
3594         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3595
3596     /* Special case for empty strings */
3597     if (PyString_GET_SIZE(self) == 0)
3598         return PyInt_FromLong(0);
3599
3600     e = p + PyUnicode_GET_SIZE(self);
3601     cased = 0;
3602     for (; p < e; p++) {
3603         register const Py_UNICODE ch = *p;
3604
3605         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3606             return PyInt_FromLong(0);
3607         else if (!cased && Py_UNICODE_ISLOWER(ch))
3608             cased = 1;
3609     }
3610     return PyInt_FromLong(cased);
3611 }
3612
3613 static char isupper__doc__[] =
3614 "S.isupper() -> int\n\
3615 \n\
3616 Return 1 if  all cased characters in S are uppercase and there is\n\
3617 at least one cased character in S, 0 otherwise.";
3618
3619 static PyObject*
3620 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3621 {
3622     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3623     register const Py_UNICODE *e;
3624     int cased;
3625
3626     if (!PyArg_NoArgs(args))
3627         return NULL;
3628
3629     /* Shortcut for single character strings */
3630     if (PyUnicode_GET_SIZE(self) == 1)
3631         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3632
3633     /* Special case for empty strings */
3634     if (PyString_GET_SIZE(self) == 0)
3635         return PyInt_FromLong(0);
3636
3637     e = p + PyUnicode_GET_SIZE(self);
3638     cased = 0;
3639     for (; p < e; p++) {
3640         register const Py_UNICODE ch = *p;
3641
3642         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3643             return PyInt_FromLong(0);
3644         else if (!cased && Py_UNICODE_ISUPPER(ch))
3645             cased = 1;
3646     }
3647     return PyInt_FromLong(cased);
3648 }
3649
3650 static char istitle__doc__[] =
3651 "S.istitle() -> int\n\
3652 \n\
3653 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3654 may only follow uncased characters and lowercase characters only cased\n\
3655 ones. Return 0 otherwise.";
3656
3657 static PyObject*
3658 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3659 {
3660     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3661     register const Py_UNICODE *e;
3662     int cased, previous_is_cased;
3663
3664     if (!PyArg_NoArgs(args))
3665         return NULL;
3666
3667     /* Shortcut for single character strings */
3668     if (PyUnicode_GET_SIZE(self) == 1)
3669         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3670                               (Py_UNICODE_ISUPPER(*p) != 0));
3671
3672     /* Special case for empty strings */
3673     if (PyString_GET_SIZE(self) == 0)
3674         return PyInt_FromLong(0);
3675
3676     e = p + PyUnicode_GET_SIZE(self);
3677     cased = 0;
3678     previous_is_cased = 0;
3679     for (; p < e; p++) {
3680         register const Py_UNICODE ch = *p;
3681
3682         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3683             if (previous_is_cased)
3684                 return PyInt_FromLong(0);
3685             previous_is_cased = 1;
3686             cased = 1;
3687         }
3688         else if (Py_UNICODE_ISLOWER(ch)) {
3689             if (!previous_is_cased)
3690                 return PyInt_FromLong(0);
3691             previous_is_cased = 1;
3692             cased = 1;
3693         }
3694         else
3695             previous_is_cased = 0;
3696     }
3697     return PyInt_FromLong(cased);
3698 }
3699
3700 static char isspace__doc__[] =
3701 "S.isspace() -> int\n\
3702 \n\
3703 Return 1 if there are only whitespace characters in S,\n\
3704 0 otherwise.";
3705
3706 static PyObject*
3707 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3708 {
3709     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3710     register const Py_UNICODE *e;
3711
3712     if (!PyArg_NoArgs(args))
3713         return NULL;
3714
3715     /* Shortcut for single character strings */
3716     if (PyUnicode_GET_SIZE(self) == 1 &&
3717         Py_UNICODE_ISSPACE(*p))
3718         return PyInt_FromLong(1);
3719
3720     /* Special case for empty strings */
3721     if (PyString_GET_SIZE(self) == 0)
3722         return PyInt_FromLong(0);
3723
3724     e = p + PyUnicode_GET_SIZE(self);
3725     for (; p < e; p++) {
3726         if (!Py_UNICODE_ISSPACE(*p))
3727             return PyInt_FromLong(0);
3728     }
3729     return PyInt_FromLong(1);
3730 }
3731
3732 static char isalpha__doc__[] =
3733 "S.isalpha() -> int\n\
3734 \n\
3735 Return 1 if  all characters in S are alphabetic\n\
3736 and there is at least one character in S, 0 otherwise.";
3737
3738 static PyObject*
3739 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3740 {
3741     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3742     register const Py_UNICODE *e;
3743
3744     if (!PyArg_NoArgs(args))
3745         return NULL;
3746
3747     /* Shortcut for single character strings */
3748     if (PyUnicode_GET_SIZE(self) == 1 &&
3749         Py_UNICODE_ISALPHA(*p))
3750         return PyInt_FromLong(1);
3751
3752     /* Special case for empty strings */
3753     if (PyString_GET_SIZE(self) == 0)
3754         return PyInt_FromLong(0);
3755
3756     e = p + PyUnicode_GET_SIZE(self);
3757     for (; p < e; p++) {
3758         if (!Py_UNICODE_ISALPHA(*p))
3759             return PyInt_FromLong(0);
3760     }
3761     return PyInt_FromLong(1);
3762 }
3763
3764 static char isalnum__doc__[] =
3765 "S.isalnum() -> int\n\
3766 \n\
3767 Return 1 if  all characters in S are alphanumeric\n\
3768 and there is at least one character in S, 0 otherwise.";
3769
3770 static PyObject*
3771 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3772 {
3773     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3774     register const Py_UNICODE *e;
3775
3776     if (!PyArg_NoArgs(args))
3777         return NULL;
3778
3779     /* Shortcut for single character strings */
3780     if (PyUnicode_GET_SIZE(self) == 1 &&
3781         Py_UNICODE_ISALNUM(*p))
3782         return PyInt_FromLong(1);
3783
3784     /* Special case for empty strings */
3785     if (PyString_GET_SIZE(self) == 0)
3786         return PyInt_FromLong(0);
3787
3788     e = p + PyUnicode_GET_SIZE(self);
3789     for (; p < e; p++) {
3790         if (!Py_UNICODE_ISALNUM(*p))
3791             return PyInt_FromLong(0);
3792     }
3793     return PyInt_FromLong(1);
3794 }
3795
3796 static char isdecimal__doc__[] =
3797 "S.isdecimal() -> int\n\
3798 \n\
3799 Return 1 if there are only decimal characters in S,\n\
3800 0 otherwise.";
3801
3802 static PyObject*
3803 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3804 {
3805     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3806     register const Py_UNICODE *e;
3807
3808     if (!PyArg_NoArgs(args))
3809         return NULL;
3810
3811     /* Shortcut for single character strings */
3812     if (PyUnicode_GET_SIZE(self) == 1 &&
3813         Py_UNICODE_ISDECIMAL(*p))
3814         return PyInt_FromLong(1);
3815
3816     /* Special case for empty strings */
3817     if (PyString_GET_SIZE(self) == 0)
3818         return PyInt_FromLong(0);
3819
3820     e = p + PyUnicode_GET_SIZE(self);
3821     for (; p < e; p++) {
3822         if (!Py_UNICODE_ISDECIMAL(*p))
3823             return PyInt_FromLong(0);
3824     }
3825     return PyInt_FromLong(1);
3826 }
3827
3828 static char isdigit__doc__[] =
3829 "S.isdigit() -> int\n\
3830 \n\
3831 Return 1 if there are only digit characters in S,\n\
3832 0 otherwise.";
3833
3834 static PyObject*
3835 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3836 {
3837     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3838     register const Py_UNICODE *e;
3839
3840     if (!PyArg_NoArgs(args))
3841         return NULL;
3842
3843     /* Shortcut for single character strings */
3844     if (PyUnicode_GET_SIZE(self) == 1 &&
3845         Py_UNICODE_ISDIGIT(*p))
3846         return PyInt_FromLong(1);
3847
3848     /* Special case for empty strings */
3849     if (PyString_GET_SIZE(self) == 0)
3850         return PyInt_FromLong(0);
3851
3852     e = p + PyUnicode_GET_SIZE(self);
3853     for (; p < e; p++) {
3854         if (!Py_UNICODE_ISDIGIT(*p))
3855             return PyInt_FromLong(0);
3856     }
3857     return PyInt_FromLong(1);
3858 }
3859
3860 static char isnumeric__doc__[] =
3861 "S.isnumeric() -> int\n\
3862 \n\
3863 Return 1 if there are only numeric characters in S,\n\
3864 0 otherwise.";
3865
3866 static PyObject*
3867 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3868 {
3869     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3870     register const Py_UNICODE *e;
3871
3872     if (!PyArg_NoArgs(args))
3873         return NULL;
3874
3875     /* Shortcut for single character strings */
3876     if (PyUnicode_GET_SIZE(self) == 1 &&
3877         Py_UNICODE_ISNUMERIC(*p))
3878         return PyInt_FromLong(1);
3879
3880     /* Special case for empty strings */
3881     if (PyString_GET_SIZE(self) == 0)
3882         return PyInt_FromLong(0);
3883
3884     e = p + PyUnicode_GET_SIZE(self);
3885     for (; p < e; p++) {
3886         if (!Py_UNICODE_ISNUMERIC(*p))
3887             return PyInt_FromLong(0);
3888     }
3889     return PyInt_FromLong(1);
3890 }
3891
3892 static char join__doc__[] =
3893 "S.join(sequence) -> unicode\n\
3894 \n\
3895 Return a string which is the concatenation of the strings in the\n\
3896 sequence.  The separator between elements is S.";
3897
3898 static PyObject*
3899 unicode_join(PyUnicodeObject *self, PyObject *args)
3900 {
3901     PyObject *data;
3902     if (!PyArg_ParseTuple(args, "O:join", &data))
3903         return NULL;
3904
3905     return PyUnicode_Join((PyObject *)self, data);
3906 }
3907
3908 static int
3909 unicode_length(PyUnicodeObject *self)
3910 {
3911     return self->length;
3912 }
3913
3914 static char ljust__doc__[] =
3915 "S.ljust(width) -> unicode\n\
3916 \n\
3917 Return S left justified in a Unicode string of length width. Padding is\n\
3918 done using spaces.";
3919
3920 static PyObject *
3921 unicode_ljust(PyUnicodeObject *self, PyObject *args)
3922 {
3923     int width;
3924     if (!PyArg_ParseTuple(args, "i:ljust", &width))
3925         return NULL;
3926
3927     if (self->length >= width) {
3928         Py_INCREF(self);
3929         return (PyObject*) self;
3930     }
3931
3932     return (PyObject*) pad(self, 0, width - self->length, ' ');
3933 }
3934
3935 static char lower__doc__[] =
3936 "S.lower() -> unicode\n\
3937 \n\
3938 Return a copy of the string S converted to lowercase.";
3939
3940 static PyObject*
3941 unicode_lower(PyUnicodeObject *self, PyObject *args)
3942 {
3943     if (!PyArg_NoArgs(args))
3944         return NULL;
3945     return fixup(self, fixlower);
3946 }
3947
3948 static char lstrip__doc__[] =
3949 "S.lstrip() -> unicode\n\
3950 \n\
3951 Return a copy of the string S with leading whitespace removed.";
3952
3953 static PyObject *
3954 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3955 {
3956     if (!PyArg_NoArgs(args))
3957         return NULL;
3958     return strip(self, 1, 0);
3959 }
3960
3961 static PyObject*
3962 unicode_repeat(PyUnicodeObject *str, int len)
3963 {
3964     PyUnicodeObject *u;
3965     Py_UNICODE *p;
3966     int nchars;
3967     size_t nbytes;
3968
3969     if (len < 0)
3970         len = 0;
3971
3972     if (len == 1) {
3973         /* no repeat, return original string */
3974         Py_INCREF(str);
3975         return (PyObject*) str;
3976     }
3977
3978     /* ensure # of chars needed doesn't overflow int and # of bytes
3979      * needed doesn't overflow size_t
3980      */
3981     nchars = len * str->length;
3982     if (len && nchars / len != str->length) {
3983         PyErr_SetString(PyExc_OverflowError,
3984                         "repeated string is too long");
3985         return NULL;
3986     }
3987     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
3988     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
3989         PyErr_SetString(PyExc_OverflowError,
3990                         "repeated string is too long");
3991         return NULL;
3992     }
3993     u = _PyUnicode_New(nchars);
3994     if (!u)
3995         return NULL;
3996
3997     p = u->str;
3998
3999     while (len-- > 0) {
4000         Py_UNICODE_COPY(p, str->str, str->length);
4001         p += str->length;
4002     }
4003
4004     return (PyObject*) u;
4005 }
4006
4007 PyObject *PyUnicode_Replace(PyObject *obj,
4008                             PyObject *subobj,
4009                             PyObject *replobj,
4010                             int maxcount)
4011 {
4012     PyObject *self;
4013     PyObject *str1;
4014     PyObject *str2;
4015     PyObject *result;
4016
4017     self = PyUnicode_FromObject(obj);
4018     if (self == NULL)
4019         return NULL;
4020     str1 = PyUnicode_FromObject(subobj);
4021     if (str1 == NULL) {
4022         Py_DECREF(self);
4023         return NULL;
4024     }
4025     str2 = PyUnicode_FromObject(replobj);
4026     if (str2 == NULL) {
4027         Py_DECREF(self);
4028         Py_DECREF(str1);
4029         return NULL;
4030     }
4031     result = replace((PyUnicodeObject *)self,
4032                      (PyUnicodeObject *)str1,
4033                      (PyUnicodeObject *)str2,
4034                      maxcount);
4035     Py_DECREF(self);
4036     Py_DECREF(str1);
4037     Py_DECREF(str2);
4038     return result;
4039 }
4040
4041 static char replace__doc__[] =
4042 "S.replace (old, new[, maxsplit]) -> unicode\n\
4043 \n\
4044 Return a copy of S with all occurrences of substring\n\
4045 old replaced by new.  If the optional argument maxsplit is\n\
4046 given, only the first maxsplit occurrences are replaced.";
4047
4048 static PyObject*
4049 unicode_replace(PyUnicodeObject *self, PyObject *args)
4050 {
4051     PyUnicodeObject *str1;
4052     PyUnicodeObject *str2;
4053     int maxcount = -1;
4054     PyObject *result;
4055
4056     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4057         return NULL;
4058     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4059     if (str1 == NULL)
4060         return NULL;
4061     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4062     if (str2 == NULL)
4063         return NULL;
4064
4065     result = replace(self, str1, str2, maxcount);
4066
4067     Py_DECREF(str1);
4068     Py_DECREF(str2);
4069     return result;
4070 }
4071
4072 static
4073 PyObject *unicode_repr(PyObject *unicode)
4074 {
4075     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4076                                 PyUnicode_GET_SIZE(unicode),
4077                                 1);
4078 }
4079
4080 static char rfind__doc__[] =
4081 "S.rfind(sub [,start [,end]]) -> int\n\
4082 \n\
4083 Return the highest index in S where substring sub is found,\n\
4084 such that sub is contained within s[start,end].  Optional\n\
4085 arguments start and end are interpreted as in slice notation.\n\
4086 \n\
4087 Return -1 on failure.";
4088
4089 static PyObject *
4090 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4091 {
4092     PyUnicodeObject *substring;
4093     int start = 0;
4094     int end = INT_MAX;
4095     PyObject *result;
4096
4097     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4098                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4099         return NULL;
4100     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4101                                                 (PyObject *)substring);
4102     if (substring == NULL)
4103         return NULL;
4104
4105     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4106
4107     Py_DECREF(substring);
4108     return result;
4109 }
4110
4111 static char rindex__doc__[] =
4112 "S.rindex(sub [,start [,end]]) -> int\n\
4113 \n\
4114 Like S.rfind() but raise ValueError when the substring is not found.";
4115
4116 static PyObject *
4117 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4118 {
4119     int result;
4120     PyUnicodeObject *substring;
4121     int start = 0;
4122     int end = INT_MAX;
4123
4124     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4125                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4126         return NULL;
4127     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4128                                                 (PyObject *)substring);
4129     if (substring == NULL)
4130         return NULL;
4131
4132     result = findstring(self, substring, start, end, -1);
4133
4134     Py_DECREF(substring);
4135     if (result < 0) {
4136         PyErr_SetString(PyExc_ValueError, "substring not found");
4137         return NULL;
4138     }
4139     return PyInt_FromLong(result);
4140 }
4141
4142 static char rjust__doc__[] =
4143 "S.rjust(width) -> unicode\n\
4144 \n\
4145 Return S right justified in a Unicode string of length width. Padding is\n\
4146 done using spaces.";
4147
4148 static PyObject *
4149 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4150 {
4151     int width;
4152     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4153         return NULL;
4154
4155     if (self->length >= width) {
4156         Py_INCREF(self);
4157         return (PyObject*) self;
4158     }
4159
4160     return (PyObject*) pad(self, width - self->length, 0, ' ');
4161 }
4162
4163 static char rstrip__doc__[] =
4164 "S.rstrip() -> unicode\n\
4165 \n\
4166 Return a copy of the string S with trailing whitespace removed.";
4167
4168 static PyObject *
4169 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4170 {
4171     if (!PyArg_NoArgs(args))
4172         return NULL;
4173     return strip(self, 0, 1);
4174 }
4175
4176 static PyObject*
4177 unicode_slice(PyUnicodeObject *self, int start, int end)
4178 {
4179     /* standard clamping */
4180     if (start < 0)
4181         start = 0;
4182     if (end < 0)
4183         end = 0;
4184     if (end > self->length)
4185         end = self->length;
4186     if (start == 0 && end == self->length) {
4187         /* full slice, return original string */
4188         Py_INCREF(self);
4189         return (PyObject*) self;
4190     }
4191     if (start > end)
4192         start = end;
4193     /* copy slice */
4194     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4195                                              end - start);
4196 }
4197
4198 PyObject *PyUnicode_Split(PyObject *s,
4199                           PyObject *sep,
4200                           int maxsplit)
4201 {
4202     PyObject *result;
4203
4204     s = PyUnicode_FromObject(s);
4205     if (s == NULL)
4206         return NULL;
4207     if (sep != NULL) {
4208         sep = PyUnicode_FromObject(sep);
4209         if (sep == NULL) {
4210             Py_DECREF(s);
4211             return NULL;
4212         }
4213     }
4214
4215     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4216
4217     Py_DECREF(s);
4218     Py_XDECREF(sep);
4219     return result;
4220 }
4221
4222 static char split__doc__[] =
4223 "S.split([sep [,maxsplit]]) -> list of strings\n\
4224 \n\
4225 Return a list of the words in S, using sep as the\n\
4226 delimiter string.  If maxsplit is given, at most maxsplit\n\
4227 splits are done. If sep is not specified, any whitespace string\n\
4228 is a separator.";
4229
4230 static PyObject*
4231 unicode_split(PyUnicodeObject *self, PyObject *args)
4232 {
4233     PyObject *substring = Py_None;
4234     int maxcount = -1;
4235
4236     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4237         return NULL;
4238
4239     if (substring == Py_None)
4240         return split(self, NULL, maxcount);
4241     else if (PyUnicode_Check(substring))
4242         return split(self, (PyUnicodeObject *)substring, maxcount);
4243     else
4244         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4245 }
4246
4247 static char splitlines__doc__[] =
4248 "S.splitlines([keepends]]) -> list of strings\n\
4249 \n\
4250 Return a list of the lines in S, breaking at line boundaries.\n\
4251 Line breaks are not included in the resulting list unless keepends\n\
4252 is given and true.";
4253
4254 static PyObject*
4255 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4256 {
4257     int keepends = 0;
4258
4259     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4260         return NULL;
4261
4262     return PyUnicode_Splitlines((PyObject *)self, keepends);
4263 }
4264
4265 static
4266 PyObject *unicode_str(PyUnicodeObject *self)
4267 {
4268     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4269 }
4270
4271 static char strip__doc__[] =
4272 "S.strip() -> unicode\n\
4273 \n\
4274 Return a copy of S with leading and trailing whitespace removed.";
4275
4276 static PyObject *
4277 unicode_strip(PyUnicodeObject *self, PyObject *args)
4278 {
4279     if (!PyArg_NoArgs(args))
4280         return NULL;
4281     return strip(self, 1, 1);
4282 }
4283
4284 static char swapcase__doc__[] =
4285 "S.swapcase() -> unicode\n\
4286 \n\
4287 Return a copy of S with uppercase characters converted to lowercase\n\
4288 and vice versa.";
4289
4290 static PyObject*
4291 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4292 {
4293     if (!PyArg_NoArgs(args))
4294         return NULL;
4295     return fixup(self, fixswapcase);
4296 }
4297
4298 static char translate__doc__[] =
4299 "S.translate(table) -> unicode\n\
4300 \n\
4301 Return a copy of the string S, where all characters have been mapped\n\
4302 through the given translation table, which must be a mapping of\n\
4303 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4304 are left untouched. Characters mapped to None are deleted.";
4305
4306 static PyObject*
4307 unicode_translate(PyUnicodeObject *self, PyObject *args)
4308 {
4309     PyObject *table;
4310
4311     if (!PyArg_ParseTuple(args, "O:translate", &table))
4312         return NULL;
4313     return PyUnicode_TranslateCharmap(self->str,
4314                                       self->length,
4315                                       table,
4316                                       "ignore");
4317 }
4318
4319 static char upper__doc__[] =
4320 "S.upper() -> unicode\n\
4321 \n\
4322 Return a copy of S converted to uppercase.";
4323
4324 static PyObject*
4325 unicode_upper(PyUnicodeObject *self, PyObject *args)
4326 {
4327     if (!PyArg_NoArgs(args))
4328         return NULL;
4329     return fixup(self, fixupper);
4330 }
4331
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) (currently compiled out).

   Strings already at least width long are returned unchanged.
   Otherwise the string is left-padded with '0' via pad(), and a
   leading '+' or '-' is moved in front of the inserted zeros. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() is defined elsewhere; do not touch u->str if it failed. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4367
#if 0
/* Debugging helper (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4377
4378 static char startswith__doc__[] =
4379 "S.startswith(prefix[, start[, end]]) -> int\n\
4380 \n\
4381 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4382 optional start, test S beginning at that position.  With optional end, stop\n\
4383 comparing S at that position.";
4384
4385 static PyObject *
4386 unicode_startswith(PyUnicodeObject *self,
4387                    PyObject *args)
4388 {
4389     PyUnicodeObject *substring;
4390     int start = 0;
4391     int end = INT_MAX;
4392     PyObject *result;
4393
4394     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4395                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4396         return NULL;
4397     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4398                                                 (PyObject *)substring);
4399     if (substring == NULL)
4400         return NULL;
4401
4402     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4403
4404     Py_DECREF(substring);
4405     return result;
4406 }
4407
4408
4409 static char endswith__doc__[] =
4410 "S.endswith(suffix[, start[, end]]) -> int\n\
4411 \n\
4412 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4413 optional start, test S beginning at that position.  With optional end, stop\n\
4414 comparing S at that position.";
4415
4416 static PyObject *
4417 unicode_endswith(PyUnicodeObject *self,
4418                  PyObject *args)
4419 {
4420     PyUnicodeObject *substring;
4421     int start = 0;
4422     int end = INT_MAX;
4423     PyObject *result;
4424
4425     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4426                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4427         return NULL;
4428     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4429                                                 (PyObject *)substring);
4430     if (substring == NULL)
4431         return NULL;
4432
4433     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4434
4435     Py_DECREF(substring);
4436     return result;
4437 }
4438
4439
/* Method table for Unicode objects, consulted by unicode_getattr()
   through Py_FindMethod().

   Each entry is {name, C function, flags, doc string}.  Among the
   implementations visible in this file, entries with flag 1 parse
   their arguments with PyArg_ParseTuple() and entries with flag 0 use
   PyArg_NoArgs().  The table is terminated by a {NULL, NULL} entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled: unicode_zfill is itself compiled out above. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
4492
4493 static PyObject *
4494 unicode_getattr(PyUnicodeObject *self, char *name)
4495 {
4496     return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4497 }
4498
/* Sequence protocol slots for Unicode objects.  Item and slice
   assignment are left as 0, i.e. no handler is installed for them. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4509
4510 static int
4511 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4512                           int index,
4513                           const void **ptr)
4514 {
4515     if (index != 0) {
4516         PyErr_SetString(PyExc_SystemError,
4517                         "accessing non-existent unicode segment");
4518         return -1;
4519     }
4520     *ptr = (void *) self->str;
4521     return PyUnicode_GET_DATA_SIZE(self);
4522 }
4523
4524 static int
4525 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4526                            const void **ptr)
4527 {
4528     PyErr_SetString(PyExc_TypeError,
4529                     "cannot use unicode as modifyable buffer");
4530     return -1;
4531 }
4532
4533 static int
4534 unicode_buffer_getsegcount(PyUnicodeObject *self,
4535                            int *lenp)
4536 {
4537     if (lenp)
4538         *lenp = PyUnicode_GET_DATA_SIZE(self);
4539     return 1;
4540 }
4541
4542 static int
4543 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4544                           int index,
4545                           const void **ptr)
4546 {
4547     PyObject *str;
4548
4549     if (index != 0) {
4550         PyErr_SetString(PyExc_SystemError,
4551                         "accessing non-existent unicode segment");
4552         return -1;
4553     }
4554     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4555     if (str == NULL)
4556         return -1;
4557     *ptr = (void *) PyString_AS_STRING(str);
4558     return PyString_GET_SIZE(str);
4559 }
4560
4561 /* Helpers for PyUnicode_Format() */
4562
4563 static PyObject *
4564 getnextarg(PyObject *args, int arglen, int *p_argidx)
4565 {
4566     int argidx = *p_argidx;
4567     if (argidx < arglen) {
4568         (*p_argidx)++;
4569         if (arglen < 0)
4570             return args;
4571         else
4572             return PyTuple_GetItem(args, argidx);
4573     }
4574     PyErr_SetString(PyExc_TypeError,
4575                     "not enough arguments for format string");
4576     return NULL;
4577 }
4578
/* Bit flags collected by PyUnicode_Format() while it scans the flag
   characters of a format specifier. */
#define F_LJUST (1<<0)  /* set by '-', and by a negative '*' width */
#define F_SIGN  (1<<1)  /* set by '+' */
#define F_BLANK (1<<2)  /* set by ' ' */
#define F_ALT   (1<<3)  /* set by '#'; adds '#' to the C format string */
#define F_ZERO  (1<<4)  /* set by '0'; numeric conversions fill with '0' */
4584
4585 static
4586 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4587 {
4588     register int i;
4589     int len;
4590     va_list va;
4591     char *charbuffer;
4592     va_start(va, format);
4593
4594     /* First, format the string as char array, then expand to Py_UNICODE
4595        array. */
4596     charbuffer = (char *)buffer;
4597     len = vsprintf(charbuffer, format, va);
4598     for (i = len - 1; i >= 0; i--)
4599         buffer[i] = (Py_UNICODE) charbuffer[i];
4600
4601     va_end(va);
4602     return len;
4603 }
4604
4605 static int
4606 formatfloat(Py_UNICODE *buf,
4607             size_t buflen,
4608             int flags,
4609             int prec,
4610             int type,
4611             PyObject *v)
4612 {
4613     /* fmt = '%#.' + `prec` + `type`
4614        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4615     char fmt[20];
4616     double x;
4617
4618     x = PyFloat_AsDouble(v);
4619     if (x == -1.0 && PyErr_Occurred())
4620         return -1;
4621     if (prec < 0)
4622         prec = 6;
4623     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4624         type = 'g';
4625     sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4626     /* worst case length calc to ensure no buffer overrun:
4627          fmt = %#.<prec>g
4628          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4629             for any double rep.)
4630          len = 1 + prec + 1 + 2 + 5 = 9 + prec
4631        If prec=0 the effective precision is 1 (the leading digit is
4632        always given), therefore increase by one to 10+prec. */
4633     if (buflen <= (size_t)10 + (size_t)prec) {
4634         PyErr_SetString(PyExc_OverflowError,
4635             "formatted float is too long (precision too long?)");
4636         return -1;
4637     }
4638     return usprintf(buf, fmt, x);
4639 }
4640
4641 static PyObject*
4642 formatlong(PyObject *val, int flags, int prec, int type)
4643 {
4644         char *buf;
4645         int i, len;
4646         PyObject *str; /* temporary string object. */
4647         PyUnicodeObject *result;
4648
4649         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4650         if (!str)
4651                 return NULL;
4652         result = _PyUnicode_New(len);
4653         for (i = 0; i < len; i++)
4654                 result->str[i] = buf[i];
4655         result->str[len] = 0;
4656         Py_DECREF(str);
4657         return (PyObject*)result;
4658 }
4659
4660 static int
4661 formatint(Py_UNICODE *buf,
4662           size_t buflen,
4663           int flags,
4664           int prec,
4665           int type,
4666           PyObject *v)
4667 {
4668     /* fmt = '%#.' + `prec` + 'l' + `type`
4669        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4670        + 1 + 1 = 24*/
4671     char fmt[64]; /* plenty big enough! */
4672     long x;
4673
4674     x = PyInt_AsLong(v);
4675     if (x == -1 && PyErr_Occurred())
4676         return -1;
4677     if (prec < 0)
4678         prec = 1;
4679     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4680        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4681     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4682         PyErr_SetString(PyExc_OverflowError,
4683             "formatted integer is too long (precision too long?)");
4684         return -1;
4685     }
4686     sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4687     return usprintf(buf, fmt, x);
4688 }
4689
4690 static int
4691 formatchar(Py_UNICODE *buf,
4692            size_t buflen,
4693            PyObject *v)
4694 {
4695     /* presume that the buffer is at least 2 characters long */
4696     if (PyUnicode_Check(v)) {
4697         if (PyUnicode_GET_SIZE(v) != 1)
4698             goto onError;
4699         buf[0] = PyUnicode_AS_UNICODE(v)[0];
4700     }
4701
4702     else if (PyString_Check(v)) {
4703         if (PyString_GET_SIZE(v) != 1)
4704             goto onError;
4705         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4706     }
4707
4708     else {
4709         /* Integer input truncated to a character */
4710         long x;
4711         x = PyInt_AsLong(v);
4712         if (x == -1 && PyErr_Occurred())
4713             goto onError;
4714         buf[0] = (char) x;
4715     }
4716     buf[1] = '\0';
4717     return 1;
4718
4719  onError:
4720     PyErr_SetString(PyExc_TypeError,
4721                     "%c requires int or char");
4722     return -1;
4723 }
4724
4725 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4726
4727    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4728    chars are formatted. XXX This is a magic number. Each formatting
4729    routine does bounds checking to ensure no overflow, but a better
4730    solution may be to malloc a buffer of appropriate size for each
4731    format. For now, the current solution is sufficient.
4732 */
4733 #define FORMATBUFLEN (size_t)120
4734
4735 PyObject *PyUnicode_Format(PyObject *format,
4736                            PyObject *args)
4737 {
4738     Py_UNICODE *fmt, *res;
4739     int fmtcnt, rescnt, reslen, arglen, argidx;
4740     int args_owned = 0;
4741     PyUnicodeObject *result = NULL;
4742     PyObject *dict = NULL;
4743     PyObject *uformat;
4744
4745     if (format == NULL || args == NULL) {
4746         PyErr_BadInternalCall();
4747         return NULL;
4748     }
4749     uformat = PyUnicode_FromObject(format);
4750     if (uformat == NULL)
4751         return NULL;
4752     fmt = PyUnicode_AS_UNICODE(uformat);
4753     fmtcnt = PyUnicode_GET_SIZE(uformat);
4754
4755     reslen = rescnt = fmtcnt + 100;
4756     result = _PyUnicode_New(reslen);
4757     if (result == NULL)
4758         goto onError;
4759     res = PyUnicode_AS_UNICODE(result);
4760
4761     if (PyTuple_Check(args)) {
4762         arglen = PyTuple_Size(args);
4763         argidx = 0;
4764     }
4765     else {
4766         arglen = -1;
4767         argidx = -2;
4768     }
4769     if (args->ob_type->tp_as_mapping)
4770         dict = args;
4771
4772     while (--fmtcnt >= 0) {
4773         if (*fmt != '%') {
4774             if (--rescnt < 0) {
4775                 rescnt = fmtcnt + 100;
4776                 reslen += rescnt;
4777                 if (_PyUnicode_Resize(result, reslen) < 0)
4778                     return NULL;
4779                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4780                 --rescnt;
4781             }
4782             *res++ = *fmt++;
4783         }
4784         else {
4785             /* Got a format specifier */
4786             int flags = 0;
4787             int width = -1;
4788             int prec = -1;
4789             int size = 0;
4790             Py_UNICODE c = '\0';
4791             Py_UNICODE fill;
4792             PyObject *v = NULL;
4793             PyObject *temp = NULL;
4794             Py_UNICODE *pbuf;
4795             Py_UNICODE sign;
4796             int len;
4797             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4798
4799             fmt++;
4800             if (*fmt == '(') {
4801                 Py_UNICODE *keystart;
4802                 int keylen;
4803                 PyObject *key;
4804                 int pcount = 1;
4805
4806                 if (dict == NULL) {
4807                     PyErr_SetString(PyExc_TypeError,
4808                                     "format requires a mapping");
4809                     goto onError;
4810                 }
4811                 ++fmt;
4812                 --fmtcnt;
4813                 keystart = fmt;
4814                 /* Skip over balanced parentheses */
4815                 while (pcount > 0 && --fmtcnt >= 0) {
4816                     if (*fmt == ')')
4817                         --pcount;
4818                     else if (*fmt == '(')
4819                         ++pcount;
4820                     fmt++;
4821                 }
4822                 keylen = fmt - keystart - 1;
4823                 if (fmtcnt < 0 || pcount > 0) {
4824                     PyErr_SetString(PyExc_ValueError,
4825                                     "incomplete format key");
4826                     goto onError;
4827                 }
4828                 /* keys are converted to strings using UTF-8 and
4829                    then looked up since Python uses strings to hold
4830                    variables names etc. in its namespaces and we
4831                    wouldn't want to break common idioms. */
4832                 key = PyUnicode_EncodeUTF8(keystart,
4833                                            keylen,
4834                                            NULL);
4835                 if (key == NULL)
4836                     goto onError;
4837                 if (args_owned) {
4838                     Py_DECREF(args);
4839                     args_owned = 0;
4840                 }
4841                 args = PyObject_GetItem(dict, key);
4842                 Py_DECREF(key);
4843                 if (args == NULL) {
4844                     goto onError;
4845                 }
4846                 args_owned = 1;
4847                 arglen = -1;
4848                 argidx = -2;
4849             }
4850             while (--fmtcnt >= 0) {
4851                 switch (c = *fmt++) {
4852                 case '-': flags |= F_LJUST; continue;
4853                 case '+': flags |= F_SIGN; continue;
4854                 case ' ': flags |= F_BLANK; continue;
4855                 case '#': flags |= F_ALT; continue;
4856                 case '0': flags |= F_ZERO; continue;
4857                 }
4858                 break;
4859             }
4860             if (c == '*') {
4861                 v = getnextarg(args, arglen, &argidx);
4862                 if (v == NULL)
4863                     goto onError;
4864                 if (!PyInt_Check(v)) {
4865                     PyErr_SetString(PyExc_TypeError,
4866                                     "* wants int");
4867                     goto onError;
4868                 }
4869                 width = PyInt_AsLong(v);
4870                 if (width < 0) {
4871                     flags |= F_LJUST;
4872                     width = -width;
4873                 }
4874                 if (--fmtcnt >= 0)
4875                     c = *fmt++;
4876             }
4877             else if (c >= '0' && c <= '9') {
4878                 width = c - '0';
4879                 while (--fmtcnt >= 0) {
4880                     c = *fmt++;
4881                     if (c < '0' || c > '9')
4882                         break;
4883                     if ((width*10) / 10 != width) {
4884                         PyErr_SetString(PyExc_ValueError,
4885                                         "width too big");
4886                         goto onError;
4887                     }
4888                     width = width*10 + (c - '0');
4889                 }
4890             }
4891             if (c == '.') {
4892                 prec = 0;
4893                 if (--fmtcnt >= 0)
4894                     c = *fmt++;
4895                 if (c == '*') {
4896                     v = getnextarg(args, arglen, &argidx);
4897                     if (v == NULL)
4898                         goto onError;
4899                     if (!PyInt_Check(v)) {
4900                         PyErr_SetString(PyExc_TypeError,
4901                                         "* wants int");
4902                         goto onError;
4903                     }
4904                     prec = PyInt_AsLong(v);
4905                     if (prec < 0)
4906                         prec = 0;
4907                     if (--fmtcnt >= 0)
4908                         c = *fmt++;
4909                 }
4910                 else if (c >= '0' && c <= '9') {
4911                     prec = c - '0';
4912                     while (--fmtcnt >= 0) {
4913                         c = Py_CHARMASK(*fmt++);
4914                         if (c < '0' || c > '9')
4915                             break;
4916                         if ((prec*10) / 10 != prec) {
4917                             PyErr_SetString(PyExc_ValueError,
4918                                             "prec too big");
4919                             goto onError;
4920                         }
4921                         prec = prec*10 + (c - '0');
4922                     }
4923                 }
4924             } /* prec */
4925             if (fmtcnt >= 0) {
4926                 if (c == 'h' || c == 'l' || c == 'L') {
4927                     size = c;
4928                     if (--fmtcnt >= 0)
4929                         c = *fmt++;
4930                 }
4931             }
4932             if (fmtcnt < 0) {
4933                 PyErr_SetString(PyExc_ValueError,
4934                                 "incomplete format");
4935                 goto onError;
4936             }
4937             if (c != '%') {
4938                 v = getnextarg(args, arglen, &argidx);
4939                 if (v == NULL)
4940                     goto onError;
4941             }
4942             sign = 0;
4943             fill = ' ';
4944             switch (c) {
4945
4946             case '%':
4947                 pbuf = formatbuf;
4948                 /* presume that buffer length is at least 1 */
4949                 pbuf[0] = '%';
4950                 len = 1;
4951                 break;
4952
4953             case 's':
4954             case 'r':
4955                 if (PyUnicode_Check(v) && c == 's') {
4956                     temp = v;
4957                     Py_INCREF(temp);
4958                 }
4959                 else {
4960                     PyObject *unicode;
4961                     if (c == 's')
4962                         temp = PyObject_Str(v);
4963                     else
4964                         temp = PyObject_Repr(v);
4965                     if (temp == NULL)
4966                         goto onError;
4967                     if (!PyString_Check(temp)) {
4968                         /* XXX Note: this should never happen, since
4969                                PyObject_Repr() and PyObject_Str() assure
4970                                this */
4971                         Py_DECREF(temp);
4972                         PyErr_SetString(PyExc_TypeError,
4973                                         "%s argument has non-string str()");
4974                         goto onError;
4975                     }
4976                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
4977                                                    PyString_GET_SIZE(temp),
4978                                                NULL,
4979                                                    "strict");
4980                     Py_DECREF(temp);
4981                     temp = unicode;
4982                     if (temp == NULL)
4983                         goto onError;
4984                 }
4985                 pbuf = PyUnicode_AS_UNICODE(temp);
4986                 len = PyUnicode_GET_SIZE(temp);
4987                 if (prec >= 0 && len > prec)
4988                     len = prec;
4989                 break;
4990
4991             case 'i':
4992             case 'd':
4993             case 'u':
4994             case 'o':
4995             case 'x':
4996             case 'X':
4997                 if (c == 'i')
4998                     c = 'd';
4999                 if (PyLong_Check(v)) {
5000                     temp = formatlong(v, flags, prec, c);
5001                     if (!temp)
5002                         goto onError;
5003                     pbuf = PyUnicode_AS_UNICODE(temp);
5004                     len = PyUnicode_GET_SIZE(temp);
5005                     /* unbounded ints can always produce
5006                        a sign character! */
5007                     sign = 1;
5008                 }
5009                 else {
5010                     pbuf = formatbuf;
5011                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5012                                     flags, prec, c, v);
5013                     if (len < 0)
5014                         goto onError;
5015                     /* only d conversion is signed */
5016                     sign = c == 'd';
5017                 }
5018                 if (flags & F_ZERO)
5019                     fill = '0';
5020                 break;
5021
5022             case 'e':
5023             case 'E':
5024             case 'f':
5025             case 'g':
5026             case 'G':
5027                 pbuf = formatbuf;
5028                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5029                         flags, prec, c, v);
5030                 if (len < 0)
5031                     goto onError;
5032                 sign = 1;
5033                 if (flags & F_ZERO)
5034                     fill = '0';
5035                 break;
5036
5037             case 'c':
5038                 pbuf = formatbuf;
5039                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5040                 if (len < 0)
5041                     goto onError;
5042                 break;
5043
5044             default:
5045                 PyErr_Format(PyExc_ValueError,
5046                              "unsupported format character '%c' (0x%x) "
5047                              "at index %i",
5048                              (31<=c && c<=126) ? c : '?',
5049                              c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
5050                 goto onError;
5051             }
5052             if (sign) {
5053                 if (*pbuf == '-' || *pbuf == '+') {
5054                     sign = *pbuf++;
5055                     len--;
5056                 }
5057                 else if (flags & F_SIGN)
5058                     sign = '+';
5059                 else if (flags & F_BLANK)
5060                     sign = ' ';
5061                 else
5062                     sign = 0;
5063             }
5064             if (width < len)
5065                 width = len;
5066             if (rescnt < width + (sign != 0)) {
5067                 reslen -= rescnt;
5068                 rescnt = width + fmtcnt + 100;
5069                 reslen += rescnt;
5070                 if (_PyUnicode_Resize(result, reslen) < 0)
5071                     return NULL;
5072                 res = PyUnicode_AS_UNICODE(result)
5073                     + reslen - rescnt;
5074             }
5075             if (sign) {
5076                 if (fill != ' ')
5077                     *res++ = sign;
5078                 rescnt--;
5079                 if (width > len)
5080                     width--;
5081             }
5082             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5083                 assert(pbuf[0] == '0');
5084                 assert(pbuf[1] == c);
5085                 if (fill != ' ') {
5086                     *res++ = *pbuf++;
5087                     *res++ = *pbuf++;
5088                 }
5089                 rescnt -= 2;
5090                 width -= 2;
5091                 if (width < 0)
5092                     width = 0;
5093                 len -= 2;
5094             }
5095             if (width > len && !(flags & F_LJUST)) {
5096                 do {
5097                     --rescnt;
5098                     *res++ = fill;
5099                 } while (--width > len);
5100             }
5101             if (fill == ' ') {
5102                 if (sign)
5103                     *res++ = sign;
5104                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5105                     assert(pbuf[0] == '0');
5106                     assert(pbuf[1] == c);
5107                     *res++ = *pbuf++;
5108                     *res++ = *pbuf++;
5109                 }
5110             }
5111             memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5112             res += len;
5113             rescnt -= len;
5114             while (--width >= len) {
5115                 --rescnt;
5116                 *res++ = ' ';
5117             }
5118             if (dict && (argidx < arglen) && c != '%') {
5119                 PyErr_SetString(PyExc_TypeError,
5120                                 "not all arguments converted");
5121                 goto onError;
5122             }
5123             Py_XDECREF(temp);
5124         } /* '%' */
5125     } /* until end */
5126     if (argidx < arglen && !dict) {
5127         PyErr_SetString(PyExc_TypeError,
5128                         "not all arguments converted");
5129         goto onError;
5130     }
5131
5132     if (args_owned) {
5133         Py_DECREF(args);
5134     }
5135     Py_DECREF(uformat);
5136     if (_PyUnicode_Resize(result, reslen - rescnt))
5137         goto onError;
5138     return (PyObject *)result;
5139
5140  onError:
5141     Py_XDECREF(result);
5142     Py_DECREF(uformat);
5143     if (args_owned) {
5144         Py_DECREF(args);
5145     }
5146     return NULL;
5147 }
5148
5149 static PyBufferProcs unicode_as_buffer = {
5150     (getreadbufferproc) unicode_buffer_getreadbuf,
5151     (getwritebufferproc) unicode_buffer_getwritebuf,
5152     (getsegcountproc) unicode_buffer_getsegcount,
5153     (getcharbufferproc) unicode_buffer_getcharbuf,
5154 };
5155
5156 PyTypeObject PyUnicode_Type = {
5157     PyObject_HEAD_INIT(&PyType_Type)
5158     0,                                  /* ob_size */
5159     "unicode",                          /* tp_name */
5160     sizeof(PyUnicodeObject),            /* tp_size */
5161     0,                                  /* tp_itemsize */
5162     /* Slots */
5163     (destructor)_PyUnicode_Free,        /* tp_dealloc */
5164     0,                                  /* tp_print */
5165     (getattrfunc)unicode_getattr,       /* tp_getattr */
5166     0,                                  /* tp_setattr */
5167     (cmpfunc) unicode_compare,          /* tp_compare */
5168     (reprfunc) unicode_repr,            /* tp_repr */
5169     0,                                  /* tp_as_number */
5170     &unicode_as_sequence,               /* tp_as_sequence */
5171     0,                                  /* tp_as_mapping */
5172     (hashfunc) unicode_hash,            /* tp_hash*/
5173     0,                                  /* tp_call*/
5174     (reprfunc) unicode_str,             /* tp_str */
5175     (getattrofunc) NULL,                /* tp_getattro */
5176     (setattrofunc) NULL,                /* tp_setattro */
5177     &unicode_as_buffer,                 /* tp_as_buffer */
5178     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
5179 };
5180
5181 /* Initialize the Unicode implementation */
5182
5183 void _PyUnicode_Init(void)
5184 {
5185     /* Doublecheck the configuration... */
5186     if (sizeof(Py_UNICODE) != 2)
5187         Py_FatalError("Unicode configuration error: "
5188                       "sizeof(Py_UNICODE) != 2 bytes");
5189
5190     /* Init the implementation */
5191     unicode_freelist = NULL;
5192     unicode_freelist_size = 0;
5193     unicode_empty = _PyUnicode_New(0);
5194     strcpy(unicode_default_encoding, "ascii");
5195 }
5196
5197 /* Finalize the Unicode implementation */
5198
5199 void
5200 _PyUnicode_Fini(void)
5201 {
5202     PyUnicodeObject *u;
5203
5204     Py_XDECREF(unicode_empty);
5205     unicode_empty = NULL;
5206
5207     for (u = unicode_freelist; u != NULL;) {
5208         PyUnicodeObject *v = u;
5209         u = *(PyUnicodeObject **)u;
5210         if (v->str)
5211             PyMem_DEL(v->str);
5212         Py_XDECREF(v->defenc);
5213         PyObject_DEL(v);
5214     }
5215     unicode_freelist = NULL;
5216     unicode_freelist_size = 0;
5217 }