Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9
  10  Original header:
  11  --------------------------------------------------------------------
  12
  13  * Yet another Unicode string type for Python.  This type supports the
  14  * 16-bit Basic Multilingual Plane (BMP) only.
  15  *
  16  * Note that this string class supports embedded NULL characters.  End
  17  * of string is given by the length attribute.  However, the internal
  18  * representation always stores a trailing NULL to make it easier to
  19  * use unicode strings with standard APIs.
  20  *
  21  * History:
  22  * 1999-01-23 fl  Created
  23  * 1999-01-24 fl  Added split, join, capwords; basic UTF-8 support
  24  * 1999-01-24 fl  Basic UCS-2 support, buffer interface, etc.
  25  * 1999-03-06 fl  Moved declarations to separate file, etc.
  26  * 1999-06-13 fl  Changed join method semantics according to Tim's proposal
  27  * 1999-08-10 fl  Some minor tweaks
  28  *
  29  * Written by Fredrik Lundh, January 1999.
  30  *
  31  * Copyright (c) 1999 by Secret Labs AB.
  32  * Copyright (c) 1999 by Fredrik Lundh.
  33  *
  34  * fredrik@pythonware.com
  35  * http://www.pythonware.com
  36  *
  37  * --------------------------------------------------------------------
  38  * This Unicode String Type is
  39  *
  40  * Copyright (c) 1999 by Secret Labs AB
  41  * Copyright (c) 1999 by Fredrik Lundh
  42  *
  43  * By obtaining, using, and/or copying this software and/or its
  44  * associated documentation, you agree that you have read, understood,
  45  * and will comply with the following terms and conditions:
  46  *
  47  * Permission to use, copy, modify, and distribute this software and its
  48  * associated documentation for any purpose and without fee is hereby
  49  * granted, provided that the above copyright notice appears in all
  50  * copies, and that both that copyright notice and this permission notice
  51  * appear in supporting documentation, and that the name of Secret Labs
  52  * AB or the author not be used in advertising or publicity pertaining to
  53  * distribution of the software without specific, written prior
  54  * permission.
  55  *
  56  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  57  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  58  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  59  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  60  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  61  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  62  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  63  * -------------------------------------------------------------------- */
  64
  65 #include "Python.h"
  66
  67 #include "unicodeobject.h"
  68 #include "ucnhash.h"
  69
  70 #if defined(HAVE_LIMITS_H)
  71 #include <limits.h>
  72 #else
  73 #define INT_MAX 2147483647
  74 #endif
  75
  76 #ifdef MS_WIN32
  77 #include <windows.h>
  78 #endif
  79
  80 /* Limit for the Unicode object free list */
  81
  82 #define MAX_UNICODE_FREELIST_SIZE       1024
  83
  84 /* Limit for the Unicode object free list stay alive optimization.
  85
  86    The implementation will keep allocated Unicode memory intact for
  87    all objects on the free list having a size less than this
  88    limit. This reduces malloc() overhead for small Unicode objects.
  89
  90    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
  91    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  92    malloc()-overhead) bytes of unused garbage.
  93
  94    Setting the limit to 0 effectively turns the feature off.
  95
  96    Note: This is an experimental feature ! If you get core dumps when
  97    using Unicode objects, turn this feature off.
  98
  99 */
 100
 101 #define KEEPALIVE_SIZE_LIMIT       9
 102
 103 /* Endianness switches; defaults to little endian */
 104
 105 #ifdef WORDS_BIGENDIAN
 106 # define BYTEORDER_IS_BIG_ENDIAN
 107 #else
 108 # define BYTEORDER_IS_LITTLE_ENDIAN
 109 #endif
 110
 111 /* --- Globals ------------------------------------------------------------
 112
 113    The globals are initialized by the _PyUnicode_Init() API and should
 114    not be used before calling that API.
 115
 116 */
 117
 118 /* The empty Unicode object */
 119 static PyUnicodeObject *unicode_empty;
 120
 121 /* Free list for Unicode objects */
 122 static PyUnicodeObject *unicode_freelist;
 123 static int unicode_freelist_size;
 124
 125 /* Default encoding to use and assume when NULL is passed as encoding
 126    parameter; it is initialized by _PyUnicode_Init().
 127
 128    Always use the PyUnicode_SetDefaultEncoding() and
 129    PyUnicode_GetDefaultEncoding() APIs to access this global.
 130
 131 */
 132
 133 static char unicode_default_encoding[100];
 134
 135 /* --- Unicode Object ----------------------------------------------------- */
 136
 137 static
 138 int _PyUnicode_Resize(register PyUnicodeObject *unicode,
 139                       int length)
 140 {
 141     void *oldstr;
 142
 143     /* Shortcut if there's nothing much to do. */
 144     if (unicode->length == length)
 145         goto reset;
 146
 147     /* Resizing unicode_empty is not allowed. */
 148     if (unicode == unicode_empty) {
 149         PyErr_SetString(PyExc_SystemError,
 150                         "can't resize empty unicode object");
 151         return -1;
 152     }
 153
 154     /* We allocate one more byte to make sure the string is
 155        Ux0000 terminated -- XXX is this needed ? */
 156     oldstr = unicode->str;
 157     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 158     if (!unicode->str) {
 159         unicode->str = oldstr;
 160         PyErr_NoMemory();
 161         return -1;
 162     }
 163     unicode->str[length] = 0;
 164     unicode->length = length;
 165
 166  reset:
 167     /* Reset the object caches */
 168     if (unicode->defenc) {
 169         Py_DECREF(unicode->defenc);
 170         unicode->defenc = NULL;
 171     }
 172     unicode->hash = -1;
 173
 174     return 0;
 175 }
 176
 177 int PyUnicode_Resize(PyObject **unicode,
 178                      int length)
 179 {
 180     PyUnicodeObject *v;
 181
 182     if (unicode == NULL) {
 183         PyErr_BadInternalCall();
 184         return -1;
 185     }
 186     v = (PyUnicodeObject *)*unicode;
 187     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 188         PyErr_BadInternalCall();
 189         return -1;
 190     }
 191     return _PyUnicode_Resize(v, length);
 192 }
 193
 194 /* We allocate one more byte to make sure the string is
 195    Ux0000 terminated -- XXX is this needed ?
 196
 197    XXX This allocator could further be enhanced by assuring that the
 198        free list never reduces its size below 1.
 199
 200 */
 201
 202 static
 203 PyUnicodeObject *_PyUnicode_New(int length)
 204 {
 205     register PyUnicodeObject *unicode;
 206
 207     /* Optimization for empty strings */
 208     if (length == 0 && unicode_empty != NULL) {
 209         Py_INCREF(unicode_empty);
 210         return unicode_empty;
 211     }
 212
 213     /* Unicode freelist & memory allocation */
 214     if (unicode_freelist) {
 215         unicode = unicode_freelist;
 216         unicode_freelist = *(PyUnicodeObject **)unicode;
 217         unicode_freelist_size--;
 218         if (unicode->str) {
 219             /* Keep-Alive optimization: we only upsize the buffer,
 220                never downsize it. */
 221             if ((unicode->length < length) &&
 222                 _PyUnicode_Resize(unicode, length)) {
 223                 PyMem_DEL(unicode->str);
 224                 goto onError;
 225             }
 226         }
 227       else {
 228             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 229       }
 230       PyObject_INIT(unicode, &PyUnicode_Type);
 231     }
 232     else {
 233         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 234         if (unicode == NULL)
 235             return NULL;
 236         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 237     }
 238
 239     if (!unicode->str) {
 240         PyErr_NoMemory();
 241         goto onError;
 242     }
 243     unicode->str[length] = 0;
 244     unicode->length = length;
 245     unicode->hash = -1;
 246     unicode->defenc = NULL;
 247     return unicode;
 248
 249  onError:
 250     _Py_ForgetReference((PyObject *)unicode);
 251     PyObject_DEL(unicode);
 252     return NULL;
 253 }
 254
 255 static
 256 void _PyUnicode_Free(register PyUnicodeObject *unicode)
 257 {
 258     if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 259         /* Keep-Alive optimization */
 260         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 261             PyMem_DEL(unicode->str);
 262             unicode->str = NULL;
 263             unicode->length = 0;
 264         }
 265         if (unicode->defenc) {
 266             Py_DECREF(unicode->defenc);
 267             unicode->defenc = NULL;
 268         }
 269         /* Add to free list */
 270         *(PyUnicodeObject **)unicode = unicode_freelist;
 271         unicode_freelist = unicode;
 272         unicode_freelist_size++;
 273     }
 274     else {
 275         PyMem_DEL(unicode->str);
 276         Py_XDECREF(unicode->defenc);
 277         PyObject_DEL(unicode);
 278     }
 279 }
 280
 281 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 282                                 int size)
 283 {
 284     PyUnicodeObject *unicode;
 285
 286     unicode = _PyUnicode_New(size);
 287     if (!unicode)
 288         return NULL;
 289
 290     /* Copy the Unicode data into the new object */
 291     if (u != NULL)
 292         memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
 293
 294     return (PyObject *)unicode;
 295 }
 296
 297 #ifdef HAVE_WCHAR_H
 298
 299 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 300                                  int size)
 301 {
 302     PyUnicodeObject *unicode;
 303
 304     if (w == NULL) {
 305         PyErr_BadInternalCall();
 306         return NULL;
 307     }
 308
 309     unicode = _PyUnicode_New(size);
 310     if (!unicode)
 311         return NULL;
 312
 313     /* Copy the wchar_t data into the new object */
 314 #ifdef HAVE_USABLE_WCHAR_T
 315     memcpy(unicode->str, w, size * sizeof(wchar_t));
 316 #else
 317     {
 318         register Py_UNICODE *u;
 319         register int i;
 320         u = PyUnicode_AS_UNICODE(unicode);
 321         for (i = size; i >= 0; i--)
 322             *u++ = *w++;
 323     }
 324 #endif
 325
 326     return (PyObject *)unicode;
 327 }
 328
 329 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 330                          register wchar_t *w,
 331                          int size)
 332 {
 333     if (unicode == NULL) {
 334         PyErr_BadInternalCall();
 335         return -1;
 336     }
 337     if (size > PyUnicode_GET_SIZE(unicode))
 338         size = PyUnicode_GET_SIZE(unicode);
 339 #ifdef HAVE_USABLE_WCHAR_T
 340     memcpy(w, unicode->str, size * sizeof(wchar_t));
 341 #else
 342     {
 343         register Py_UNICODE *u;
 344         register int i;
 345         u = PyUnicode_AS_UNICODE(unicode);
 346         for (i = size; i >= 0; i--)
 347             *w++ = *u++;
 348     }
 349 #endif
 350
 351     return size;
 352 }
 353
 354 #endif
 355
 356 PyObject *PyUnicode_FromObject(register PyObject *obj)
 357 {
 358     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 359 }
 360
 361 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 362                                       const char *encoding,
 363                                       const char *errors)
 364 {
 365     const char *s;
 366     int len;
 367     int owned = 0;
 368     PyObject *v;
 369
 370     if (obj == NULL) {
 371         PyErr_BadInternalCall();
 372         return NULL;
 373     }
 374
 375     /* Coerce object */
 376     if (PyInstance_Check(obj)) {
 377         PyObject *func;
 378         func = PyObject_GetAttrString(obj, "__str__");
 379         if (func == NULL) {
 380             PyErr_SetString(PyExc_TypeError,
 381                   "coercing to Unicode: instance doesn't define __str__");
 382             return NULL;
 383         }
 384         obj = PyEval_CallObject(func, NULL);
 385         Py_DECREF(func);
 386         if (obj == NULL)
 387             return NULL;
 388         owned = 1;
 389     }
 390     if (PyUnicode_Check(obj)) {
 391         Py_INCREF(obj);
 392         v = obj;
 393         if (encoding) {
 394             PyErr_SetString(PyExc_TypeError,
 395                             "decoding Unicode is not supported");
 396             return NULL;
 397         }
 398         goto done;
 399     }
 400     else if (PyString_Check(obj)) {
 401         s = PyString_AS_STRING(obj);
 402         len = PyString_GET_SIZE(obj);
 403     }
 404     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 405         /* Overwrite the error message with something more useful in
 406            case of a TypeError. */
 407         if (PyErr_ExceptionMatches(PyExc_TypeError))
 408             PyErr_Format(PyExc_TypeError,
 409                          "coercing to Unicode: need string or buffer, "
 410                          "%.80s found",
 411                          obj->ob_type->tp_name);
 412         goto onError;
 413     }
 414
 415     /* Convert to Unicode */
 416     if (len == 0) {
 417         Py_INCREF(unicode_empty);
 418         v = (PyObject *)unicode_empty;
 419     }
 420     else
 421         v = PyUnicode_Decode(s, len, encoding, errors);
 422  done:
 423     if (owned) {
 424         Py_DECREF(obj);
 425     }
 426     return v;
 427
 428  onError:
 429     if (owned) {
 430         Py_DECREF(obj);
 431     }
 432     return NULL;
 433 }
 434
 435 PyObject *PyUnicode_Decode(const char *s,
 436                            int size,
 437                            const char *encoding,
 438                            const char *errors)
 439 {
 440     PyObject *buffer = NULL, *unicode;
 441
 442     if (encoding == NULL)
 443         encoding = PyUnicode_GetDefaultEncoding();
 444
 445     /* Shortcuts for common default encodings */
 446     if (strcmp(encoding, "utf-8") == 0)
 447         return PyUnicode_DecodeUTF8(s, size, errors);
 448     else if (strcmp(encoding, "latin-1") == 0)
 449         return PyUnicode_DecodeLatin1(s, size, errors);
 450     else if (strcmp(encoding, "ascii") == 0)
 451         return PyUnicode_DecodeASCII(s, size, errors);
 452
 453     /* Decode via the codec registry */
 454     buffer = PyBuffer_FromMemory((void *)s, size);
 455     if (buffer == NULL)
 456         goto onError;
 457     unicode = PyCodec_Decode(buffer, encoding, errors);
 458     if (unicode == NULL)
 459         goto onError;
 460     if (!PyUnicode_Check(unicode)) {
 461         PyErr_Format(PyExc_TypeError,
 462                      "decoder did not return an unicode object (type=%.400s)",
 463                      unicode->ob_type->tp_name);
 464         Py_DECREF(unicode);
 465         goto onError;
 466     }
 467     Py_DECREF(buffer);
 468     return unicode;
 469
 470  onError:
 471     Py_XDECREF(buffer);
 472     return NULL;
 473 }
 474
 475 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 476                            int size,
 477                            const char *encoding,
 478                            const char *errors)
 479 {
 480     PyObject *v, *unicode;
 481
 482     unicode = PyUnicode_FromUnicode(s, size);
 483     if (unicode == NULL)
 484         return NULL;
 485     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 486     Py_DECREF(unicode);
 487     return v;
 488 }
 489
 490 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 491                                     const char *encoding,
 492                                     const char *errors)
 493 {
 494     PyObject *v;
 495
 496     if (!PyUnicode_Check(unicode)) {
 497         PyErr_BadArgument();
 498         goto onError;
 499     }
 500
 501     if (encoding == NULL)
 502         encoding = PyUnicode_GetDefaultEncoding();
 503
 504     /* Shortcuts for common default encodings */
 505     if (errors == NULL) {
 506         if (strcmp(encoding, "utf-8") == 0)
 507         return PyUnicode_AsUTF8String(unicode);
 508         else if (strcmp(encoding, "latin-1") == 0)
 509             return PyUnicode_AsLatin1String(unicode);
 510         else if (strcmp(encoding, "ascii") == 0)
 511             return PyUnicode_AsASCIIString(unicode);
 512     }
 513
 514     /* Encode via the codec registry */
 515     v = PyCodec_Encode(unicode, encoding, errors);
 516     if (v == NULL)
 517         goto onError;
 518     /* XXX Should we really enforce this ? */
 519     if (!PyString_Check(v)) {
 520         PyErr_Format(PyExc_TypeError,
 521                      "encoder did not return a string object (type=%.400s)",
 522                      v->ob_type->tp_name);
 523         Py_DECREF(v);
 524         goto onError;
 525     }
 526     return v;
 527
 528  onError:
 529     return NULL;
 530 }
 531
 532 /* Return a Python string holding the default encoded value of the
 533    Unicode object.
 534
 535    The resulting string is cached in the Unicode object for subsequent
 536    usage by this function. The cached version is needed to implement
 537    the character buffer interface and will live (at least) as long as
 538    the Unicode object itself.
 539
 540    The refcount of the string is *not* incremented.
 541
 542    *** Exported for internal use by the interpreter only !!! ***
 543
 544 */
 545
 546 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 547                                             const char *errors)
 548 {
 549     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 550
 551     if (v)
 552         return v;
 553     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 554     if (v && errors == NULL)
 555         ((PyUnicodeObject *)unicode)->defenc = v;
 556     return v;
 557 }
 558
 559 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 560 {
 561     if (!PyUnicode_Check(unicode)) {
 562         PyErr_BadArgument();
 563         goto onError;
 564     }
 565     return PyUnicode_AS_UNICODE(unicode);
 566
 567  onError:
 568     return NULL;
 569 }
 570
 571 int PyUnicode_GetSize(PyObject *unicode)
 572 {
 573     if (!PyUnicode_Check(unicode)) {
 574         PyErr_BadArgument();
 575         goto onError;
 576     }
 577     return PyUnicode_GET_SIZE(unicode);
 578
 579  onError:
 580     return -1;
 581 }
 582
 583 const char *PyUnicode_GetDefaultEncoding(void)
 584 {
 585     return unicode_default_encoding;
 586 }
 587
 588 int PyUnicode_SetDefaultEncoding(const char *encoding)
 589 {
 590     PyObject *v;
 591
 592     /* Make sure the encoding is valid. As side effect, this also
 593        loads the encoding into the codec registry cache. */
 594     v = _PyCodec_Lookup(encoding);
 595     if (v == NULL)
 596         goto onError;
 597     Py_DECREF(v);
 598     strncpy(unicode_default_encoding,
 599             encoding,
 600             sizeof(unicode_default_encoding));
 601     return 0;
 602
 603  onError:
 604     return -1;
 605 }
 606
 607 /* --- UTF-8 Codec -------------------------------------------------------- */
 608
 609 static
 610 char utf8_code_length[256] = {
 611     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 612        illegal prefix.  see RFC 2279 for details */
 613     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 614     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 615     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 616     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 617     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 618     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 619     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 620     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 621     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 622     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 623     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 624     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 625     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 626     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 627     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 628     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 629 };
 630
 631 static
 632 int utf8_decoding_error(const char **source,
 633                         Py_UNICODE **dest,
 634                         const char *errors,
 635                         const char *details)
 636 {
 637     if ((errors == NULL) ||
 638         (strcmp(errors,"strict") == 0)) {
 639         PyErr_Format(PyExc_UnicodeError,
 640                      "UTF-8 decoding error: %.400s",
 641                      details);
 642         return -1;
 643     }
 644     else if (strcmp(errors,"ignore") == 0) {
 645         (*source)++;
 646         return 0;
 647     }
 648     else if (strcmp(errors,"replace") == 0) {
 649         (*source)++;
 650         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 651         (*dest)++;
 652         return 0;
 653     }
 654     else {
 655         PyErr_Format(PyExc_ValueError,
 656                      "UTF-8 decoding error; unknown error handling code: %.400s",
 657                      errors);
 658         return -1;
 659     }
 660 }
 661
 662 PyObject *PyUnicode_DecodeUTF8(const char *s,
 663                                int size,
 664                                const char *errors)
 665 {
 666     int n;
 667     const char *e;
 668     PyUnicodeObject *unicode;
 669     Py_UNICODE *p;
 670     const char *errmsg = "";
 671
 672     /* Note: size will always be longer than the resulting Unicode
 673        character count */
 674     unicode = _PyUnicode_New(size);
 675     if (!unicode)
 676         return NULL;
 677     if (size == 0)
 678         return (PyObject *)unicode;
 679
 680     /* Unpack UTF-8 encoded data */
 681     p = unicode->str;
 682     e = s + size;
 683
 684     while (s < e) {
 685         Py_UCS4 ch = (unsigned char)*s;
 686
 687         if (ch < 0x80) {
 688             *p++ = (Py_UNICODE)ch;
 689             s++;
 690             continue;
 691         }
 692
 693         n = utf8_code_length[ch];
 694
 695         if (s + n > e) {
 696             errmsg = "unexpected end of data";
 697             goto utf8Error;
 698         }
 699
 700         switch (n) {
 701
 702         case 0:
 703             errmsg = "unexpected code byte";
 704             goto utf8Error;
 705             break;
 706
 707         case 1:
 708             errmsg = "internal error";
 709             goto utf8Error;
 710             break;
 711
 712         case 2:
 713             if ((s[1] & 0xc0) != 0x80) {
 714                 errmsg = "invalid data";
 715                 goto utf8Error;
 716             }
 717             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
 718             if (ch < 0x80) {
 719                 errmsg = "illegal encoding";
 720                 goto utf8Error;
 721             }
 722             else
 723                 *p++ = (Py_UNICODE)ch;
 724             break;
 725
 726         case 3:
 727             if ((s[1] & 0xc0) != 0x80 ||
 728                 (s[2] & 0xc0) != 0x80) {
 729                 errmsg = "invalid data";
 730                 goto utf8Error;
 731             }
 732             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
 733             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
 734                 errmsg = "illegal encoding";
 735                 goto utf8Error;
 736             }
 737             else
 738                                 *p++ = (Py_UNICODE)ch;
 739             break;
 740
 741         case 4:
 742             if ((s[1] & 0xc0) != 0x80 ||
 743                 (s[2] & 0xc0) != 0x80 ||
 744                 (s[3] & 0xc0) != 0x80) {
 745                 errmsg = "invalid data";
 746                 goto utf8Error;
 747             }
 748             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
 749                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
 750             /* validate and convert to UTF-16 */
 751             if ((ch < 0x10000) ||   /* minimum value allowed for 4
 752                                        byte encoding */
 753                 (ch > 0x10ffff)) {  /* maximum value allowed for
 754                                        UTF-16 */
 755                 errmsg = "illegal encoding";
 756                 goto utf8Error;
 757             }
 758             /*  compute and append the two surrogates: */
 759
 760             /*  translate from 10000..10FFFF to 0..FFFF */
 761             ch -= 0x10000;
 762
 763             /*  high surrogate = top 10 bits added to D800 */
 764             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
 765
 766             /*  low surrogate = bottom 10 bits added to DC00 */
 767             *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
 768             break;
 769
 770         default:
 771             /* Other sizes are only needed for UCS-4 */
 772             errmsg = "unsupported Unicode code range";
 773             goto utf8Error;
 774             break;
 775         }
 776         s += n;
 777         continue;
 778
 779     utf8Error:
 780       if (utf8_decoding_error(&s, &p, errors, errmsg))
 781           goto onError;
 782     }
 783
 784     /* Adjust length */
 785     if (_PyUnicode_Resize(unicode, p - unicode->str))
 786         goto onError;
 787
 788     return (PyObject *)unicode;
 789
 790 onError:
 791     Py_DECREF(unicode);
 792     return NULL;
 793 }
 794
 795 /* Not used anymore, now that the encoder supports UTF-16
 796    surrogates. */
 797 #if 0
 798 static
 799 int utf8_encoding_error(const Py_UNICODE **source,
 800                         char **dest,
 801                         const char *errors,
 802                         const char *details)
 803 {
 804     if ((errors == NULL) ||
 805         (strcmp(errors,"strict") == 0)) {
 806         PyErr_Format(PyExc_UnicodeError,
 807                      "UTF-8 encoding error: %.400s",
 808                      details);
 809         return -1;
 810     }
 811     else if (strcmp(errors,"ignore") == 0) {
 812         return 0;
 813     }
 814     else if (strcmp(errors,"replace") == 0) {
 815         **dest = '?';
 816         (*dest)++;
 817         return 0;
 818     }
 819     else {
 820         PyErr_Format(PyExc_ValueError,
 821                      "UTF-8 encoding error; "
 822                      "unknown error handling code: %.400s",
 823                      errors);
 824         return -1;
 825     }
 826 }
 827 #endif
 828
 829 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 830                                int size,
 831                                const char *errors)
 832 {
 833     PyObject *v;
 834     char *p;
 835     char *q;
 836     Py_UCS4 ch2;
 837     unsigned int cbAllocated = 3 * size;
 838     unsigned int cbWritten = 0;
 839     int i = 0;
 840
 841     v = PyString_FromStringAndSize(NULL, cbAllocated);
 842     if (v == NULL)
 843         return NULL;
 844     if (size == 0)
 845         return v;
 846
 847     p = q = PyString_AS_STRING(v);
 848     while (i < size) {
 849         Py_UCS4 ch = s[i++];
 850         if (ch < 0x80) {
 851             *p++ = (char) ch;
 852             cbWritten++;
 853         }
 854         else if (ch < 0x0800) {
 855             *p++ = 0xc0 | (ch >> 6);
 856             *p++ = 0x80 | (ch & 0x3f);
 857             cbWritten += 2;
 858         }
 859         else {
 860             /* Check for high surrogate */
 861             if (0xD800 <= ch && ch <= 0xDBFF) {
 862                 if (i != size) {
 863                     ch2 = s[i];
 864                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 865
 866                         if (cbWritten >= (cbAllocated - 4)) {
 867                             /* Provide enough room for some more
 868                                surrogates */
 869                             cbAllocated += 4*10;
 870                             if (_PyString_Resize(&v, cbAllocated))
 871                                 goto onError;
 872                         }
 873
 874                         /* combine the two values */
 875                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
 876
 877                         *p++ = (char)((ch >> 18) | 0xf0);
 878                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
 879                         i++;
 880                         cbWritten += 4;
 881                     }
 882                 }
 883             }
 884             else {
 885                 *p++ = (char)(0xe0 | (ch >> 12));
 886                 cbWritten += 3;
 887             }
 888             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
 889             *p++ = (char)(0x80 | (ch & 0x3f));
 890         }
 891     }
 892     *p = '\0';
 893     if (_PyString_Resize(&v, p - q))
 894         goto onError;
 895     return v;
 896
 897  onError:
 898     Py_DECREF(v);
 899     return NULL;
 900 }
 901
 902 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 903 {
 904     PyObject *str;
 905
 906     if (!PyUnicode_Check(unicode)) {
 907         PyErr_BadArgument();
 908         return NULL;
 909     }
 910     str = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
 911                                PyUnicode_GET_SIZE(unicode),
 912                                NULL);
 913     if (str == NULL)
 914         return NULL;
 915     Py_INCREF(str);
 916     return str;
 917 }
 918
 919 /* --- UTF-16 Codec ------------------------------------------------------- */
 920
 921 static
 922 int utf16_decoding_error(const Py_UNICODE **source,
 923                          Py_UNICODE **dest,
 924                          const char *errors,
 925                          const char *details)
 926 {
 927     if ((errors == NULL) ||
 928         (strcmp(errors,"strict") == 0)) {
 929         PyErr_Format(PyExc_UnicodeError,
 930                      "UTF-16 decoding error: %.400s",
 931                      details);
 932         return -1;
 933     }
 934     else if (strcmp(errors,"ignore") == 0) {
 935         return 0;
 936     }
 937     else if (strcmp(errors,"replace") == 0) {
 938         if (dest) {
 939             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 940             (*dest)++;
 941         }
 942         return 0;
 943     }
 944     else {
 945         PyErr_Format(PyExc_ValueError,
 946                      "UTF-16 decoding error; "
 947                      "unknown error handling code: %.400s",
 948                      errors);
 949         return -1;
 950     }
 951 }
 952
 953 PyObject *PyUnicode_DecodeUTF16(const char *s,
 954                                 int size,
 955                                 const char *errors,
 956                                 int *byteorder)
 957 {
 958     PyUnicodeObject *unicode;
 959     Py_UNICODE *p;
 960     const Py_UNICODE *q, *e;
 961     int bo = 0;
 962     const char *errmsg = "";
 963
 964     /* size should be an even number */
 965     if (size % sizeof(Py_UNICODE) != 0) {
 966         if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
 967             return NULL;
 968         /* The remaining input chars are ignored if we fall through
 969            here... */
 970     }
 971
 972     /* Note: size will always be longer than the resulting Unicode
 973        character count */
 974     unicode = _PyUnicode_New(size);
 975     if (!unicode)
 976         return NULL;
 977     if (size == 0)
 978         return (PyObject *)unicode;
 979
 980     /* Unpack UTF-16 encoded data */
 981     p = unicode->str;
 982     q = (Py_UNICODE *)s;
 983     e = q + (size / sizeof(Py_UNICODE));
 984
 985     if (byteorder)
 986         bo = *byteorder;
 987
 988     while (q < e) {
 989         register Py_UNICODE ch = *q++;
 990
 991         /* Check for BOM marks (U+FEFF) in the input and adjust
 992            current byte order setting accordingly. Swap input
 993            bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
 994            !) */
 995 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
 996         if (ch == 0xFEFF) {
 997             bo = -1;
 998             continue;
 999         } else if (ch == 0xFFFE) {
1000             bo = 1;
1001             continue;
1002         }
1003         if (bo == 1)
1004             ch = (ch >> 8) | (ch << 8);
1005 #else
1006         if (ch == 0xFEFF) {
1007             bo = 1;
1008             continue;
1009         } else if (ch == 0xFFFE) {
1010             bo = -1;
1011             continue;
1012         }
1013         if (bo == -1)
1014             ch = (ch >> 8) | (ch << 8);
1015 #endif
1016         if (ch < 0xD800 || ch > 0xDFFF) {
1017             *p++ = ch;
1018             continue;
1019         }
1020
1021         /* UTF-16 code pair: */
1022         if (q >= e) {
1023             errmsg = "unexpected end of data";
1024             goto utf16Error;
1025         }
1026         if (0xDC00 <= *q && *q <= 0xDFFF) {
1027             q++;
1028             if (0xD800 <= *q && *q <= 0xDBFF) {
1029                 /* This is valid data (a UTF-16 surrogate pair), but
1030                    we are not able to store this information since our
1031                    Py_UNICODE type only has 16 bits... this might
1032                    change someday, even though it's unlikely. */
1033                 errmsg = "code pairs are not supported";
1034                 goto utf16Error;
1035             }
1036             else
1037                 continue;
1038         }
1039         errmsg = "illegal encoding";
1040         /* Fall through to report the error */
1041
1042     utf16Error:
1043         if (utf16_decoding_error(&q, &p, errors, errmsg))
1044             goto onError;
1045     }
1046
1047     if (byteorder)
1048         *byteorder = bo;
1049
1050     /* Adjust length */
1051     if (_PyUnicode_Resize(unicode, p - unicode->str))
1052         goto onError;
1053
1054     return (PyObject *)unicode;
1055
1056 onError:
1057     Py_DECREF(unicode);
1058     return NULL;
1059 }
1060
1061 #undef UTF16_ERROR
1062
1063 PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1064                                 int size,
1065                                 const char *errors,
1066                                 int byteorder)
1067 {
1068     PyObject *v;
1069     Py_UNICODE *p;
1070     char *q;
1071
1072     /* We don't create UTF-16 pairs... */
1073     v = PyString_FromStringAndSize(NULL,
1074                         sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1075     if (v == NULL)
1076         return NULL;
1077
1078     q = PyString_AS_STRING(v);
1079     p = (Py_UNICODE *)q;
1080     if (byteorder == 0)
1081         *p++ = 0xFEFF;
1082     if (size == 0)
1083         return v;
1084     if (byteorder == 0 ||
1085 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1086         byteorder == -1
1087 #else
1088         byteorder == 1
1089 #endif
1090         )
1091         memcpy(p, s, size * sizeof(Py_UNICODE));
1092     else
1093         while (size-- > 0) {
1094             Py_UNICODE ch = *s++;
1095             *p++ = (ch >> 8) | (ch << 8);
1096         }
1097     return v;
1098 }
1099
1100 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1101 {
1102     if (!PyUnicode_Check(unicode)) {
1103         PyErr_BadArgument();
1104         return NULL;
1105     }
1106     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1107                                  PyUnicode_GET_SIZE(unicode),
1108                                  NULL,
1109                                  0);
1110 }
1111
1112 /* --- Unicode Escape Codec ----------------------------------------------- */
1113
1114 static
1115 int unicodeescape_decoding_error(const char **source,
1116                                  Py_UNICODE *x,
1117                                  const char *errors,
1118                                  const char *details)
1119 {
1120     if ((errors == NULL) ||
1121         (strcmp(errors,"strict") == 0)) {
1122         PyErr_Format(PyExc_UnicodeError,
1123                      "Unicode-Escape decoding error: %.400s",
1124                      details);
1125         return -1;
1126     }
1127     else if (strcmp(errors,"ignore") == 0) {
1128         return 0;
1129     }
1130     else if (strcmp(errors,"replace") == 0) {
1131         *x = Py_UNICODE_REPLACEMENT_CHARACTER;
1132         return 0;
1133     }
1134     else {
1135         PyErr_Format(PyExc_ValueError,
1136                      "Unicode-Escape decoding error; "
1137                      "unknown error handling code: %.400s",
1138                      errors);
1139         return -1;
1140     }
1141 }
1142
/* Unicode character name lookup API used for \N{...} escapes.  Stays NULL
   until PyUnicode_DecodeUnicodeEscape() first meets a \N escape and
   fetches the "ucnhashAPI" object from the "ucnhash" module. */
static _Py_UCNHashAPI *pucnHash = NULL;
1144
/* Case-insensitively compare at most count characters of s1 and s2.

   Returns 0 when the compared characters are equal, otherwise the
   difference between the first pair of lowercased characters that differ
   (negative if s1 sorts first, positive if s2 does).  The comparison also
   stops once both strings reach their NUL terminator at the same
   position, so neither string is read past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        /* The <ctype.h> functions are only defined for unsigned char
           values (and EOF); a plain char may be negative. */
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1164
1165 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1166                                         int size,
1167                                         const char *errors)
1168 {
1169     PyUnicodeObject *v;
1170     Py_UNICODE *p = NULL, *buf = NULL;
1171     const char *end;
1172
1173     /* Escaped strings will always be longer than the resulting
1174        Unicode string, so we start with size here and then reduce the
1175        length after conversion to the true value. */
1176     v = _PyUnicode_New(size);
1177     if (v == NULL)
1178         goto onError;
1179     if (size == 0)
1180         return (PyObject *)v;
1181     p = buf = PyUnicode_AS_UNICODE(v);
1182     end = s + size;
1183     while (s < end) {
1184         unsigned char c;
1185         Py_UNICODE x;
1186         int i;
1187
1188         /* Non-escape characters are interpreted as Unicode ordinals */
1189         if (*s != '\\') {
1190             *p++ = (unsigned char)*s++;
1191             continue;
1192         }
1193
1194         /* \ - Escapes */
1195         s++;
1196         switch (*s++) {
1197
1198         /* \x escapes */
1199         case '\n': break;
1200         case '\\': *p++ = '\\'; break;
1201         case '\'': *p++ = '\''; break;
1202         case '\"': *p++ = '\"'; break;
1203         case 'b': *p++ = '\b'; break;
1204         case 'f': *p++ = '\014'; break; /* FF */
1205         case 't': *p++ = '\t'; break;
1206         case 'n': *p++ = '\n'; break;
1207         case 'r': *p++ = '\r'; break;
1208         case 'v': *p++ = '\013'; break; /* VT */
1209         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1210
1211         /* \OOO (octal) escapes */
1212         case '0': case '1': case '2': case '3':
1213         case '4': case '5': case '6': case '7':
1214             x = s[-1] - '0';
1215             if ('0' <= *s && *s <= '7') {
1216                 x = (x<<3) + *s++ - '0';
1217                 if ('0' <= *s && *s <= '7')
1218                     x = (x<<3) + *s++ - '0';
1219             }
1220             *p++ = x;
1221             break;
1222
1223         /* \xXXXX escape with 1-n hex digits.  for compatibility
1224            with 8-bit strings, this code ignores all but the last
1225            two digits */
1226         case 'x':
1227             x = 0;
1228             c = (unsigned char)*s;
1229             if (isxdigit(c)) {
1230                 do {
1231                     x = (x<<4) & 0xF0;
1232                     if ('0' <= c && c <= '9')
1233                         x += c - '0';
1234                     else if ('a' <= c && c <= 'f')
1235                         x += 10 + c - 'a';
1236                     else
1237                         x += 10 + c - 'A';
1238                     c = (unsigned char)*++s;
1239                 } while (isxdigit(c));
1240                 *p++ = (unsigned char) x;
1241             } else {
1242                 *p++ = '\\';
1243                 *p++ = (unsigned char)s[-1];
1244             }
1245             break;
1246
1247         /* \uXXXX with 4 hex digits */
1248         case 'u':
1249             for (x = 0, i = 0; i < 4; i++) {
1250                 c = (unsigned char)s[i];
1251                 if (!isxdigit(c)) {
1252                     if (unicodeescape_decoding_error(&s, &x, errors,
1253                                                      "truncated \\uXXXX"))
1254                         goto onError;
1255                     i++;
1256                     break;
1257                 }
1258                 x = (x<<4) & ~0xF;
1259                 if (c >= '0' && c <= '9')
1260                     x += c - '0';
1261                 else if (c >= 'a' && c <= 'f')
1262                     x += 10 + c - 'a';
1263                 else
1264                     x += 10 + c - 'A';
1265             }
1266             s += i;
1267             *p++ = x;
1268             break;
1269
1270         case 'N':
1271             /* Ok, we need to deal with Unicode Character Names now,
1272              * make sure we've imported the hash table data...
1273              */
1274             if (pucnHash == NULL)
1275             {
1276                 PyObject *mod = 0, *v = 0;
1277
1278                 mod = PyImport_ImportModule("ucnhash");
1279                 if (mod == NULL)
1280                     goto onError;
1281                 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1282                 Py_DECREF(mod);
1283                 if (v == NULL)
1284                 {
1285                     goto onError;
1286                 }
1287                 pucnHash = PyCObject_AsVoidPtr(v);
1288                 Py_DECREF(v);
1289                 if (pucnHash == NULL)
1290                 {
1291                     goto onError;
1292                 }
1293             }
1294
1295             if (*s == '{')
1296             {
1297                 const char *start = s + 1;
1298                 const char *endBrace = start;
1299                 Py_UCS4 value;
1300                 unsigned long j;
1301
1302                 /* look for either the closing brace, or we
1303                  * exceed the maximum length of the unicode character names
1304                  */
1305                 while (*endBrace != '}' &&
1306                        (unsigned int)(endBrace - start) <=
1307                            pucnHash->cchMax &&
1308                        endBrace < end)
1309                 {
1310                     endBrace++;
1311                 }
1312                 if (endBrace != end && *endBrace == '}')
1313                 {
1314                     j = pucnHash->hash(start, endBrace - start);
1315                     if (j > pucnHash->cKeys ||
1316                         mystrnicmp(
1317                             start,
1318                             ((_Py_UnicodeCharacterName *)
1319                              (pucnHash->getValue(j)))->pszUCN,
1320                             (int)(endBrace - start)) != 0)
1321                     {
1322                         if (unicodeescape_decoding_error(
1323                                 &s, &x, errors,
1324                                 "Invalid Unicode Character Name"))
1325                         {
1326                             goto onError;
1327                         }
1328                         goto ucnFallthrough;
1329                     }
1330                     value = ((_Py_UnicodeCharacterName *)
1331                                (pucnHash->getValue(j)))->value;
1332                     if (value < 1<<16)
1333                     {
1334                         /* In UCS-2 range, easy solution.. */
1335                         *p++ = value;
1336                     }
1337                     else
1338                     {
1339                         /* Oops, its in UCS-4 space, */
1340                         /*  compute and append the two surrogates: */
1341                         /*  translate from 10000..10FFFF to 0..FFFFF */
1342                         value -= 0x10000;
1343
1344                         /* high surrogate = top 10 bits added to D800 */
1345                         *p++ = 0xD800 + (value >> 10);
1346
1347                         /* low surrogate  = bottom 10 bits added to DC00 */
1348                         *p++ = 0xDC00 + (value & ~0xFC00);
1349                     }
1350                     s = endBrace + 1;
1351                 }
1352                 else
1353                 {
1354                     if (unicodeescape_decoding_error(
1355                             &s, &x, errors,
1356                             "Unicode name missing closing brace"))
1357                         goto onError;
1358                     goto ucnFallthrough;
1359                 }
1360                 break;
1361             }
1362             if (unicodeescape_decoding_error(
1363                     &s, &x, errors,
1364                     "Missing opening brace for Unicode Character Name escape"))
1365                 goto onError;
1366 ucnFallthrough:
1367             /* fall through on purpose */
1368                 default:
1369             *p++ = '\\';
1370             *p++ = (unsigned char)s[-1];
1371             break;
1372         }
1373     }
1374     if (_PyUnicode_Resize(v, (int)(p - buf)))
1375                 goto onError;
1376     return (PyObject *)v;
1377
1378  onError:
1379     Py_XDECREF(v);
1380     return NULL;
1381 }
1382
/* Return a Unicode-Escape string version of the Unicode object.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

   (This comment describes unicodeescape_string(), defined right after
   the forward declaration below.)

*/

/* Forward declaration: unicodeescape_string() calls findchar() to test
   whether a given quote character occurs in the string. */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
                                  int size,
                                  Py_UNICODE ch);
1393
1394 static
1395 PyObject *unicodeescape_string(const Py_UNICODE *s,
1396                                int size,
1397                                int quotes)
1398 {
1399     PyObject *repr;
1400     char *p;
1401     char *q;
1402
1403     static const char *hexdigit = "0123456789ABCDEF";
1404
1405     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1406     if (repr == NULL)
1407         return NULL;
1408
1409     p = q = PyString_AS_STRING(repr);
1410
1411     if (quotes) {
1412         *p++ = 'u';
1413         *p++ = (findchar(s, size, '\'') &&
1414                 !findchar(s, size, '"')) ? '"' : '\'';
1415     }
1416     while (size-- > 0) {
1417         Py_UNICODE ch = *s++;
1418         /* Escape quotes */
1419         if (quotes && (ch == q[1] || ch == '\\')) {
1420             *p++ = '\\';
1421             *p++ = (char) ch;
1422         }
1423         /* Map 16-bit characters to '\uxxxx' */
1424         else if (ch >= 256) {
1425             *p++ = '\\';
1426             *p++ = 'u';
1427             *p++ = hexdigit[(ch >> 12) & 0xf];
1428             *p++ = hexdigit[(ch >> 8) & 0xf];
1429             *p++ = hexdigit[(ch >> 4) & 0xf];
1430             *p++ = hexdigit[ch & 15];
1431         }
1432         /* Map non-printable US ASCII to '\ooo' */
1433         else if (ch < ' ' || ch >= 128) {
1434             *p++ = '\\';
1435             *p++ = hexdigit[(ch >> 6) & 7];
1436             *p++ = hexdigit[(ch >> 3) & 7];
1437             *p++ = hexdigit[ch & 7];
1438         }
1439         /* Copy everything else as-is */
1440         else
1441             *p++ = (char) ch;
1442     }
1443     if (quotes)
1444         *p++ = q[1];
1445
1446     *p = '\0';
1447     if (_PyString_Resize(&repr, p - q))
1448         goto onError;
1449
1450     return repr;
1451
1452  onError:
1453     Py_DECREF(repr);
1454     return NULL;
1455 }
1456
1457 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1458                                         int size)
1459 {
1460     return unicodeescape_string(s, size, 0);
1461 }
1462
1463 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1464 {
1465     if (!PyUnicode_Check(unicode)) {
1466         PyErr_BadArgument();
1467         return NULL;
1468     }
1469     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1470                                          PyUnicode_GET_SIZE(unicode));
1471 }
1472
1473 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1474
1475 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1476                                            int size,
1477                                            const char *errors)
1478 {
1479     PyUnicodeObject *v;
1480     Py_UNICODE *p, *buf;
1481     const char *end;
1482     const char *bs;
1483
1484     /* Escaped strings will always be longer than the resulting
1485        Unicode string, so we start with size here and then reduce the
1486        length after conversion to the true value. */
1487     v = _PyUnicode_New(size);
1488     if (v == NULL)
1489         goto onError;
1490     if (size == 0)
1491         return (PyObject *)v;
1492     p = buf = PyUnicode_AS_UNICODE(v);
1493     end = s + size;
1494     while (s < end) {
1495         unsigned char c;
1496         Py_UNICODE x;
1497         int i;
1498
1499         /* Non-escape characters are interpreted as Unicode ordinals */
1500         if (*s != '\\') {
1501             *p++ = (unsigned char)*s++;
1502             continue;
1503         }
1504
1505         /* \u-escapes are only interpreted iff the number of leading
1506            backslashes if odd */
1507         bs = s;
1508         for (;s < end;) {
1509             if (*s != '\\')
1510                 break;
1511             *p++ = (unsigned char)*s++;
1512         }
1513         if (((s - bs) & 1) == 0 ||
1514             s >= end ||
1515             *s != 'u') {
1516             continue;
1517         }
1518         p--;
1519         s++;
1520
1521         /* \uXXXX with 4 hex digits */
1522         for (x = 0, i = 0; i < 4; i++) {
1523             c = (unsigned char)s[i];
1524             if (!isxdigit(c)) {
1525                 if (unicodeescape_decoding_error(&s, &x, errors,
1526                                                  "truncated \\uXXXX"))
1527                     goto onError;
1528                 i++;
1529                 break;
1530             }
1531             x = (x<<4) & ~0xF;
1532             if (c >= '0' && c <= '9')
1533                 x += c - '0';
1534             else if (c >= 'a' && c <= 'f')
1535                 x += 10 + c - 'a';
1536             else
1537                 x += 10 + c - 'A';
1538         }
1539         s += i;
1540         *p++ = x;
1541     }
1542     if (_PyUnicode_Resize(v, (int)(p - buf)))
1543         goto onError;
1544     return (PyObject *)v;
1545
1546  onError:
1547     Py_XDECREF(v);
1548     return NULL;
1549 }
1550
1551 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1552                                            int size)
1553 {
1554     PyObject *repr;
1555     char *p;
1556     char *q;
1557
1558     static const char *hexdigit = "0123456789ABCDEF";
1559
1560     repr = PyString_FromStringAndSize(NULL, 6 * size);
1561     if (repr == NULL)
1562         return NULL;
1563     if (size == 0)
1564         return repr;
1565
1566     p = q = PyString_AS_STRING(repr);
1567     while (size-- > 0) {
1568         Py_UNICODE ch = *s++;
1569         /* Map 16-bit characters to '\uxxxx' */
1570         if (ch >= 256) {
1571             *p++ = '\\';
1572             *p++ = 'u';
1573             *p++ = hexdigit[(ch >> 12) & 0xf];
1574             *p++ = hexdigit[(ch >> 8) & 0xf];
1575             *p++ = hexdigit[(ch >> 4) & 0xf];
1576             *p++ = hexdigit[ch & 15];
1577         }
1578         /* Copy everything else as-is */
1579         else
1580             *p++ = (char) ch;
1581     }
1582     *p = '\0';
1583     if (_PyString_Resize(&repr, p - q))
1584         goto onError;
1585
1586     return repr;
1587
1588  onError:
1589     Py_DECREF(repr);
1590     return NULL;
1591 }
1592
1593 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1594 {
1595     if (!PyUnicode_Check(unicode)) {
1596         PyErr_BadArgument();
1597         return NULL;
1598     }
1599     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1600                                             PyUnicode_GET_SIZE(unicode));
1601 }
1602
1603 /* --- Latin-1 Codec ------------------------------------------------------ */
1604
1605 PyObject *PyUnicode_DecodeLatin1(const char *s,
1606                                  int size,
1607                                  const char *errors)
1608 {
1609     PyUnicodeObject *v;
1610     Py_UNICODE *p;
1611
1612     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1613     v = _PyUnicode_New(size);
1614     if (v == NULL)
1615         goto onError;
1616     if (size == 0)
1617         return (PyObject *)v;
1618     p = PyUnicode_AS_UNICODE(v);
1619     while (size-- > 0)
1620         *p++ = (unsigned char)*s++;
1621     return (PyObject *)v;
1622
1623  onError:
1624     Py_XDECREF(v);
1625     return NULL;
1626 }
1627
1628 static
1629 int latin1_encoding_error(const Py_UNICODE **source,
1630                           char **dest,
1631                           const char *errors,
1632                           const char *details)
1633 {
1634     if ((errors == NULL) ||
1635         (strcmp(errors,"strict") == 0)) {
1636         PyErr_Format(PyExc_UnicodeError,
1637                      "Latin-1 encoding error: %.400s",
1638                      details);
1639         return -1;
1640     }
1641     else if (strcmp(errors,"ignore") == 0) {
1642         return 0;
1643     }
1644     else if (strcmp(errors,"replace") == 0) {
1645         **dest = '?';
1646         (*dest)++;
1647         return 0;
1648     }
1649     else {
1650         PyErr_Format(PyExc_ValueError,
1651                      "Latin-1 encoding error; "
1652                      "unknown error handling code: %.400s",
1653                      errors);
1654         return -1;
1655     }
1656 }
1657
1658 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1659                                  int size,
1660                                  const char *errors)
1661 {
1662     PyObject *repr;
1663     char *s, *start;
1664
1665     repr = PyString_FromStringAndSize(NULL, size);
1666     if (repr == NULL)
1667         return NULL;
1668     if (size == 0)
1669         return repr;
1670
1671     s = PyString_AS_STRING(repr);
1672     start = s;
1673     while (size-- > 0) {
1674         Py_UNICODE ch = *p++;
1675         if (ch >= 256) {
1676             if (latin1_encoding_error(&p, &s, errors,
1677                                       "ordinal not in range(256)"))
1678                 goto onError;
1679         }
1680         else
1681             *s++ = (char)ch;
1682     }
1683     /* Resize if error handling skipped some characters */
1684     if (s - start < PyString_GET_SIZE(repr))
1685         if (_PyString_Resize(&repr, s - start))
1686             goto onError;
1687     return repr;
1688
1689  onError:
1690     Py_DECREF(repr);
1691     return NULL;
1692 }
1693
1694 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1695 {
1696     if (!PyUnicode_Check(unicode)) {
1697         PyErr_BadArgument();
1698         return NULL;
1699     }
1700     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1701                                   PyUnicode_GET_SIZE(unicode),
1702                                   NULL);
1703 }
1704
1705 /* --- 7-bit ASCII Codec -------------------------------------------------- */
1706
1707 static
1708 int ascii_decoding_error(const char **source,
1709                          Py_UNICODE **dest,
1710                          const char *errors,
1711                          const char *details)
1712 {
1713     if ((errors == NULL) ||
1714         (strcmp(errors,"strict") == 0)) {
1715         PyErr_Format(PyExc_UnicodeError,
1716                      "ASCII decoding error: %.400s",
1717                      details);
1718         return -1;
1719     }
1720     else if (strcmp(errors,"ignore") == 0) {
1721         return 0;
1722     }
1723     else if (strcmp(errors,"replace") == 0) {
1724         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1725         (*dest)++;
1726         return 0;
1727     }
1728     else {
1729         PyErr_Format(PyExc_ValueError,
1730                      "ASCII decoding error; "
1731                      "unknown error handling code: %.400s",
1732                      errors);
1733         return -1;
1734     }
1735 }
1736
1737 PyObject *PyUnicode_DecodeASCII(const char *s,
1738                                 int size,
1739                                 const char *errors)
1740 {
1741     PyUnicodeObject *v;
1742     Py_UNICODE *p;
1743
1744     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1745     v = _PyUnicode_New(size);
1746     if (v == NULL)
1747         goto onError;
1748     if (size == 0)
1749         return (PyObject *)v;
1750     p = PyUnicode_AS_UNICODE(v);
1751     while (size-- > 0) {
1752         register unsigned char c;
1753
1754         c = (unsigned char)*s++;
1755         if (c < 128)
1756             *p++ = c;
1757         else if (ascii_decoding_error(&s, &p, errors,
1758                                       "ordinal not in range(128)"))
1759                 goto onError;
1760     }
1761     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1762         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1763             goto onError;
1764     return (PyObject *)v;
1765
1766  onError:
1767     Py_XDECREF(v);
1768     return NULL;
1769 }
1770
1771 static
1772 int ascii_encoding_error(const Py_UNICODE **source,
1773                          char **dest,
1774                          const char *errors,
1775                          const char *details)
1776 {
1777     if ((errors == NULL) ||
1778         (strcmp(errors,"strict") == 0)) {
1779         PyErr_Format(PyExc_UnicodeError,
1780                      "ASCII encoding error: %.400s",
1781                      details);
1782         return -1;
1783     }
1784     else if (strcmp(errors,"ignore") == 0) {
1785         return 0;
1786     }
1787     else if (strcmp(errors,"replace") == 0) {
1788         **dest = '?';
1789         (*dest)++;
1790         return 0;
1791     }
1792     else {
1793         PyErr_Format(PyExc_ValueError,
1794                      "ASCII encoding error; "
1795                      "unknown error handling code: %.400s",
1796                      errors);
1797         return -1;
1798     }
1799 }
1800
1801 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1802                                 int size,
1803                                 const char *errors)
1804 {
1805     PyObject *repr;
1806     char *s, *start;
1807
1808     repr = PyString_FromStringAndSize(NULL, size);
1809     if (repr == NULL)
1810         return NULL;
1811     if (size == 0)
1812         return repr;
1813
1814     s = PyString_AS_STRING(repr);
1815     start = s;
1816     while (size-- > 0) {
1817         Py_UNICODE ch = *p++;
1818         if (ch >= 128) {
1819             if (ascii_encoding_error(&p, &s, errors,
1820                                       "ordinal not in range(128)"))
1821                 goto onError;
1822         }
1823         else
1824             *s++ = (char)ch;
1825     }
1826     /* Resize if error handling skipped some characters */
1827     if (s - start < PyString_GET_SIZE(repr))
1828         if (_PyString_Resize(&repr, s - start))
1829             goto onError;
1830     return repr;
1831
1832  onError:
1833     Py_DECREF(repr);
1834     return NULL;
1835 }
1836
1837 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1838 {
1839     if (!PyUnicode_Check(unicode)) {
1840         PyErr_BadArgument();
1841         return NULL;
1842     }
1843     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1844                                  PyUnicode_GET_SIZE(unicode),
1845                                  NULL);
1846 }
1847
1848 #ifdef MS_WIN32
1849
1850 /* --- MBCS codecs for Windows -------------------------------------------- */
1851
1852 PyObject *PyUnicode_DecodeMBCS(const char *s,
1853                                 int size,
1854                                 const char *errors)
1855 {
1856     PyUnicodeObject *v;
1857     Py_UNICODE *p;
1858
1859     /* First get the size of the result */
1860     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1861     if (size > 0 && usize==0)
1862         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1863
1864     v = _PyUnicode_New(usize);
1865     if (v == NULL)
1866         return NULL;
1867     if (usize == 0)
1868         return (PyObject *)v;
1869     p = PyUnicode_AS_UNICODE(v);
1870     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1871         Py_DECREF(v);
1872         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1873     }
1874
1875     return (PyObject *)v;
1876 }
1877
1878 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1879                                 int size,
1880                                 const char *errors)
1881 {
1882     PyObject *repr;
1883     char *s;
1884     DWORD mbcssize;
1885
1886     /* If there are no characters, bail now! */
1887     if (size==0)
1888             return PyString_FromString("");
1889
1890     /* First get the size of the result */
1891     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1892     if (mbcssize==0)
1893         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1894
1895     repr = PyString_FromStringAndSize(NULL, mbcssize);
1896     if (repr == NULL)
1897         return NULL;
1898     if (mbcssize == 0)
1899         return repr;
1900
1901     /* Do the conversion */
1902     s = PyString_AS_STRING(repr);
1903     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1904         Py_DECREF(repr);
1905         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1906     }
1907     return repr;
1908 }
1909
1910 #endif /* MS_WIN32 */
1911
1912 /* --- Character Mapping Codec -------------------------------------------- */
1913
1914 static
1915 int charmap_decoding_error(const char **source,
1916                          Py_UNICODE **dest,
1917                          const char *errors,
1918                          const char *details)
1919 {
1920     if ((errors == NULL) ||
1921         (strcmp(errors,"strict") == 0)) {
1922         PyErr_Format(PyExc_UnicodeError,
1923                      "charmap decoding error: %.400s",
1924                      details);
1925         return -1;
1926     }
1927     else if (strcmp(errors,"ignore") == 0) {
1928         return 0;
1929     }
1930     else if (strcmp(errors,"replace") == 0) {
1931         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1932         (*dest)++;
1933         return 0;
1934     }
1935     else {
1936         PyErr_Format(PyExc_ValueError,
1937                      "charmap decoding error; "
1938                      "unknown error handling code: %.400s",
1939                      errors);
1940         return -1;
1941     }
1942 }
1943
1944 PyObject *PyUnicode_DecodeCharmap(const char *s,
1945                                   int size,
1946                                   PyObject *mapping,
1947                                   const char *errors)
1948 {
1949     PyUnicodeObject *v;
1950     Py_UNICODE *p;
1951
1952     /* Default to Latin-1 */
1953     if (mapping == NULL)
1954         return PyUnicode_DecodeLatin1(s, size, errors);
1955
1956     v = _PyUnicode_New(size);
1957     if (v == NULL)
1958         goto onError;
1959     if (size == 0)
1960         return (PyObject *)v;
1961     p = PyUnicode_AS_UNICODE(v);
1962     while (size-- > 0) {
1963         unsigned char ch = *s++;
1964         PyObject *w, *x;
1965
1966         /* Get mapping (char ordinal -> integer, Unicode char or None) */
1967         w = PyInt_FromLong((long)ch);
1968         if (w == NULL)
1969             goto onError;
1970         x = PyObject_GetItem(mapping, w);
1971         Py_DECREF(w);
1972         if (x == NULL) {
1973             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1974                 /* No mapping found: default to Latin-1 mapping */
1975                 PyErr_Clear();
1976                 *p++ = (Py_UNICODE)ch;
1977                 continue;
1978             }
1979             goto onError;
1980         }
1981
1982         /* Apply mapping */
1983         if (PyInt_Check(x)) {
1984             long value = PyInt_AS_LONG(x);
1985             if (value < 0 || value > 65535) {
1986                 PyErr_SetString(PyExc_TypeError,
1987                                 "character mapping must be in range(65536)");
1988                 Py_DECREF(x);
1989                 goto onError;
1990             }
1991             *p++ = (Py_UNICODE)value;
1992         }
1993         else if (x == Py_None) {
1994             /* undefined mapping */
1995             if (charmap_decoding_error(&s, &p, errors,
1996                                        "character maps to <undefined>")) {
1997                 Py_DECREF(x);
1998                 goto onError;
1999             }
2000         }
2001         else if (PyUnicode_Check(x)) {
2002             if (PyUnicode_GET_SIZE(x) != 1) {
2003                 /* 1-n mapping */
2004                 PyErr_SetString(PyExc_NotImplementedError,
2005                                 "1-n mappings are currently not implemented");
2006                 Py_DECREF(x);
2007                 goto onError;
2008             }
2009             *p++ = *PyUnicode_AS_UNICODE(x);
2010         }
2011         else {
2012             /* wrong return value */
2013             PyErr_SetString(PyExc_TypeError,
2014                   "character mapping must return integer, None or unicode");
2015             Py_DECREF(x);
2016             goto onError;
2017         }
2018         Py_DECREF(x);
2019     }
2020     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2021         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2022             goto onError;
2023     return (PyObject *)v;
2024
2025  onError:
2026     Py_XDECREF(v);
2027     return NULL;
2028 }
2029
2030 static
2031 int charmap_encoding_error(const Py_UNICODE **source,
2032                            char **dest,
2033                            const char *errors,
2034                            const char *details)
2035 {
2036     if ((errors == NULL) ||
2037         (strcmp(errors,"strict") == 0)) {
2038         PyErr_Format(PyExc_UnicodeError,
2039                      "charmap encoding error: %.400s",
2040                      details);
2041         return -1;
2042     }
2043     else if (strcmp(errors,"ignore") == 0) {
2044         return 0;
2045     }
2046     else if (strcmp(errors,"replace") == 0) {
2047         **dest = '?';
2048         (*dest)++;
2049         return 0;
2050     }
2051     else {
2052         PyErr_Format(PyExc_ValueError,
2053                      "charmap encoding error; "
2054                      "unknown error handling code: %.400s",
2055                      errors);
2056         return -1;
2057     }
2058 }
2059
2060 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2061                                   int size,
2062                                   PyObject *mapping,
2063                                   const char *errors)
2064 {
2065     PyObject *v;
2066     char *s;
2067
2068     /* Default to Latin-1 */
2069     if (mapping == NULL)
2070         return PyUnicode_EncodeLatin1(p, size, errors);
2071
2072     v = PyString_FromStringAndSize(NULL, size);
2073     if (v == NULL)
2074         return NULL;
2075     if (size == 0)
2076         return v;
2077     s = PyString_AS_STRING(v);
2078     while (size-- > 0) {
2079         Py_UNICODE ch = *p++;
2080         PyObject *w, *x;
2081
2082         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083         w = PyInt_FromLong((long)ch);
2084         if (w == NULL)
2085             goto onError;
2086         x = PyObject_GetItem(mapping, w);
2087         Py_DECREF(w);
2088         if (x == NULL) {
2089             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2090                 /* No mapping found: default to Latin-1 mapping if possible */
2091                 PyErr_Clear();
2092                 if (ch < 256) {
2093                     *s++ = (char)ch;
2094                     continue;
2095                 }
2096                 else if (!charmap_encoding_error(&p, &s, errors,
2097                                      "missing character mapping"))
2098                     continue;
2099             }
2100             goto onError;
2101         }
2102
2103         /* Apply mapping */
2104         if (PyInt_Check(x)) {
2105             long value = PyInt_AS_LONG(x);
2106             if (value < 0 || value > 255) {
2107                 PyErr_SetString(PyExc_TypeError,
2108                                 "character mapping must be in range(256)");
2109                 Py_DECREF(x);
2110                 goto onError;
2111             }
2112             *s++ = (char)value;
2113         }
2114         else if (x == Py_None) {
2115             /* undefined mapping */
2116             if (charmap_encoding_error(&p, &s, errors,
2117                                        "character maps to <undefined>")) {
2118                 Py_DECREF(x);
2119                 goto onError;
2120             }
2121         }
2122         else if (PyString_Check(x)) {
2123             if (PyString_GET_SIZE(x) != 1) {
2124                 /* 1-n mapping */
2125                 PyErr_SetString(PyExc_NotImplementedError,
2126                       "1-n mappings are currently not implemented");
2127                 Py_DECREF(x);
2128                 goto onError;
2129             }
2130             *s++ = *PyString_AS_STRING(x);
2131         }
2132         else {
2133             /* wrong return value */
2134             PyErr_SetString(PyExc_TypeError,
2135                   "character mapping must return integer, None or unicode");
2136             Py_DECREF(x);
2137             goto onError;
2138         }
2139         Py_DECREF(x);
2140     }
2141     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2142         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2143             goto onError;
2144     return v;
2145
2146  onError:
2147     Py_DECREF(v);
2148     return NULL;
2149 }
2150
2151 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2152                                     PyObject *mapping)
2153 {
2154     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2155         PyErr_BadArgument();
2156         return NULL;
2157     }
2158     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2159                                    PyUnicode_GET_SIZE(unicode),
2160                                    mapping,
2161                                    NULL);
2162 }
2163
2164 static
2165 int translate_error(const Py_UNICODE **source,
2166                     Py_UNICODE **dest,
2167                     const char *errors,
2168                     const char *details)
2169 {
2170     if ((errors == NULL) ||
2171         (strcmp(errors,"strict") == 0)) {
2172         PyErr_Format(PyExc_UnicodeError,
2173                      "translate error: %.400s",
2174                      details);
2175         return -1;
2176     }
2177     else if (strcmp(errors,"ignore") == 0) {
2178         return 0;
2179     }
2180     else if (strcmp(errors,"replace") == 0) {
2181         **dest = '?';
2182         (*dest)++;
2183         return 0;
2184     }
2185     else {
2186         PyErr_Format(PyExc_ValueError,
2187                      "translate error; "
2188                      "unknown error handling code: %.400s",
2189                      errors);
2190         return -1;
2191     }
2192 }
2193
2194 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2195                                      int size,
2196                                      PyObject *mapping,
2197                                      const char *errors)
2198 {
2199     PyUnicodeObject *v;
2200     Py_UNICODE *p;
2201
2202     if (mapping == NULL) {
2203         PyErr_BadArgument();
2204         return NULL;
2205     }
2206
2207     /* Output will never be longer than input */
2208     v = _PyUnicode_New(size);
2209     if (v == NULL)
2210         goto onError;
2211     if (size == 0)
2212         goto done;
2213     p = PyUnicode_AS_UNICODE(v);
2214     while (size-- > 0) {
2215         Py_UNICODE ch = *s++;
2216         PyObject *w, *x;
2217
2218         /* Get mapping */
2219         w = PyInt_FromLong(ch);
2220         if (w == NULL)
2221             goto onError;
2222         x = PyObject_GetItem(mapping, w);
2223         Py_DECREF(w);
2224         if (x == NULL) {
2225             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2226                 /* No mapping found: default to 1-1 mapping */
2227                 PyErr_Clear();
2228                 *p++ = ch;
2229                 continue;
2230             }
2231             goto onError;
2232         }
2233
2234         /* Apply mapping */
2235         if (PyInt_Check(x))
2236             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2237         else if (x == Py_None) {
2238             /* undefined mapping */
2239             if (translate_error(&s, &p, errors,
2240                                 "character maps to <undefined>")) {
2241                 Py_DECREF(x);
2242                 goto onError;
2243             }
2244         }
2245         else if (PyUnicode_Check(x)) {
2246             if (PyUnicode_GET_SIZE(x) != 1) {
2247                 /* 1-n mapping */
2248                 PyErr_SetString(PyExc_NotImplementedError,
2249                                 "1-n mappings are currently not implemented");
2250                 Py_DECREF(x);
2251                 goto onError;
2252             }
2253             *p++ = *PyUnicode_AS_UNICODE(x);
2254         }
2255         else {
2256             /* wrong return value */
2257             PyErr_SetString(PyExc_TypeError,
2258                   "translate mapping must return integer, None or unicode");
2259             Py_DECREF(x);
2260             goto onError;
2261         }
2262         Py_DECREF(x);
2263     }
2264     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2265         if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2266             goto onError;
2267
2268  done:
2269     return (PyObject *)v;
2270
2271  onError:
2272     Py_XDECREF(v);
2273     return NULL;
2274 }
2275
2276 PyObject *PyUnicode_Translate(PyObject *str,
2277                               PyObject *mapping,
2278                               const char *errors)
2279 {
2280     PyObject *result;
2281
2282     str = PyUnicode_FromObject(str);
2283     if (str == NULL)
2284         goto onError;
2285     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2286                                         PyUnicode_GET_SIZE(str),
2287                                         mapping,
2288                                         errors);
2289     Py_DECREF(str);
2290     return result;
2291
2292  onError:
2293     Py_XDECREF(str);
2294     return NULL;
2295 }
2296
2297 /* --- Decimal Encoder ---------------------------------------------------- */
2298
2299 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2300                             int length,
2301                             char *output,
2302                             const char *errors)
2303 {
2304     Py_UNICODE *p, *end;
2305
2306     if (output == NULL) {
2307         PyErr_BadArgument();
2308         return -1;
2309     }
2310
2311     p = s;
2312     end = s + length;
2313     while (p < end) {
2314         register Py_UNICODE ch = *p++;
2315         int decimal;
2316
2317         if (Py_UNICODE_ISSPACE(ch)) {
2318             *output++ = ' ';
2319             continue;
2320         }
2321         decimal = Py_UNICODE_TODECIMAL(ch);
2322         if (decimal >= 0) {
2323             *output++ = '0' + decimal;
2324             continue;
2325         }
2326         if (0 < ch && ch < 256) {
2327             *output++ = (char)ch;
2328             continue;
2329         }
2330         /* All other characters are considered invalid */
2331         if (errors == NULL || strcmp(errors, "strict") == 0) {
2332             PyErr_SetString(PyExc_ValueError,
2333                             "invalid decimal Unicode string");
2334             goto onError;
2335         }
2336         else if (strcmp(errors, "ignore") == 0)
2337             continue;
2338         else if (strcmp(errors, "replace") == 0) {
2339             *output++ = '?';
2340             continue;
2341         }
2342     }
2343     /* 0-terminate the output string */
2344     *output++ = '\0';
2345     return 0;
2346
2347  onError:
2348     return -1;
2349 }
2350
2351 /* --- Helpers ------------------------------------------------------------ */
2352
2353 static
2354 int count(PyUnicodeObject *self,
2355           int start,
2356           int end,
2357           PyUnicodeObject *substring)
2358 {
2359     int count = 0;
2360
2361     if (substring->length == 0)
2362         return (end - start + 1);
2363
2364     end -= substring->length;
2365
2366     while (start <= end)
2367         if (Py_UNICODE_MATCH(self, start, substring)) {
2368             count++;
2369             start += substring->length;
2370         } else
2371             start++;
2372
2373     return count;
2374 }
2375
2376 int PyUnicode_Count(PyObject *str,
2377                     PyObject *substr,
2378                     int start,
2379                     int end)
2380 {
2381     int result;
2382
2383     str = PyUnicode_FromObject(str);
2384     if (str == NULL)
2385         return -1;
2386     substr = PyUnicode_FromObject(substr);
2387     if (substr == NULL) {
2388         Py_DECREF(str);
2389         return -1;
2390     }
2391
2392     result = count((PyUnicodeObject *)str,
2393                    start, end,
2394                    (PyUnicodeObject *)substr);
2395
2396     Py_DECREF(str);
2397     Py_DECREF(substr);
2398     return result;
2399 }
2400
2401 static
2402 int findstring(PyUnicodeObject *self,
2403                PyUnicodeObject *substring,
2404                int start,
2405                int end,
2406                int direction)
2407 {
2408     if (start < 0)
2409         start += self->length;
2410     if (start < 0)
2411         start = 0;
2412
2413     if (substring->length == 0)
2414         return start;
2415
2416     if (end > self->length)
2417         end = self->length;
2418     if (end < 0)
2419         end += self->length;
2420     if (end < 0)
2421         end = 0;
2422
2423     end -= substring->length;
2424
2425     if (direction < 0) {
2426         for (; end >= start; end--)
2427             if (Py_UNICODE_MATCH(self, end, substring))
2428                 return end;
2429     } else {
2430         for (; start <= end; start++)
2431             if (Py_UNICODE_MATCH(self, start, substring))
2432                 return start;
2433     }
2434
2435     return -1;
2436 }
2437
2438 int PyUnicode_Find(PyObject *str,
2439                    PyObject *substr,
2440                    int start,
2441                    int end,
2442                    int direction)
2443 {
2444     int result;
2445
2446     str = PyUnicode_FromObject(str);
2447     if (str == NULL)
2448         return -1;
2449     substr = PyUnicode_FromObject(substr);
2450     if (substr == NULL) {
2451         Py_DECREF(substr);
2452         return -1;
2453     }
2454
2455     result = findstring((PyUnicodeObject *)str,
2456                         (PyUnicodeObject *)substr,
2457                         start, end, direction);
2458     Py_DECREF(str);
2459     Py_DECREF(substr);
2460     return result;
2461 }
2462
2463 static
2464 int tailmatch(PyUnicodeObject *self,
2465               PyUnicodeObject *substring,
2466               int start,
2467               int end,
2468               int direction)
2469 {
2470     if (start < 0)
2471         start += self->length;
2472     if (start < 0)
2473         start = 0;
2474
2475     if (substring->length == 0)
2476         return 1;
2477
2478     if (end > self->length)
2479         end = self->length;
2480     if (end < 0)
2481         end += self->length;
2482     if (end < 0)
2483         end = 0;
2484
2485     end -= substring->length;
2486     if (end < start)
2487         return 0;
2488
2489     if (direction > 0) {
2490         if (Py_UNICODE_MATCH(self, end, substring))
2491             return 1;
2492     } else {
2493         if (Py_UNICODE_MATCH(self, start, substring))
2494             return 1;
2495     }
2496
2497     return 0;
2498 }
2499
2500 int PyUnicode_Tailmatch(PyObject *str,
2501                         PyObject *substr,
2502                         int start,
2503                         int end,
2504                         int direction)
2505 {
2506     int result;
2507
2508     str = PyUnicode_FromObject(str);
2509     if (str == NULL)
2510         return -1;
2511     substr = PyUnicode_FromObject(substr);
2512     if (substr == NULL) {
2513         Py_DECREF(substr);
2514         return -1;
2515     }
2516
2517     result = tailmatch((PyUnicodeObject *)str,
2518                        (PyUnicodeObject *)substr,
2519                        start, end, direction);
2520     Py_DECREF(str);
2521     Py_DECREF(substr);
2522     return result;
2523 }
2524
2525 static
2526 const Py_UNICODE *findchar(const Py_UNICODE *s,
2527                      int size,
2528                      Py_UNICODE ch)
2529 {
2530     /* like wcschr, but doesn't stop at NULL characters */
2531
2532     while (size-- > 0) {
2533         if (*s == ch)
2534             return s;
2535         s++;
2536     }
2537
2538     return NULL;
2539 }
2540
2541 /* Apply fixfct filter to the Unicode object self and return a
2542    reference to the modified object */
2543
2544 static
2545 PyObject *fixup(PyUnicodeObject *self,
2546                 int (*fixfct)(PyUnicodeObject *s))
2547 {
2548
2549     PyUnicodeObject *u;
2550
2551     u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2552                                                  self->length);
2553     if (u == NULL)
2554         return NULL;
2555     if (!fixfct(u)) {
2556         /* fixfct should return TRUE if it modified the buffer. If
2557            FALSE, return a reference to the original buffer instead
2558            (to save space, not time) */
2559         Py_INCREF(self);
2560         Py_DECREF(u);
2561         return (PyObject*) self;
2562     }
2563     return (PyObject*) u;
2564 }
2565
2566 static
2567 int fixupper(PyUnicodeObject *self)
2568 {
2569     int len = self->length;
2570     Py_UNICODE *s = self->str;
2571     int status = 0;
2572
2573     while (len-- > 0) {
2574         register Py_UNICODE ch;
2575
2576         ch = Py_UNICODE_TOUPPER(*s);
2577         if (ch != *s) {
2578             status = 1;
2579             *s = ch;
2580         }
2581         s++;
2582     }
2583
2584     return status;
2585 }
2586
2587 static
2588 int fixlower(PyUnicodeObject *self)
2589 {
2590     int len = self->length;
2591     Py_UNICODE *s = self->str;
2592     int status = 0;
2593
2594     while (len-- > 0) {
2595         register Py_UNICODE ch;
2596
2597         ch = Py_UNICODE_TOLOWER(*s);
2598         if (ch != *s) {
2599             status = 1;
2600             *s = ch;
2601         }
2602         s++;
2603     }
2604
2605     return status;
2606 }
2607
2608 static
2609 int fixswapcase(PyUnicodeObject *self)
2610 {
2611     int len = self->length;
2612     Py_UNICODE *s = self->str;
2613     int status = 0;
2614
2615     while (len-- > 0) {
2616         if (Py_UNICODE_ISUPPER(*s)) {
2617             *s = Py_UNICODE_TOLOWER(*s);
2618             status = 1;
2619         } else if (Py_UNICODE_ISLOWER(*s)) {
2620             *s = Py_UNICODE_TOUPPER(*s);
2621             status = 1;
2622         }
2623         s++;
2624     }
2625
2626     return status;
2627 }
2628
2629 static
2630 int fixcapitalize(PyUnicodeObject *self)
2631 {
2632     if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2633         self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2634         return 1;
2635     }
2636     return 0;
2637 }
2638
2639 static
2640 int fixtitle(PyUnicodeObject *self)
2641 {
2642     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2643     register Py_UNICODE *e;
2644     int previous_is_cased;
2645
2646     /* Shortcut for single character strings */
2647     if (PyUnicode_GET_SIZE(self) == 1) {
2648         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2649         if (*p != ch) {
2650             *p = ch;
2651             return 1;
2652         }
2653         else
2654             return 0;
2655     }
2656
2657     e = p + PyUnicode_GET_SIZE(self);
2658     previous_is_cased = 0;
2659     for (; p < e; p++) {
2660         register const Py_UNICODE ch = *p;
2661
2662         if (previous_is_cased)
2663             *p = Py_UNICODE_TOLOWER(ch);
2664         else
2665             *p = Py_UNICODE_TOTITLE(ch);
2666
2667         if (Py_UNICODE_ISLOWER(ch) ||
2668             Py_UNICODE_ISUPPER(ch) ||
2669             Py_UNICODE_ISTITLE(ch))
2670             previous_is_cased = 1;
2671         else
2672             previous_is_cased = 0;
2673     }
2674     return 1;
2675 }
2676
2677 PyObject *PyUnicode_Join(PyObject *separator,
2678                          PyObject *seq)
2679 {
2680     Py_UNICODE *sep;
2681     int seplen;
2682     PyUnicodeObject *res = NULL;
2683     int reslen = 0;
2684     Py_UNICODE *p;
2685     int seqlen = 0;
2686     int sz = 100;
2687     int i;
2688
2689     seqlen = PySequence_Size(seq);
2690     if (seqlen < 0 && PyErr_Occurred())
2691         return NULL;
2692
2693     if (separator == NULL) {
2694         Py_UNICODE blank = ' ';
2695         sep = &blank;
2696         seplen = 1;
2697     }
2698     else {
2699         separator = PyUnicode_FromObject(separator);
2700         if (separator == NULL)
2701             return NULL;
2702         sep = PyUnicode_AS_UNICODE(separator);
2703         seplen = PyUnicode_GET_SIZE(separator);
2704     }
2705
2706     res = _PyUnicode_New(sz);
2707     if (res == NULL)
2708         goto onError;
2709     p = PyUnicode_AS_UNICODE(res);
2710     reslen = 0;
2711
2712     for (i = 0; i < seqlen; i++) {
2713         int itemlen;
2714         PyObject *item;
2715
2716         item = PySequence_GetItem(seq, i);
2717         if (item == NULL)
2718             goto onError;
2719         if (!PyUnicode_Check(item)) {
2720             PyObject *v;
2721             v = PyUnicode_FromObject(item);
2722             Py_DECREF(item);
2723             item = v;
2724             if (item == NULL)
2725                 goto onError;
2726         }
2727         itemlen = PyUnicode_GET_SIZE(item);
2728         while (reslen + itemlen + seplen >= sz) {
2729             if (_PyUnicode_Resize(res, sz*2))
2730                 goto onError;
2731             sz *= 2;
2732             p = PyUnicode_AS_UNICODE(res) + reslen;
2733         }
2734         if (i > 0) {
2735             memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2736             p += seplen;
2737             reslen += seplen;
2738         }
2739         memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2740         p += itemlen;
2741         reslen += itemlen;
2742         Py_DECREF(item);
2743     }
2744     if (_PyUnicode_Resize(res, reslen))
2745         goto onError;
2746
2747     Py_XDECREF(separator);
2748     return (PyObject *)res;
2749
2750  onError:
2751     Py_XDECREF(separator);
2752     Py_DECREF(res);
2753     return NULL;
2754 }
2755
2756 static
2757 PyUnicodeObject *pad(PyUnicodeObject *self,
2758                      int left,
2759                      int right,
2760                      Py_UNICODE fill)
2761 {
2762     PyUnicodeObject *u;
2763
2764     if (left < 0)
2765         left = 0;
2766     if (right < 0)
2767         right = 0;
2768
2769     if (left == 0 && right == 0) {
2770         Py_INCREF(self);
2771         return self;
2772     }
2773
2774     u = _PyUnicode_New(left + self->length + right);
2775     if (u) {
2776         if (left)
2777             Py_UNICODE_FILL(u->str, fill, left);
2778         Py_UNICODE_COPY(u->str + left, self->str, self->length);
2779         if (right)
2780             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2781     }
2782
2783     return u;
2784 }
2785
/* Append the slice data[left:right] to the list being built.

   The expansion relies on names from the surrounding function: a
   `PyObject *str` scratch variable, the target `list`, and an
   `onError` label that is jumped to if creating the slice or
   appending it fails.  The temporary reference held in `str` is
   released in either case.

   Wrapped in do { } while (0) so that the macro expands to a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
        do {                                                            \
            str = PyUnicode_FromUnicode((data) + (left),                \
                                        (right) - (left));              \
            if (!str)                                                   \
                goto onError;                                           \
            if (PyList_Append(list, str)) {                             \
                Py_DECREF(str);                                         \
                goto onError;                                           \
            }                                                           \
            Py_DECREF(str);                                             \
        } while (0)
2796
2797 static
2798 PyObject *split_whitespace(PyUnicodeObject *self,
2799                            PyObject *list,
2800                            int maxcount)
2801 {
2802     register int i;
2803     register int j;
2804     int len = self->length;
2805     PyObject *str;
2806
2807     for (i = j = 0; i < len; ) {
2808         /* find a token */
2809         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2810             i++;
2811         j = i;
2812         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2813             i++;
2814         if (j < i) {
2815             if (maxcount-- <= 0)
2816                 break;
2817             SPLIT_APPEND(self->str, j, i);
2818             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2819                 i++;
2820             j = i;
2821         }
2822     }
2823     if (j < len) {
2824         SPLIT_APPEND(self->str, j, len);
2825     }
2826     return list;
2827
2828  onError:
2829     Py_DECREF(list);
2830     return NULL;
2831 }
2832
2833 PyObject *PyUnicode_Splitlines(PyObject *string,
2834                                int keepends)
2835 {
2836     register int i;
2837     register int j;
2838     int len;
2839     PyObject *list;
2840     PyObject *str;
2841     Py_UNICODE *data;
2842
2843     string = PyUnicode_FromObject(string);
2844     if (string == NULL)
2845         return NULL;
2846     data = PyUnicode_AS_UNICODE(string);
2847     len = PyUnicode_GET_SIZE(string);
2848
2849     list = PyList_New(0);
2850     if (!list)
2851         goto onError;
2852
2853     for (i = j = 0; i < len; ) {
2854         int eol;
2855
2856         /* Find a line and append it */
2857         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2858             i++;
2859
2860         /* Skip the line break reading CRLF as one line break */
2861         eol = i;
2862         if (i < len) {
2863             if (data[i] == '\r' && i + 1 < len &&
2864                 data[i+1] == '\n')
2865                 i += 2;
2866             else
2867                 i++;
2868             if (keepends)
2869                 eol = i;
2870         }
2871         SPLIT_APPEND(data, j, eol);
2872         j = i;
2873     }
2874     if (j < len) {
2875         SPLIT_APPEND(data, j, len);
2876     }
2877
2878     Py_DECREF(string);
2879     return list;
2880
2881  onError:
2882     Py_DECREF(list);
2883     Py_DECREF(string);
2884     return NULL;
2885 }
2886
2887 static
2888 PyObject *split_char(PyUnicodeObject *self,
2889                      PyObject *list,
2890                      Py_UNICODE ch,
2891                      int maxcount)
2892 {
2893     register int i;
2894     register int j;
2895     int len = self->length;
2896     PyObject *str;
2897
2898     for (i = j = 0; i < len; ) {
2899         if (self->str[i] == ch) {
2900             if (maxcount-- <= 0)
2901                 break;
2902             SPLIT_APPEND(self->str, j, i);
2903             i = j = i + 1;
2904         } else
2905             i++;
2906     }
2907     if (j <= len) {
2908         SPLIT_APPEND(self->str, j, len);
2909     }
2910     return list;
2911
2912  onError:
2913     Py_DECREF(list);
2914     return NULL;
2915 }
2916
2917 static
2918 PyObject *split_substring(PyUnicodeObject *self,
2919                           PyObject *list,
2920                           PyUnicodeObject *substring,
2921                           int maxcount)
2922 {
2923     register int i;
2924     register int j;
2925     int len = self->length;
2926     int sublen = substring->length;
2927     PyObject *str;
2928
2929     for (i = j = 0; i < len - sublen; ) {
2930         if (Py_UNICODE_MATCH(self, i, substring)) {
2931             if (maxcount-- <= 0)
2932                 break;
2933             SPLIT_APPEND(self->str, j, i);
2934             i = j = i + sublen;
2935         } else
2936             i++;
2937     }
2938     if (j <= len) {
2939         SPLIT_APPEND(self->str, j, len);
2940     }
2941     return list;
2942
2943  onError:
2944     Py_DECREF(list);
2945     return NULL;
2946 }
2947
2948 #undef SPLIT_APPEND
2949
2950 static
2951 PyObject *split(PyUnicodeObject *self,
2952                 PyUnicodeObject *substring,
2953                 int maxcount)
2954 {
2955     PyObject *list;
2956
2957     if (maxcount < 0)
2958         maxcount = INT_MAX;
2959
2960     list = PyList_New(0);
2961     if (!list)
2962         return NULL;
2963
2964     if (substring == NULL)
2965         return split_whitespace(self,list,maxcount);
2966
2967     else if (substring->length == 1)
2968         return split_char(self,list,substring->str[0],maxcount);
2969
2970     else if (substring->length == 0) {
2971         Py_DECREF(list);
2972         PyErr_SetString(PyExc_ValueError, "empty separator");
2973         return NULL;
2974     }
2975     else
2976         return split_substring(self,list,substring,maxcount);
2977 }
2978
2979 static
2980 PyObject *strip(PyUnicodeObject *self,
2981                 int left,
2982                 int right)
2983 {
2984     Py_UNICODE *p = self->str;
2985     int start = 0;
2986     int end = self->length;
2987
2988     if (left)
2989         while (start < end && Py_UNICODE_ISSPACE(p[start]))
2990             start++;
2991
2992     if (right)
2993         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2994             end--;
2995
2996     if (start == 0 && end == self->length) {
2997         /* couldn't strip anything off, return original string */
2998         Py_INCREF(self);
2999         return (PyObject*) self;
3000     }
3001
3002     return (PyObject*) PyUnicode_FromUnicode(
3003         self->str + start,
3004         end - start
3005         );
3006 }
3007
3008 static
3009 PyObject *replace(PyUnicodeObject *self,
3010                   PyUnicodeObject *str1,
3011                   PyUnicodeObject *str2,
3012                   int maxcount)
3013 {
3014     PyUnicodeObject *u;
3015
3016     if (maxcount < 0)
3017         maxcount = INT_MAX;
3018
3019     if (str1->length == 1 && str2->length == 1) {
3020         int i;
3021
3022         /* replace characters */
3023         if (!findchar(self->str, self->length, str1->str[0])) {
3024             /* nothing to replace, return original string */
3025             Py_INCREF(self);
3026             u = self;
3027         } else {
3028             Py_UNICODE u1 = str1->str[0];
3029             Py_UNICODE u2 = str2->str[0];
3030
3031             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3032                 self->str,
3033                 self->length
3034                 );
3035             if (u)
3036                 for (i = 0; i < u->length; i++)
3037                     if (u->str[i] == u1) {
3038                         if (--maxcount < 0)
3039                             break;
3040                         u->str[i] = u2;
3041                     }
3042         }
3043
3044     } else {
3045         int n, i;
3046         Py_UNICODE *p;
3047
3048         /* replace strings */
3049         n = count(self, 0, self->length, str1);
3050         if (n > maxcount)
3051             n = maxcount;
3052         if (n == 0) {
3053             /* nothing to replace, return original string */
3054             Py_INCREF(self);
3055             u = self;
3056         } else {
3057             u = _PyUnicode_New(
3058                 self->length + n * (str2->length - str1->length));
3059             if (u) {
3060                 i = 0;
3061                 p = u->str;
3062                 while (i <= self->length - str1->length)
3063                     if (Py_UNICODE_MATCH(self, i, str1)) {
3064                         /* replace string segment */
3065                         Py_UNICODE_COPY(p, str2->str, str2->length);
3066                         p += str2->length;
3067                         i += str1->length;
3068                         if (--n <= 0) {
3069                             /* copy remaining part */
3070                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3071                             break;
3072                         }
3073                     } else
3074                         *p++ = self->str[i++];
3075             }
3076         }
3077     }
3078
3079     return (PyObject *) u;
3080 }
3081
3082 /* --- Unicode Object Methods --------------------------------------------- */
3083
3084 static char title__doc__[] =
3085 "S.title() -> unicode\n\
3086 \n\
3087 Return a titlecased version of S, i.e. words start with title case\n\
3088 characters, all remaining cased characters have lower case.";
3089
3090 static PyObject*
3091 unicode_title(PyUnicodeObject *self, PyObject *args)
3092 {
3093     if (!PyArg_NoArgs(args))
3094         return NULL;
3095     return fixup(self, fixtitle);
3096 }
3097
3098 static char capitalize__doc__[] =
3099 "S.capitalize() -> unicode\n\
3100 \n\
3101 Return a capitalized version of S, i.e. make the first character\n\
3102 have upper case.";
3103
3104 static PyObject*
3105 unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3106 {
3107     if (!PyArg_NoArgs(args))
3108         return NULL;
3109     return fixup(self, fixcapitalize);
3110 }
3111
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Currently compiled out.  Splits S via split(self, NULL, -1), replaces
   every list item by its fixcapitalize'd version and joins the list
   again with PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old = PyList_GET_ITEM(words, k);
        PyObject *capitalized = fixup((PyUnicodeObject *)old,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3152
3153 static char center__doc__[] =
3154 "S.center(width) -> unicode\n\
3155 \n\
3156 Return S centered in a Unicode string of length width. Padding is done\n\
3157 using spaces.";
3158
3159 static PyObject *
3160 unicode_center(PyUnicodeObject *self, PyObject *args)
3161 {
3162     int marg, left;
3163     int width;
3164
3165     if (!PyArg_ParseTuple(args, "i:center", &width))
3166         return NULL;
3167
3168     if (self->length >= width) {
3169         Py_INCREF(self);
3170         return (PyObject*) self;
3171     }
3172
3173     marg = width - self->length;
3174     left = marg / 2 + (marg & width & 1);
3175
3176     return (PyObject*) pad(self, left, marg - left, ' ');
3177 }
3178
3179 #if 0
3180
3181 /* This code should go into some future Unicode collation support
3182    module. The basic comparison should compare ordinals on a naive
3183    basis (this is what Java does and thus JPython too). */
3184
3185 /* speedy UTF-16 code point order comparison */
3186 /* gleaned from: */
3187 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3188
3189 static short utf16Fixup[32] =
3190 {
3191     0, 0, 0, 0, 0, 0, 0, 0,
3192     0, 0, 0, 0, 0, 0, 0, 0,
3193     0, 0, 0, 0, 0, 0, 0, 0,
3194     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
3195 };
3196
3197 static int
3198 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3199 {
3200     int len1, len2;
3201
3202     Py_UNICODE *s1 = str1->str;
3203     Py_UNICODE *s2 = str2->str;
3204
3205     len1 = str1->length;
3206     len2 = str2->length;
3207
3208     while (len1 > 0 && len2 > 0) {
3209         Py_UNICODE c1, c2;
3210         long diff;
3211
3212         c1 = *s1++;
3213         c2 = *s2++;
3214         if (c1 > (1<<11) * 26)
3215             c1 += utf16Fixup[c1>>11];
3216         if (c2 > (1<<11) * 26)
3217             c2 += utf16Fixup[c2>>11];
3218
3219         /* now c1 and c2 are in UTF-32-compatible order */
3220         diff = (long)c1 - (long)c2;
3221         if (diff)
3222             return (diff < 0) ? -1 : (diff != 0);
3223         len1--; len2--;
3224     }
3225
3226     return (len1 < len2) ? -1 : (len1 != len2);
3227 }
3228
3229 #else
3230
3231 static int
3232 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3233 {
3234     register int len1, len2;
3235
3236     Py_UNICODE *s1 = str1->str;
3237     Py_UNICODE *s2 = str2->str;
3238
3239     len1 = str1->length;
3240     len2 = str2->length;
3241
3242     while (len1 > 0 && len2 > 0) {
3243         register long diff;
3244
3245         diff = (long)*s1++ - (long)*s2++;
3246         if (diff)
3247             return (diff < 0) ? -1 : (diff != 0);
3248         len1--; len2--;
3249     }
3250
3251     return (len1 < len2) ? -1 : (len1 != len2);
3252 }
3253
3254 #endif
3255
3256 int PyUnicode_Compare(PyObject *left,
3257                       PyObject *right)
3258 {
3259     PyUnicodeObject *u = NULL, *v = NULL;
3260     int result;
3261
3262     /* Coerce the two arguments */
3263     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3264     if (u == NULL)
3265         goto onError;
3266     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3267     if (v == NULL)
3268         goto onError;
3269
3270     /* Shortcut for empty or interned objects */
3271     if (v == u) {
3272         Py_DECREF(u);
3273         Py_DECREF(v);
3274         return 0;
3275     }
3276
3277     result = unicode_compare(u, v);
3278
3279     Py_DECREF(u);
3280     Py_DECREF(v);
3281     return result;
3282
3283 onError:
3284     Py_XDECREF(u);
3285     Py_XDECREF(v);
3286     return -1;
3287 }
3288
3289 int PyUnicode_Contains(PyObject *container,
3290                        PyObject *element)
3291 {
3292     PyUnicodeObject *u = NULL, *v = NULL;
3293     int result;
3294     register const Py_UNICODE *p, *e;
3295     register Py_UNICODE ch;
3296
3297     /* Coerce the two arguments */
3298     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3299     if (v == NULL) {
3300         PyErr_SetString(PyExc_TypeError,
3301             "'in <string>' requires character as left operand");
3302         goto onError;
3303     }
3304     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3305     if (u == NULL) {
3306         Py_DECREF(v);
3307         goto onError;
3308     }
3309
3310     /* Check v in u */
3311     if (PyUnicode_GET_SIZE(v) != 1) {
3312         PyErr_SetString(PyExc_TypeError,
3313             "'in <string>' requires character as left operand");
3314         goto onError;
3315     }
3316     ch = *PyUnicode_AS_UNICODE(v);
3317     p = PyUnicode_AS_UNICODE(u);
3318     e = p + PyUnicode_GET_SIZE(u);
3319     result = 0;
3320     while (p < e) {
3321         if (*p++ == ch) {
3322             result = 1;
3323             break;
3324         }
3325     }
3326
3327     Py_DECREF(u);
3328     Py_DECREF(v);
3329     return result;
3330
3331 onError:
3332     Py_XDECREF(u);
3333     Py_XDECREF(v);
3334     return -1;
3335 }
3336
3337 /* Concat to string or Unicode object giving a new Unicode object. */
3338
3339 PyObject *PyUnicode_Concat(PyObject *left,
3340                            PyObject *right)
3341 {
3342     PyUnicodeObject *u = NULL, *v = NULL, *w;
3343
3344     /* Coerce the two arguments */
3345     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3346     if (u == NULL)
3347         goto onError;
3348     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3349     if (v == NULL)
3350         goto onError;
3351
3352     /* Shortcuts */
3353     if (v == unicode_empty) {
3354         Py_DECREF(v);
3355         return (PyObject *)u;
3356     }
3357     if (u == unicode_empty) {
3358         Py_DECREF(u);
3359         return (PyObject *)v;
3360     }
3361
3362     /* Concat the two Unicode strings */
3363     w = _PyUnicode_New(u->length + v->length);
3364     if (w == NULL)
3365         goto onError;
3366     Py_UNICODE_COPY(w->str, u->str, u->length);
3367     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3368
3369     Py_DECREF(u);
3370     Py_DECREF(v);
3371     return (PyObject *)w;
3372
3373 onError:
3374     Py_XDECREF(u);
3375     Py_XDECREF(v);
3376     return NULL;
3377 }
3378
3379 static char count__doc__[] =
3380 "S.count(sub[, start[, end]]) -> int\n\
3381 \n\
3382 Return the number of occurrences of substring sub in Unicode string\n\
3383 S[start:end].  Optional arguments start and end are\n\
3384 interpreted as in slice notation.";
3385
3386 static PyObject *
3387 unicode_count(PyUnicodeObject *self, PyObject *args)
3388 {
3389     PyUnicodeObject *substring;
3390     int start = 0;
3391     int end = INT_MAX;
3392     PyObject *result;
3393
3394     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3395                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3396         return NULL;
3397
3398     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3399                                                 (PyObject *)substring);
3400     if (substring == NULL)
3401         return NULL;
3402
3403     if (start < 0)
3404         start += self->length;
3405     if (start < 0)
3406         start = 0;
3407     if (end > self->length)
3408         end = self->length;
3409     if (end < 0)
3410         end += self->length;
3411     if (end < 0)
3412         end = 0;
3413
3414     result = PyInt_FromLong((long) count(self, start, end, substring));
3415
3416     Py_DECREF(substring);
3417     return result;
3418 }
3419
3420 static char encode__doc__[] =
3421 "S.encode([encoding[,errors]]) -> string\n\
3422 \n\
3423 Return an encoded string version of S. Default encoding is the current\n\
3424 default string encoding. errors may be given to set a different error\n\
3425 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3426 a ValueError. Other possible values are 'ignore' and 'replace'.";
3427
3428 static PyObject *
3429 unicode_encode(PyUnicodeObject *self, PyObject *args)
3430 {
3431     char *encoding = NULL;
3432     char *errors = NULL;
3433     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3434         return NULL;
3435     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3436 }
3437
3438 static char expandtabs__doc__[] =
3439 "S.expandtabs([tabsize]) -> unicode\n\
3440 \n\
3441 Return a copy of S where all tab characters are expanded using spaces.\n\
3442 If tabsize is not given, a tab size of 8 characters is assumed.";
3443
3444 static PyObject*
3445 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3446 {
3447     Py_UNICODE *e;
3448     Py_UNICODE *p;
3449     Py_UNICODE *q;
3450     int i, j;
3451     PyUnicodeObject *u;
3452     int tabsize = 8;
3453
3454     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3455         return NULL;
3456
3457     /* First pass: determine size of output string */
3458     i = j = 0;
3459     e = self->str + self->length;
3460     for (p = self->str; p < e; p++)
3461         if (*p == '\t') {
3462             if (tabsize > 0)
3463                 j += tabsize - (j % tabsize);
3464         }
3465         else {
3466             j++;
3467             if (*p == '\n' || *p == '\r') {
3468                 i += j;
3469                 j = 0;
3470             }
3471         }
3472
3473     /* Second pass: create output string and fill it */
3474     u = _PyUnicode_New(i + j);
3475     if (!u)
3476         return NULL;
3477
3478     j = 0;
3479     q = u->str;
3480
3481     for (p = self->str; p < e; p++)
3482         if (*p == '\t') {
3483             if (tabsize > 0) {
3484                 i = tabsize - (j % tabsize);
3485                 j += i;
3486                 while (i--)
3487                     *q++ = ' ';
3488             }
3489         }
3490         else {
3491             j++;
3492             *q++ = *p;
3493             if (*p == '\n' || *p == '\r')
3494                 j = 0;
3495         }
3496
3497     return (PyObject*) u;
3498 }
3499
3500 static char find__doc__[] =
3501 "S.find(sub [,start [,end]]) -> int\n\
3502 \n\
3503 Return the lowest index in S where substring sub is found,\n\
3504 such that sub is contained within s[start,end].  Optional\n\
3505 arguments start and end are interpreted as in slice notation.\n\
3506 \n\
3507 Return -1 on failure.";
3508
3509 static PyObject *
3510 unicode_find(PyUnicodeObject *self, PyObject *args)
3511 {
3512     PyUnicodeObject *substring;
3513     int start = 0;
3514     int end = INT_MAX;
3515     PyObject *result;
3516
3517     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3518                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3519         return NULL;
3520     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3521                                                 (PyObject *)substring);
3522     if (substring == NULL)
3523         return NULL;
3524
3525     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3526
3527     Py_DECREF(substring);
3528     return result;
3529 }
3530
3531 static PyObject *
3532 unicode_getitem(PyUnicodeObject *self, int index)
3533 {
3534     if (index < 0 || index >= self->length) {
3535         PyErr_SetString(PyExc_IndexError, "string index out of range");
3536         return NULL;
3537     }
3538
3539     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3540 }
3541
3542 static long
3543 unicode_hash(PyUnicodeObject *self)
3544 {
3545     /* Since Unicode objects compare equal to their ASCII string
3546        counterparts, they should use the individual character values
3547        as basis for their hash value.  This is needed to assure that
3548        strings and Unicode objects behave in the same way as
3549        dictionary keys. */
3550
3551     register int len;
3552     register Py_UNICODE *p;
3553     register long x;
3554
3555     if (self->hash != -1)
3556         return self->hash;
3557     len = PyUnicode_GET_SIZE(self);
3558     p = PyUnicode_AS_UNICODE(self);
3559     x = *p << 7;
3560     while (--len >= 0)
3561         x = (1000003*x) ^ *p++;
3562     x ^= PyUnicode_GET_SIZE(self);
3563     if (x == -1)
3564         x = -2;
3565     self->hash = x;
3566     return x;
3567 }
3568
3569 static char index__doc__[] =
3570 "S.index(sub [,start [,end]]) -> int\n\
3571 \n\
3572 Like S.find() but raise ValueError when the substring is not found.";
3573
3574 static PyObject *
3575 unicode_index(PyUnicodeObject *self, PyObject *args)
3576 {
3577     int result;
3578     PyUnicodeObject *substring;
3579     int start = 0;
3580     int end = INT_MAX;
3581
3582     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3583                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3584         return NULL;
3585
3586     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3587                                                 (PyObject *)substring);
3588     if (substring == NULL)
3589         return NULL;
3590
3591     result = findstring(self, substring, start, end, 1);
3592
3593     Py_DECREF(substring);
3594     if (result < 0) {
3595         PyErr_SetString(PyExc_ValueError, "substring not found");
3596         return NULL;
3597     }
3598     return PyInt_FromLong(result);
3599 }
3600
3601 static char islower__doc__[] =
3602 "S.islower() -> int\n\
3603 \n\
3604 Return 1 if  all cased characters in S are lowercase and there is\n\
3605 at least one cased character in S, 0 otherwise.";
3606
3607 static PyObject*
3608 unicode_islower(PyUnicodeObject *self, PyObject *args)
3609 {
3610     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3611     register const Py_UNICODE *e;
3612     int cased;
3613
3614     if (!PyArg_NoArgs(args))
3615         return NULL;
3616
3617     /* Shortcut for single character strings */
3618     if (PyUnicode_GET_SIZE(self) == 1)
3619         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3620
3621     /* Special case for empty strings */
3622     if (PyString_GET_SIZE(self) == 0)
3623         return PyInt_FromLong(0);
3624
3625     e = p + PyUnicode_GET_SIZE(self);
3626     cased = 0;
3627     for (; p < e; p++) {
3628         register const Py_UNICODE ch = *p;
3629
3630         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3631             return PyInt_FromLong(0);
3632         else if (!cased && Py_UNICODE_ISLOWER(ch))
3633             cased = 1;
3634     }
3635     return PyInt_FromLong(cased);
3636 }
3637
3638 static char isupper__doc__[] =
3639 "S.isupper() -> int\n\
3640 \n\
3641 Return 1 if  all cased characters in S are uppercase and there is\n\
3642 at least one cased character in S, 0 otherwise.";
3643
3644 static PyObject*
3645 unicode_isupper(PyUnicodeObject *self, PyObject *args)
3646 {
3647     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3648     register const Py_UNICODE *e;
3649     int cased;
3650
3651     if (!PyArg_NoArgs(args))
3652         return NULL;
3653
3654     /* Shortcut for single character strings */
3655     if (PyUnicode_GET_SIZE(self) == 1)
3656         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3657
3658     /* Special case for empty strings */
3659     if (PyString_GET_SIZE(self) == 0)
3660         return PyInt_FromLong(0);
3661
3662     e = p + PyUnicode_GET_SIZE(self);
3663     cased = 0;
3664     for (; p < e; p++) {
3665         register const Py_UNICODE ch = *p;
3666
3667         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3668             return PyInt_FromLong(0);
3669         else if (!cased && Py_UNICODE_ISUPPER(ch))
3670             cased = 1;
3671     }
3672     return PyInt_FromLong(cased);
3673 }
3674
3675 static char istitle__doc__[] =
3676 "S.istitle() -> int\n\
3677 \n\
3678 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3679 may only follow uncased characters and lowercase characters only cased\n\
3680 ones. Return 0 otherwise.";
3681
3682 static PyObject*
3683 unicode_istitle(PyUnicodeObject *self, PyObject *args)
3684 {
3685     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3686     register const Py_UNICODE *e;
3687     int cased, previous_is_cased;
3688
3689     if (!PyArg_NoArgs(args))
3690         return NULL;
3691
3692     /* Shortcut for single character strings */
3693     if (PyUnicode_GET_SIZE(self) == 1)
3694         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3695                               (Py_UNICODE_ISUPPER(*p) != 0));
3696
3697     /* Special case for empty strings */
3698     if (PyString_GET_SIZE(self) == 0)
3699         return PyInt_FromLong(0);
3700
3701     e = p + PyUnicode_GET_SIZE(self);
3702     cased = 0;
3703     previous_is_cased = 0;
3704     for (; p < e; p++) {
3705         register const Py_UNICODE ch = *p;
3706
3707         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3708             if (previous_is_cased)
3709                 return PyInt_FromLong(0);
3710             previous_is_cased = 1;
3711             cased = 1;
3712         }
3713         else if (Py_UNICODE_ISLOWER(ch)) {
3714             if (!previous_is_cased)
3715                 return PyInt_FromLong(0);
3716             previous_is_cased = 1;
3717             cased = 1;
3718         }
3719         else
3720             previous_is_cased = 0;
3721     }
3722     return PyInt_FromLong(cased);
3723 }
3724
3725 static char isspace__doc__[] =
3726 "S.isspace() -> int\n\
3727 \n\
3728 Return 1 if there are only whitespace characters in S,\n\
3729 0 otherwise.";
3730
3731 static PyObject*
3732 unicode_isspace(PyUnicodeObject *self, PyObject *args)
3733 {
3734     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3735     register const Py_UNICODE *e;
3736
3737     if (!PyArg_NoArgs(args))
3738         return NULL;
3739
3740     /* Shortcut for single character strings */
3741     if (PyUnicode_GET_SIZE(self) == 1 &&
3742         Py_UNICODE_ISSPACE(*p))
3743         return PyInt_FromLong(1);
3744
3745     /* Special case for empty strings */
3746     if (PyString_GET_SIZE(self) == 0)
3747         return PyInt_FromLong(0);
3748
3749     e = p + PyUnicode_GET_SIZE(self);
3750     for (; p < e; p++) {
3751         if (!Py_UNICODE_ISSPACE(*p))
3752             return PyInt_FromLong(0);
3753     }
3754     return PyInt_FromLong(1);
3755 }
3756
3757 static char isalpha__doc__[] =
3758 "S.isalpha() -> int\n\
3759 \n\
3760 Return 1 if  all characters in S are alphabetic\n\
3761 and there is at least one character in S, 0 otherwise.";
3762
3763 static PyObject*
3764 unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3765 {
3766     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3767     register const Py_UNICODE *e;
3768
3769     if (!PyArg_NoArgs(args))
3770         return NULL;
3771
3772     /* Shortcut for single character strings */
3773     if (PyUnicode_GET_SIZE(self) == 1 &&
3774         Py_UNICODE_ISALPHA(*p))
3775         return PyInt_FromLong(1);
3776
3777     /* Special case for empty strings */
3778     if (PyString_GET_SIZE(self) == 0)
3779         return PyInt_FromLong(0);
3780
3781     e = p + PyUnicode_GET_SIZE(self);
3782     for (; p < e; p++) {
3783         if (!Py_UNICODE_ISALPHA(*p))
3784             return PyInt_FromLong(0);
3785     }
3786     return PyInt_FromLong(1);
3787 }
3788
3789 static char isalnum__doc__[] =
3790 "S.isalnum() -> int\n\
3791 \n\
3792 Return 1 if  all characters in S are alphanumeric\n\
3793 and there is at least one character in S, 0 otherwise.";
3794
3795 static PyObject*
3796 unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3797 {
3798     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3799     register const Py_UNICODE *e;
3800
3801     if (!PyArg_NoArgs(args))
3802         return NULL;
3803
3804     /* Shortcut for single character strings */
3805     if (PyUnicode_GET_SIZE(self) == 1 &&
3806         Py_UNICODE_ISALNUM(*p))
3807         return PyInt_FromLong(1);
3808
3809     /* Special case for empty strings */
3810     if (PyString_GET_SIZE(self) == 0)
3811         return PyInt_FromLong(0);
3812
3813     e = p + PyUnicode_GET_SIZE(self);
3814     for (; p < e; p++) {
3815         if (!Py_UNICODE_ISALNUM(*p))
3816             return PyInt_FromLong(0);
3817     }
3818     return PyInt_FromLong(1);
3819 }
3820
3821 static char isdecimal__doc__[] =
3822 "S.isdecimal() -> int\n\
3823 \n\
3824 Return 1 if there are only decimal characters in S,\n\
3825 0 otherwise.";
3826
3827 static PyObject*
3828 unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3829 {
3830     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3831     register const Py_UNICODE *e;
3832
3833     if (!PyArg_NoArgs(args))
3834         return NULL;
3835
3836     /* Shortcut for single character strings */
3837     if (PyUnicode_GET_SIZE(self) == 1 &&
3838         Py_UNICODE_ISDECIMAL(*p))
3839         return PyInt_FromLong(1);
3840
3841     /* Special case for empty strings */
3842     if (PyString_GET_SIZE(self) == 0)
3843         return PyInt_FromLong(0);
3844
3845     e = p + PyUnicode_GET_SIZE(self);
3846     for (; p < e; p++) {
3847         if (!Py_UNICODE_ISDECIMAL(*p))
3848             return PyInt_FromLong(0);
3849     }
3850     return PyInt_FromLong(1);
3851 }
3852
3853 static char isdigit__doc__[] =
3854 "S.isdigit() -> int\n\
3855 \n\
3856 Return 1 if there are only digit characters in S,\n\
3857 0 otherwise.";
3858
3859 static PyObject*
3860 unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3861 {
3862     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3863     register const Py_UNICODE *e;
3864
3865     if (!PyArg_NoArgs(args))
3866         return NULL;
3867
3868     /* Shortcut for single character strings */
3869     if (PyUnicode_GET_SIZE(self) == 1 &&
3870         Py_UNICODE_ISDIGIT(*p))
3871         return PyInt_FromLong(1);
3872
3873     /* Special case for empty strings */
3874     if (PyString_GET_SIZE(self) == 0)
3875         return PyInt_FromLong(0);
3876
3877     e = p + PyUnicode_GET_SIZE(self);
3878     for (; p < e; p++) {
3879         if (!Py_UNICODE_ISDIGIT(*p))
3880             return PyInt_FromLong(0);
3881     }
3882     return PyInt_FromLong(1);
3883 }
3884
3885 static char isnumeric__doc__[] =
3886 "S.isnumeric() -> int\n\
3887 \n\
3888 Return 1 if there are only numeric characters in S,\n\
3889 0 otherwise.";
3890
3891 static PyObject*
3892 unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3893 {
3894     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3895     register const Py_UNICODE *e;
3896
3897     if (!PyArg_NoArgs(args))
3898         return NULL;
3899
3900     /* Shortcut for single character strings */
3901     if (PyUnicode_GET_SIZE(self) == 1 &&
3902         Py_UNICODE_ISNUMERIC(*p))
3903         return PyInt_FromLong(1);
3904
3905     /* Special case for empty strings */
3906     if (PyString_GET_SIZE(self) == 0)
3907         return PyInt_FromLong(0);
3908
3909     e = p + PyUnicode_GET_SIZE(self);
3910     for (; p < e; p++) {
3911         if (!Py_UNICODE_ISNUMERIC(*p))
3912             return PyInt_FromLong(0);
3913     }
3914     return PyInt_FromLong(1);
3915 }
3916
3917 static char join__doc__[] =
3918 "S.join(sequence) -> unicode\n\
3919 \n\
3920 Return a string which is the concatenation of the strings in the\n\
3921 sequence.  The separator between elements is S.";
3922
3923 static PyObject*
3924 unicode_join(PyUnicodeObject *self, PyObject *args)
3925 {
3926     PyObject *data;
3927     if (!PyArg_ParseTuple(args, "O:join", &data))
3928         return NULL;
3929
3930     return PyUnicode_Join((PyObject *)self, data);
3931 }
3932
3933 static int
3934 unicode_length(PyUnicodeObject *self)
3935 {
3936     return self->length;
3937 }
3938
3939 static char ljust__doc__[] =
3940 "S.ljust(width) -> unicode\n\
3941 \n\
3942 Return S left justified in a Unicode string of length width. Padding is\n\
3943 done using spaces.";
3944
3945 static PyObject *
3946 unicode_ljust(PyUnicodeObject *self, PyObject *args)
3947 {
3948     int width;
3949     if (!PyArg_ParseTuple(args, "i:ljust", &width))
3950         return NULL;
3951
3952     if (self->length >= width) {
3953         Py_INCREF(self);
3954         return (PyObject*) self;
3955     }
3956
3957     return (PyObject*) pad(self, 0, width - self->length, ' ');
3958 }
3959
3960 static char lower__doc__[] =
3961 "S.lower() -> unicode\n\
3962 \n\
3963 Return a copy of the string S converted to lowercase.";
3964
3965 static PyObject*
3966 unicode_lower(PyUnicodeObject *self, PyObject *args)
3967 {
3968     if (!PyArg_NoArgs(args))
3969         return NULL;
3970     return fixup(self, fixlower);
3971 }
3972
3973 static char lstrip__doc__[] =
3974 "S.lstrip() -> unicode\n\
3975 \n\
3976 Return a copy of the string S with leading whitespace removed.";
3977
3978 static PyObject *
3979 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3980 {
3981     if (!PyArg_NoArgs(args))
3982         return NULL;
3983     return strip(self, 1, 0);
3984 }
3985
3986 static PyObject*
3987 unicode_repeat(PyUnicodeObject *str, int len)
3988 {
3989     PyUnicodeObject *u;
3990     Py_UNICODE *p;
3991
3992     if (len < 0)
3993         len = 0;
3994
3995     if (len == 1) {
3996         /* no repeat, return original string */
3997         Py_INCREF(str);
3998         return (PyObject*) str;
3999     }
4000
4001     u = _PyUnicode_New(len * str->length);
4002     if (!u)
4003         return NULL;
4004
4005     p = u->str;
4006
4007     while (len-- > 0) {
4008         Py_UNICODE_COPY(p, str->str, str->length);
4009         p += str->length;
4010     }
4011
4012     return (PyObject*) u;
4013 }
4014
4015 PyObject *PyUnicode_Replace(PyObject *obj,
4016                             PyObject *subobj,
4017                             PyObject *replobj,
4018                             int maxcount)
4019 {
4020     PyObject *self;
4021     PyObject *str1;
4022     PyObject *str2;
4023     PyObject *result;
4024
4025     self = PyUnicode_FromObject(obj);
4026     if (self == NULL)
4027         return NULL;
4028     str1 = PyUnicode_FromObject(subobj);
4029     if (str1 == NULL) {
4030         Py_DECREF(self);
4031         return NULL;
4032     }
4033     str2 = PyUnicode_FromObject(replobj);
4034     if (str2 == NULL) {
4035         Py_DECREF(self);
4036         Py_DECREF(str1);
4037         return NULL;
4038     }
4039     result = replace((PyUnicodeObject *)self,
4040                      (PyUnicodeObject *)str1,
4041                      (PyUnicodeObject *)str2,
4042                      maxcount);
4043     Py_DECREF(self);
4044     Py_DECREF(str1);
4045     Py_DECREF(str2);
4046     return result;
4047 }
4048
4049 static char replace__doc__[] =
4050 "S.replace (old, new[, maxsplit]) -> unicode\n\
4051 \n\
4052 Return a copy of S with all occurrences of substring\n\
4053 old replaced by new.  If the optional argument maxsplit is\n\
4054 given, only the first maxsplit occurrences are replaced.";
4055
4056 static PyObject*
4057 unicode_replace(PyUnicodeObject *self, PyObject *args)
4058 {
4059     PyUnicodeObject *str1;
4060     PyUnicodeObject *str2;
4061     int maxcount = -1;
4062     PyObject *result;
4063
4064     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4065         return NULL;
4066     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4067     if (str1 == NULL)
4068         return NULL;
4069     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4070     if (str2 == NULL)
4071         return NULL;
4072
4073     result = replace(self, str1, str2, maxcount);
4074
4075     Py_DECREF(str1);
4076     Py_DECREF(str2);
4077     return result;
4078 }
4079
4080 static
4081 PyObject *unicode_repr(PyObject *unicode)
4082 {
4083     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4084                                 PyUnicode_GET_SIZE(unicode),
4085                                 1);
4086 }
4087
4088 static char rfind__doc__[] =
4089 "S.rfind(sub [,start [,end]]) -> int\n\
4090 \n\
4091 Return the highest index in S where substring sub is found,\n\
4092 such that sub is contained within s[start,end].  Optional\n\
4093 arguments start and end are interpreted as in slice notation.\n\
4094 \n\
4095 Return -1 on failure.";
4096
4097 static PyObject *
4098 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4099 {
4100     PyUnicodeObject *substring;
4101     int start = 0;
4102     int end = INT_MAX;
4103     PyObject *result;
4104
4105     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4106                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4107         return NULL;
4108     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4109                                                 (PyObject *)substring);
4110     if (substring == NULL)
4111         return NULL;
4112
4113     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4114
4115     Py_DECREF(substring);
4116     return result;
4117 }
4118
4119 static char rindex__doc__[] =
4120 "S.rindex(sub [,start [,end]]) -> int\n\
4121 \n\
4122 Like S.rfind() but raise ValueError when the substring is not found.";
4123
4124 static PyObject *
4125 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4126 {
4127     int result;
4128     PyUnicodeObject *substring;
4129     int start = 0;
4130     int end = INT_MAX;
4131
4132     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4133                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4134         return NULL;
4135     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4136                                                 (PyObject *)substring);
4137     if (substring == NULL)
4138         return NULL;
4139
4140     result = findstring(self, substring, start, end, -1);
4141
4142     Py_DECREF(substring);
4143     if (result < 0) {
4144         PyErr_SetString(PyExc_ValueError, "substring not found");
4145         return NULL;
4146     }
4147     return PyInt_FromLong(result);
4148 }
4149
4150 static char rjust__doc__[] =
4151 "S.rjust(width) -> unicode\n\
4152 \n\
4153 Return S right justified in a Unicode string of length width. Padding is\n\
4154 done using spaces.";
4155
4156 static PyObject *
4157 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4158 {
4159     int width;
4160     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4161         return NULL;
4162
4163     if (self->length >= width) {
4164         Py_INCREF(self);
4165         return (PyObject*) self;
4166     }
4167
4168     return (PyObject*) pad(self, width - self->length, 0, ' ');
4169 }
4170
4171 static char rstrip__doc__[] =
4172 "S.rstrip() -> unicode\n\
4173 \n\
4174 Return a copy of the string S with trailing whitespace removed.";
4175
4176 static PyObject *
4177 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4178 {
4179     if (!PyArg_NoArgs(args))
4180         return NULL;
4181     return strip(self, 0, 1);
4182 }
4183
4184 static PyObject*
4185 unicode_slice(PyUnicodeObject *self, int start, int end)
4186 {
4187     /* standard clamping */
4188     if (start < 0)
4189         start = 0;
4190     if (end < 0)
4191         end = 0;
4192     if (end > self->length)
4193         end = self->length;
4194     if (start == 0 && end == self->length) {
4195         /* full slice, return original string */
4196         Py_INCREF(self);
4197         return (PyObject*) self;
4198     }
4199     if (start > end)
4200         start = end;
4201     /* copy slice */
4202     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4203                                              end - start);
4204 }
4205
4206 PyObject *PyUnicode_Split(PyObject *s,
4207                           PyObject *sep,
4208                           int maxsplit)
4209 {
4210     PyObject *result;
4211
4212     s = PyUnicode_FromObject(s);
4213     if (s == NULL)
4214         return NULL;
4215     if (sep != NULL) {
4216         sep = PyUnicode_FromObject(sep);
4217         if (sep == NULL) {
4218             Py_DECREF(s);
4219             return NULL;
4220         }
4221     }
4222
4223     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4224
4225     Py_DECREF(s);
4226     Py_XDECREF(sep);
4227     return result;
4228 }
4229
4230 static char split__doc__[] =
4231 "S.split([sep [,maxsplit]]) -> list of strings\n\
4232 \n\
4233 Return a list of the words in S, using sep as the\n\
4234 delimiter string.  If maxsplit is given, at most maxsplit\n\
4235 splits are done. If sep is not specified, any whitespace string\n\
4236 is a separator.";
4237
4238 static PyObject*
4239 unicode_split(PyUnicodeObject *self, PyObject *args)
4240 {
4241     PyObject *substring = Py_None;
4242     int maxcount = -1;
4243
4244     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4245         return NULL;
4246
4247     if (substring == Py_None)
4248         return split(self, NULL, maxcount);
4249     else if (PyUnicode_Check(substring))
4250         return split(self, (PyUnicodeObject *)substring, maxcount);
4251     else
4252         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4253 }
4254
4255 static char splitlines__doc__[] =
4256 "S.splitlines([keepends]]) -> list of strings\n\
4257 \n\
4258 Return a list of the lines in S, breaking at line boundaries.\n\
4259 Line breaks are not included in the resulting list unless keepends\n\
4260 is given and true.";
4261
4262 static PyObject*
4263 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4264 {
4265     int keepends = 0;
4266
4267     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4268         return NULL;
4269
4270     return PyUnicode_Splitlines((PyObject *)self, keepends);
4271 }
4272
4273 static
4274 PyObject *unicode_str(PyUnicodeObject *self)
4275 {
4276     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4277 }
4278
4279 static char strip__doc__[] =
4280 "S.strip() -> unicode\n\
4281 \n\
4282 Return a copy of S with leading and trailing whitespace removed.";
4283
4284 static PyObject *
4285 unicode_strip(PyUnicodeObject *self, PyObject *args)
4286 {
4287     if (!PyArg_NoArgs(args))
4288         return NULL;
4289     return strip(self, 1, 1);
4290 }
4291
4292 static char swapcase__doc__[] =
4293 "S.swapcase() -> unicode\n\
4294 \n\
4295 Return a copy of S with uppercase characters converted to lowercase\n\
4296 and vice versa.";
4297
4298 static PyObject*
4299 unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4300 {
4301     if (!PyArg_NoArgs(args))
4302         return NULL;
4303     return fixup(self, fixswapcase);
4304 }
4305
4306 static char translate__doc__[] =
4307 "S.translate(table) -> unicode\n\
4308 \n\
4309 Return a copy of the string S, where all characters have been mapped\n\
4310 through the given translation table, which must be a mapping of\n\
4311 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4312 are left untouched. Characters mapped to None are deleted.";
4313
4314 static PyObject*
4315 unicode_translate(PyUnicodeObject *self, PyObject *args)
4316 {
4317     PyObject *table;
4318
4319     if (!PyArg_ParseTuple(args, "O:translate", &table))
4320         return NULL;
4321     return PyUnicode_TranslateCharmap(self->str,
4322                                       self->length,
4323                                       table,
4324                                       "ignore");
4325 }
4326
4327 static char upper__doc__[] =
4328 "S.upper() -> unicode\n\
4329 \n\
4330 Return a copy of S converted to uppercase.";
4331
4332 static PyObject*
4333 unicode_upper(PyUnicodeObject *self, PyObject *args)
4334 {
4335     if (!PyArg_NoArgs(args))
4336         return NULL;
4337     return fixup(self, fixupper);
4338 }
4339
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   Returns self unchanged when it is already at least width long.
   Otherwise the string is left-padded with '0' via pad(), and a leading
   '+' or '-' of the original text is moved in front of the padding. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* Do not touch u->str if pad() failed (presumably it reports
       failure by returning NULL -- TODO confirm against pad()). */
    if (u == NULL)
        return NULL;

    /* After padding, the original first character sits at index fill. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4375
#if 0
/* Debugging helper (compiled out): takes no arguments and returns
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4385
4386 static char startswith__doc__[] =
4387 "S.startswith(prefix[, start[, end]]) -> int\n\
4388 \n\
4389 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4390 optional start, test S beginning at that position.  With optional end, stop\n\
4391 comparing S at that position.";
4392
4393 static PyObject *
4394 unicode_startswith(PyUnicodeObject *self,
4395                    PyObject *args)
4396 {
4397     PyUnicodeObject *substring;
4398     int start = 0;
4399     int end = INT_MAX;
4400     PyObject *result;
4401
4402     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4403                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4404         return NULL;
4405     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4406                                                 (PyObject *)substring);
4407     if (substring == NULL)
4408         return NULL;
4409
4410     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4411
4412     Py_DECREF(substring);
4413     return result;
4414 }
4415
4416
4417 static char endswith__doc__[] =
4418 "S.endswith(suffix[, start[, end]]) -> int\n\
4419 \n\
4420 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4421 optional start, test S beginning at that position.  With optional end, stop\n\
4422 comparing S at that position.";
4423
4424 static PyObject *
4425 unicode_endswith(PyUnicodeObject *self,
4426                  PyObject *args)
4427 {
4428     PyUnicodeObject *substring;
4429     int start = 0;
4430     int end = INT_MAX;
4431     PyObject *result;
4432
4433     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4434                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4435         return NULL;
4436     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4437                                                 (PyObject *)substring);
4438     if (substring == NULL)
4439         return NULL;
4440
4441     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4442
4443     Py_DECREF(substring);
4444     return result;
4445 }
4446
4447
4448 static PyMethodDef unicode_methods[] = {
4449
4450     /* Order is according to common usage: often used methods should
4451        appear first, since lookup is done sequentially. */
4452
4453     {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4454     {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4455     {"split", (PyCFunction) unicode_split, 1, split__doc__},
4456     {"join", (PyCFunction) unicode_join, 1, join__doc__},
4457     {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4458     {"title", (PyCFunction) unicode_title, 0, title__doc__},
4459     {"center", (PyCFunction) unicode_center, 1, center__doc__},
4460     {"count", (PyCFunction) unicode_count, 1, count__doc__},
4461     {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4462     {"find", (PyCFunction) unicode_find, 1, find__doc__},
4463     {"index", (PyCFunction) unicode_index, 1, index__doc__},
4464     {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4465     {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4466     {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4467 /*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4468     {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4469     {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4470     {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4471     {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4472     {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4473     {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4474     {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4475     {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4476     {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4477     {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4478     {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4479     {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4480     {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4481     {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4482     {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4483     {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4484     {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4485     {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4486     {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4487     {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
4488 #if 0
4489     {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4490     {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4491 #endif
4492
4493 #if 0
4494     /* This one is just used for debugging the implementation. */
4495     {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4496 #endif
4497
4498     {NULL, NULL}
4499 };
4500
4501 static PyObject *
4502 unicode_getattr(PyUnicodeObject *self, char *name)
4503 {
4504     return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4505 }
4506
4507 static PySequenceMethods unicode_as_sequence = {
4508     (inquiry) unicode_length,           /* sq_length */
4509     (binaryfunc) PyUnicode_Concat,      /* sq_concat */
4510     (intargfunc) unicode_repeat,        /* sq_repeat */
4511     (intargfunc) unicode_getitem,       /* sq_item */
4512     (intintargfunc) unicode_slice,      /* sq_slice */
4513     0,                                  /* sq_ass_item */
4514     0,                                  /* sq_ass_slice */
4515     (objobjproc)PyUnicode_Contains,     /*sq_contains*/
4516 };
4517
4518 static int
4519 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4520                           int index,
4521                           const void **ptr)
4522 {
4523     if (index != 0) {
4524         PyErr_SetString(PyExc_SystemError,
4525                         "accessing non-existent unicode segment");
4526         return -1;
4527     }
4528     *ptr = (void *) self->str;
4529     return PyUnicode_GET_DATA_SIZE(self);
4530 }
4531
4532 static int
4533 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4534                            const void **ptr)
4535 {
4536     PyErr_SetString(PyExc_TypeError,
4537                     "cannot use unicode as modifyable buffer");
4538     return -1;
4539 }
4540
4541 static int
4542 unicode_buffer_getsegcount(PyUnicodeObject *self,
4543                            int *lenp)
4544 {
4545     if (lenp)
4546         *lenp = PyUnicode_GET_DATA_SIZE(self);
4547     return 1;
4548 }
4549
4550 static int
4551 unicode_buffer_getcharbuf(PyUnicodeObject *self,
4552                           int index,
4553                           const void **ptr)
4554 {
4555     PyObject *str;
4556
4557     if (index != 0) {
4558         PyErr_SetString(PyExc_SystemError,
4559                         "accessing non-existent unicode segment");
4560         return -1;
4561     }
4562     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
4563     if (str == NULL)
4564         return -1;
4565     *ptr = (void *) PyString_AS_STRING(str);
4566     return PyString_GET_SIZE(str);
4567 }
4568
4569 /* Helpers for PyUnicode_Format() */
4570
4571 static PyObject *
4572 getnextarg(PyObject *args, int arglen, int *p_argidx)
4573 {
4574     int argidx = *p_argidx;
4575     if (argidx < arglen) {
4576         (*p_argidx)++;
4577         if (arglen < 0)
4578             return args;
4579         else
4580             return PyTuple_GetItem(args, argidx);
4581     }
4582     PyErr_SetString(PyExc_TypeError,
4583                     "not enough arguments for format string");
4584     return NULL;
4585 }
4586
/* Bit flags collected from the flag characters of a format specifier
   in PyUnicode_Format(). */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4592
4593 static
4594 int usprintf(register Py_UNICODE *buffer, char *format, ...)
4595 {
4596     register int i;
4597     int len;
4598     va_list va;
4599     char *charbuffer;
4600     va_start(va, format);
4601
4602     /* First, format the string as char array, then expand to Py_UNICODE
4603        array. */
4604     charbuffer = (char *)buffer;
4605     len = vsprintf(charbuffer, format, va);
4606     for (i = len - 1; i >= 0; i--)
4607         buffer[i] = (Py_UNICODE) charbuffer[i];
4608
4609     va_end(va);
4610     return len;
4611 }
4612
4613 static int
4614 formatfloat(Py_UNICODE *buf,
4615             size_t buflen,
4616             int flags,
4617             int prec,
4618             int type,
4619             PyObject *v)
4620 {
4621     /* fmt = '%#.' + `prec` + `type`
4622        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
4623     char fmt[20];
4624     double x;
4625
4626     x = PyFloat_AsDouble(v);
4627     if (x == -1.0 && PyErr_Occurred())
4628         return -1;
4629     if (prec < 0)
4630         prec = 6;
4631     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4632         type = 'g';
4633     sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4634     /* worst case length calc to ensure no buffer overrun:
4635          fmt = %#.<prec>g
4636          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4637             for any double rep.)
4638          len = 1 + prec + 1 + 2 + 5 = 9 + prec
4639        If prec=0 the effective precision is 1 (the leading digit is
4640        always given), therefore increase by one to 10+prec. */
4641     if (buflen <= (size_t)10 + (size_t)prec) {
4642         PyErr_SetString(PyExc_OverflowError,
4643             "formatted float is too long (precision too long?)");
4644         return -1;
4645     }
4646     return usprintf(buf, fmt, x);
4647 }
4648
4649 static int
4650 formatint(Py_UNICODE *buf,
4651           size_t buflen,
4652           int flags,
4653           int prec,
4654           int type,
4655           PyObject *v)
4656 {
4657     /* fmt = '%#.' + `prec` + 'l' + `type`
4658        worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
4659     char fmt[20];
4660     long x;
4661
4662     x = PyInt_AsLong(v);
4663     if (x == -1 && PyErr_Occurred())
4664         return -1;
4665     if (prec < 0)
4666         prec = 1;
4667     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4668        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4669     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4670         PyErr_SetString(PyExc_OverflowError,
4671             "formatted integer is too long (precision too long?)");
4672         return -1;
4673     }
4674     sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4675     return usprintf(buf, fmt, x);
4676 }
4677
4678 static int
4679 formatchar(Py_UNICODE *buf,
4680            size_t buflen,
4681            PyObject *v)
4682 {
4683     /* presume that the buffer is at least 2 characters long */
4684     if (PyUnicode_Check(v)) {
4685         if (PyUnicode_GET_SIZE(v) != 1)
4686             goto onError;
4687         buf[0] = PyUnicode_AS_UNICODE(v)[0];
4688     }
4689
4690     else if (PyString_Check(v)) {
4691         if (PyString_GET_SIZE(v) != 1)
4692             goto onError;
4693         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4694     }
4695
4696     else {
4697         /* Integer input truncated to a character */
4698         long x;
4699         x = PyInt_AsLong(v);
4700         if (x == -1 && PyErr_Occurred())
4701             goto onError;
4702         buf[0] = (char) x;
4703     }
4704     buf[1] = '\0';
4705     return 1;
4706
4707  onError:
4708     PyErr_SetString(PyExc_TypeError,
4709                     "%c requires int or char");
4710     return -1;
4711 }
4712
4713 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4714
4715    FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4716    chars are formatted. XXX This is a magic number. Each formatting
4717    routine does bounds checking to ensure no overflow, but a better
4718    solution may be to malloc a buffer of appropriate size for each
4719    format. For now, the current solution is sufficient.
4720 */
4721 #define FORMATBUFLEN (size_t)120
4722
4723 PyObject *PyUnicode_Format(PyObject *format,
4724                            PyObject *args)
4725 {
4726     Py_UNICODE *fmt, *res;
4727     int fmtcnt, rescnt, reslen, arglen, argidx;
4728     int args_owned = 0;
4729     PyUnicodeObject *result = NULL;
4730     PyObject *dict = NULL;
4731     PyObject *uformat;
4732
4733     if (format == NULL || args == NULL) {
4734         PyErr_BadInternalCall();
4735         return NULL;
4736     }
4737     uformat = PyUnicode_FromObject(format);
4738     if (uformat == NULL)
4739         return NULL;
4740     fmt = PyUnicode_AS_UNICODE(uformat);
4741     fmtcnt = PyUnicode_GET_SIZE(uformat);
4742
4743     reslen = rescnt = fmtcnt + 100;
4744     result = _PyUnicode_New(reslen);
4745     if (result == NULL)
4746         goto onError;
4747     res = PyUnicode_AS_UNICODE(result);
4748
4749     if (PyTuple_Check(args)) {
4750         arglen = PyTuple_Size(args);
4751         argidx = 0;
4752     }
4753     else {
4754         arglen = -1;
4755         argidx = -2;
4756     }
4757     if (args->ob_type->tp_as_mapping)
4758         dict = args;
4759
4760     while (--fmtcnt >= 0) {
4761         if (*fmt != '%') {
4762             if (--rescnt < 0) {
4763                 rescnt = fmtcnt + 100;
4764                 reslen += rescnt;
4765                 if (_PyUnicode_Resize(result, reslen) < 0)
4766                     return NULL;
4767                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4768                 --rescnt;
4769             }
4770             *res++ = *fmt++;
4771         }
4772         else {
4773             /* Got a format specifier */
4774             int flags = 0;
4775             int width = -1;
4776             int prec = -1;
4777             int size = 0;
4778             Py_UNICODE c = '\0';
4779             Py_UNICODE fill;
4780             PyObject *v = NULL;
4781             PyObject *temp = NULL;
4782             Py_UNICODE *pbuf;
4783             Py_UNICODE sign;
4784             int len;
4785             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
4786
4787             fmt++;
4788             if (*fmt == '(') {
4789                 Py_UNICODE *keystart;
4790                 int keylen;
4791                 PyObject *key;
4792                 int pcount = 1;
4793
4794                 if (dict == NULL) {
4795                     PyErr_SetString(PyExc_TypeError,
4796                                     "format requires a mapping");
4797                     goto onError;
4798                 }
4799                 ++fmt;
4800                 --fmtcnt;
4801                 keystart = fmt;
4802                 /* Skip over balanced parentheses */
4803                 while (pcount > 0 && --fmtcnt >= 0) {
4804                     if (*fmt == ')')
4805                         --pcount;
4806                     else if (*fmt == '(')
4807                         ++pcount;
4808                     fmt++;
4809                 }
4810                 keylen = fmt - keystart - 1;
4811                 if (fmtcnt < 0 || pcount > 0) {
4812                     PyErr_SetString(PyExc_ValueError,
4813                                     "incomplete format key");
4814                     goto onError;
4815                 }
4816                 /* keys are converted to strings using UTF-8 and
4817                    then looked up since Python uses strings to hold
4818                    variables names etc. in its namespaces and we
4819                    wouldn't want to break common idioms. */
4820                 key = PyUnicode_EncodeUTF8(keystart,
4821                                            keylen,
4822                                            NULL);
4823                 if (key == NULL)
4824                     goto onError;
4825                 if (args_owned) {
4826                     Py_DECREF(args);
4827                     args_owned = 0;
4828                 }
4829                 args = PyObject_GetItem(dict, key);
4830                 Py_DECREF(key);
4831                 if (args == NULL) {
4832                     goto onError;
4833                 }
4834                 args_owned = 1;
4835                 arglen = -1;
4836                 argidx = -2;
4837             }
4838             while (--fmtcnt >= 0) {
4839                 switch (c = *fmt++) {
4840                 case '-': flags |= F_LJUST; continue;
4841                 case '+': flags |= F_SIGN; continue;
4842                 case ' ': flags |= F_BLANK; continue;
4843                 case '#': flags |= F_ALT; continue;
4844                 case '0': flags |= F_ZERO; continue;
4845                 }
4846                 break;
4847             }
4848             if (c == '*') {
4849                 v = getnextarg(args, arglen, &argidx);
4850                 if (v == NULL)
4851                     goto onError;
4852                 if (!PyInt_Check(v)) {
4853                     PyErr_SetString(PyExc_TypeError,
4854                                     "* wants int");
4855                     goto onError;
4856                 }
4857                 width = PyInt_AsLong(v);
4858                 if (width < 0) {
4859                     flags |= F_LJUST;
4860                     width = -width;
4861                 }
4862                 if (--fmtcnt >= 0)
4863                     c = *fmt++;
4864             }
4865             else if (c >= '0' && c <= '9') {
4866                 width = c - '0';
4867                 while (--fmtcnt >= 0) {
4868                     c = *fmt++;
4869                     if (c < '0' || c > '9')
4870                         break;
4871                     if ((width*10) / 10 != width) {
4872                         PyErr_SetString(PyExc_ValueError,
4873                                         "width too big");
4874                         goto onError;
4875                     }
4876                     width = width*10 + (c - '0');
4877                 }
4878             }
4879             if (c == '.') {
4880                 prec = 0;
4881                 if (--fmtcnt >= 0)
4882                     c = *fmt++;
4883                 if (c == '*') {
4884                     v = getnextarg(args, arglen, &argidx);
4885                     if (v == NULL)
4886                         goto onError;
4887                     if (!PyInt_Check(v)) {
4888                         PyErr_SetString(PyExc_TypeError,
4889                                         "* wants int");
4890                         goto onError;
4891                     }
4892                     prec = PyInt_AsLong(v);
4893                     if (prec < 0)
4894                         prec = 0;
4895                     if (--fmtcnt >= 0)
4896                         c = *fmt++;
4897                 }
4898                 else if (c >= '0' && c <= '9') {
4899                     prec = c - '0';
4900                     while (--fmtcnt >= 0) {
4901                         c = Py_CHARMASK(*fmt++);
4902                         if (c < '0' || c > '9')
4903                             break;
4904                         if ((prec*10) / 10 != prec) {
4905                             PyErr_SetString(PyExc_ValueError,
4906                                             "prec too big");
4907                             goto onError;
4908                         }
4909                         prec = prec*10 + (c - '0');
4910                     }
4911                 }
4912             } /* prec */
4913             if (fmtcnt >= 0) {
4914                 if (c == 'h' || c == 'l' || c == 'L') {
4915                     size = c;
4916                     if (--fmtcnt >= 0)
4917                         c = *fmt++;
4918                 }
4919             }
4920             if (fmtcnt < 0) {
4921                 PyErr_SetString(PyExc_ValueError,
4922                                 "incomplete format");
4923                 goto onError;
4924             }
4925             if (c != '%') {
4926                 v = getnextarg(args, arglen, &argidx);
4927                 if (v == NULL)
4928                     goto onError;
4929             }
4930             sign = 0;
4931             fill = ' ';
4932             switch (c) {
4933
4934             case '%':
4935                 pbuf = formatbuf;
4936                 /* presume that buffer length is at least 1 */
4937                 pbuf[0] = '%';
4938                 len = 1;
4939                 break;
4940
4941             case 's':
4942             case 'r':
4943                 if (PyUnicode_Check(v) && c == 's') {
4944                     temp = v;
4945                     Py_INCREF(temp);
4946                 }
4947                 else {
4948                     PyObject *unicode;
4949                     if (c == 's')
4950                         temp = PyObject_Str(v);
4951                     else
4952                         temp = PyObject_Repr(v);
4953                     if (temp == NULL)
4954                         goto onError;
4955                     if (!PyString_Check(temp)) {
4956                         /* XXX Note: this should never happen, since
4957                                PyObject_Repr() and PyObject_Str() assure
4958                                this */
4959                         Py_DECREF(temp);
4960                         PyErr_SetString(PyExc_TypeError,
4961                                         "%s argument has non-string str()");
4962                         goto onError;
4963                     }
4964                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
4965                                                    PyString_GET_SIZE(temp),
4966                                                NULL,
4967                                                    "strict");
4968                     Py_DECREF(temp);
4969                     temp = unicode;
4970                     if (temp == NULL)
4971                         goto onError;
4972                 }
4973                 pbuf = PyUnicode_AS_UNICODE(temp);
4974                 len = PyUnicode_GET_SIZE(temp);
4975                 if (prec >= 0 && len > prec)
4976                     len = prec;
4977                 break;
4978
4979             case 'i':
4980             case 'd':
4981             case 'u':
4982             case 'o':
4983             case 'x':
4984             case 'X':
4985                 if (c == 'i')
4986                     c = 'd';
4987                 pbuf = formatbuf;
4988                 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
4989                         flags, prec, c, v);
4990                 if (len < 0)
4991                     goto onError;
4992                 sign = (c == 'd');
4993                 if (flags & F_ZERO) {
4994                     fill = '0';
4995                     if ((flags&F_ALT) &&
4996                         (c == 'x' || c == 'X') &&
4997                         pbuf[0] == '0' && pbuf[1] == c) {
4998                         *res++ = *pbuf++;
4999                         *res++ = *pbuf++;
5000                         rescnt -= 2;
5001                         len -= 2;
5002                         width -= 2;
5003                         if (width < 0)
5004                             width = 0;
5005                     }
5006                 }
5007                 break;
5008
5009             case 'e':
5010             case 'E':
5011             case 'f':
5012             case 'g':
5013             case 'G':
5014                 pbuf = formatbuf;
5015                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5016                         flags, prec, c, v);
5017                 if (len < 0)
5018                     goto onError;
5019                 sign = 1;
5020                 if (flags&F_ZERO)
5021                     fill = '0';
5022                 break;
5023
5024             case 'c':
5025                 pbuf = formatbuf;
5026                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5027                 if (len < 0)
5028                     goto onError;
5029                 break;
5030
5031             default:
5032                 PyErr_Format(PyExc_ValueError,
5033                              "unsupported format character '%c' (0x%x)",
5034                              c, c);
5035                 goto onError;
5036             }
5037             if (sign) {
5038                 if (*pbuf == '-' || *pbuf == '+') {
5039                     sign = *pbuf++;
5040                     len--;
5041                 }
5042                 else if (flags & F_SIGN)
5043                     sign = '+';
5044                 else if (flags & F_BLANK)
5045                     sign = ' ';
5046                 else
5047                     sign = 0;
5048             }
5049             if (width < len)
5050                 width = len;
5051             if (rescnt < width + (sign != 0)) {
5052                 reslen -= rescnt;
5053                 rescnt = width + fmtcnt + 100;
5054                 reslen += rescnt;
5055                 if (_PyUnicode_Resize(result, reslen) < 0)
5056                     return NULL;
5057                 res = PyUnicode_AS_UNICODE(result)
5058                     + reslen - rescnt;
5059             }
5060             if (sign) {
5061                 if (fill != ' ')
5062                     *res++ = sign;
5063                 rescnt--;
5064                 if (width > len)
5065                     width--;
5066             }
5067             if (width > len && !(flags & F_LJUST)) {
5068                 do {
5069                     --rescnt;
5070                     *res++ = fill;
5071                 } while (--width > len);
5072             }
5073             if (sign && fill == ' ')
5074                 *res++ = sign;
5075             memcpy(res, pbuf, len * sizeof(Py_UNICODE));
5076             res += len;
5077             rescnt -= len;
5078             while (--width >= len) {
5079                 --rescnt;
5080                 *res++ = ' ';
5081             }
5082             if (dict && (argidx < arglen) && c != '%') {
5083                 PyErr_SetString(PyExc_TypeError,
5084                                 "not all arguments converted");
5085                 goto onError;
5086             }
5087             Py_XDECREF(temp);
5088         } /* '%' */
5089     } /* until end */
5090     if (argidx < arglen && !dict) {
5091         PyErr_SetString(PyExc_TypeError,
5092                         "not all arguments converted");
5093         goto onError;
5094     }
5095
5096     if (args_owned) {
5097         Py_DECREF(args);
5098     }
5099     Py_DECREF(uformat);
5100     if (_PyUnicode_Resize(result, reslen - rescnt))
5101         goto onError;
5102     return (PyObject *)result;
5103
5104  onError:
5105     Py_XDECREF(result);
5106     Py_DECREF(uformat);
5107     if (args_owned) {
5108         Py_DECREF(args);
5109     }
5110     return NULL;
5111 }
5112 /* Buffer-interface slot table, installed as PyUnicode_Type.tp_as_buffer. */
5113 static PyBufferProcs unicode_as_buffer = {
5114     (getreadbufferproc) unicode_buffer_getreadbuf,     /* read buffer */
5115     (getwritebufferproc) unicode_buffer_getwritebuf,   /* write buffer */
5116     (getsegcountproc) unicode_buffer_getsegcount,      /* segment count */
5117     (getcharbufferproc) unicode_buffer_getcharbuf,     /* char buffer */
5118 };
5119 /* The "unicode" type object; entries are positional, so order matters. */
5120 PyTypeObject PyUnicode_Type = {
5121     PyObject_HEAD_INIT(&PyType_Type)
5122     0,                                  /* ob_size */
5123     "unicode",                          /* tp_name */
5124     sizeof(PyUnicodeObject),            /* tp_size: instance size (NOTE(review): field is presumably tp_basicsize) */
5125     0,                                  /* tp_itemsize */
5126     /* Slots (a 0/NULL entry means no handler is supplied here) */
5127     (destructor)_PyUnicode_Free,        /* tp_dealloc */
5128     0,                                  /* tp_print */
5129     (getattrfunc)unicode_getattr,       /* tp_getattr */
5130     0,                                  /* tp_setattr */
5131     (cmpfunc) unicode_compare,          /* tp_compare */
5132     (reprfunc) unicode_repr,            /* tp_repr */
5133     0,                                  /* tp_as_number */
5134     &unicode_as_sequence,               /* tp_as_sequence */
5135     0,                                  /* tp_as_mapping */
5136     (hashfunc) unicode_hash,            /* tp_hash */
5137     0,                                  /* tp_call */
5138     (reprfunc) unicode_str,             /* tp_str */
5139     (getattrofunc) NULL,                /* tp_getattro */
5140     (setattrofunc) NULL,                /* tp_setattro */
5141     &unicode_as_buffer,                 /* tp_as_buffer */
5142     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
5143 };  /* members after tp_flags are omitted and thus zero-initialized by C */
5144
5145 /* Initialize the Unicode implementation.  Aborts via Py_FatalError() if
5146    Py_UNICODE is not 2 bytes wide or the empty string cannot be created. */
5147 void _PyUnicode_Init(void)
5148 {
5149     /* Doublecheck the configuration... */
5150     if (sizeof(Py_UNICODE) != 2)
5151         Py_FatalError("Unicode configuration error: "
5152                       "sizeof(Py_UNICODE) != 2 bytes");
5153     /* Init the implementation; no error return, so a failed alloc is fatal */
5154     unicode_freelist = NULL;
5155     unicode_freelist_size = 0;
5156     if ((unicode_empty = _PyUnicode_New(0)) == NULL)
5157         Py_FatalError("Unicode init error: can't create the empty string");
5158     strcpy(unicode_default_encoding, "ascii");
5159 }
5160
5161 /* Finalize the Unicode implementation: free every object still parked on
5162    the free list, then release the reference held in unicode_empty. */
5163 void
5164 _PyUnicode_Fini(void)
5165 {
5166     PyUnicodeObject *node, *next;
5167
5168     for (node = unicode_freelist; node != NULL; node = next) {
5169         /* The link to the next entry is read from the start of the object. */
5170         next = *(PyUnicodeObject **)node;
5171         if (node->str != NULL)
5172             PyMem_DEL(node->str);
5173         Py_XDECREF(node->defenc);
5174         PyObject_DEL(node);
5175     }
5176     unicode_freelist_size = 0;
5177     unicode_freelist = NULL;
5178     Py_XDECREF(unicode_empty);
5179     unicode_empty = NULL;
5180 }