Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Copyright (c) Corporation for National Research Initiatives.
   8
   9 --------------------------------------------------------------------
  10 The original string type implementation is:
  11
  12     Copyright (c) 1999 by Secret Labs AB
  13     Copyright (c) 1999 by Fredrik Lundh
  14
  15 By obtaining, using, and/or copying this software and/or its
  16 associated documentation, you agree that you have read, understood,
  17 and will comply with the following terms and conditions:
  18
  19 Permission to use, copy, modify, and distribute this software and its
  20 associated documentation for any purpose and without fee is hereby
  21 granted, provided that the above copyright notice appears in all
  22 copies, and that both that copyright notice and this permission notice
  23 appear in supporting documentation, and that the name of Secret Labs
  24 AB or the author not be used in advertising or publicity pertaining to
  25 distribution of the software without specific, written prior
  26 permission.
  27
  28 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  29 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  30 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  31 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  32 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  33 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  34 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  35 --------------------------------------------------------------------
  36
  37 */
  38
  39 #include "Python.h"
  40
  41 #include "unicodeobject.h"
  42 #include "ucnhash.h"
  43
  44 #ifdef MS_WIN32
  45 #include <windows.h>
  46 #endif
  47
   48 /* Maximum number of deallocated Unicode objects kept on the free list for reuse */
   49
   50 #define MAX_UNICODE_FREELIST_SIZE       1024
  51
   52 /* Limit for the Unicode object free list stay alive optimization.
   53
   54    The implementation will keep allocated Unicode memory intact for
   55    all objects on the free list having a length (counted in Py_UNICODE
   56    units) below this limit. This reduces malloc() overhead for small objects.
   57
   58    At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   59    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   60    malloc()-overhead) bytes of unused garbage.
   61
   62    Setting the limit to 0 effectively turns the feature off.
   63
   64    Note: This is an experimental feature! If you get core dumps when
   65    using Unicode objects, turn this feature off.
   66
   67 */
   68
   69 #define KEEPALIVE_SIZE_LIMIT       9
  70
   71 /* Endianness switches: big endian if WORDS_BIGENDIAN is defined, little endian otherwise */
   72
   73 #ifdef WORDS_BIGENDIAN
   74 # define BYTEORDER_IS_BIG_ENDIAN
   75 #else
   76 # define BYTEORDER_IS_LITTLE_ENDIAN
   77 #endif
  78
   79 /* --- Globals ------------------------------------------------------------
   80
   81    The globals are initialized by the _PyUnicode_Init() API and should
   82    not be used before calling that API.
   83
   84 */
   85
   86 /* Free list of deallocated Unicode objects waiting to be reused */
   87 static PyUnicodeObject *unicode_freelist;  /* list head; the first word of each entry links to the next one */
   88 static int unicode_freelist_size;  /* number of entries currently on the free list */
   89
   90 /* The empty Unicode object is shared to improve performance. */
   91 static PyUnicodeObject *unicode_empty;
   92
   93 /* Single character Unicode strings in the Latin-1 range are being
   94    shared as well; the array is indexed by the character's code (0..255). */
   95 static PyUnicodeObject *unicode_latin1[256];
   96
   97 /* Default encoding to use and assume when NULL is passed as encoding
   98    parameter; it is initialized by _PyUnicode_Init().
   99
  100    Always use the PyUnicode_SetDefaultEncoding() and
  101    PyUnicode_GetDefaultEncoding() APIs to access this global.
  102
  103 */
  104 static char unicode_default_encoding[100];  /* encoding name, stored as a C string */
 105
 106 Py_UNICODE
 107 PyUnicode_GetMax(void)
 108 {
 109 #ifdef Py_UNICODE_WIDE
 110         return 0x10FFFF;
 111 #else
 112         /* This is actually an illegal character, so it should
 113            not be passed to unichr. */
 114         return 0xFFFF;
 115 #endif
 116 }
 117
 118 /* --- Unicode Object ----------------------------------------------------- */
 119
 120 static
 121 int unicode_resize(register PyUnicodeObject *unicode,
 122                       int length)
 123 {
 124     void *oldstr;
 125
 126     /* Shortcut if there's nothing much to do. */
 127     if (unicode->length == length)
 128         goto reset;
 129
 130     /* Resizing shared object (unicode_empty or single character
 131        objects) in-place is not allowed. Use PyUnicode_Resize()
 132        instead ! */
 133     if (unicode == unicode_empty ||
 134         (unicode->length == 1 &&
 135          unicode->str[0] < 256 &&
 136          unicode_latin1[unicode->str[0]] == unicode)) {
 137         PyErr_SetString(PyExc_SystemError,
 138                         "can't resize shared unicode objects");
 139         return -1;
 140     }
 141
 142     /* We allocate one more byte to make sure the string is
 143        Ux0000 terminated -- XXX is this needed ? */
 144     oldstr = unicode->str;
 145     PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
 146     if (!unicode->str) {
 147         unicode->str = oldstr;
 148         PyErr_NoMemory();
 149         return -1;
 150     }
 151     unicode->str[length] = 0;
 152     unicode->length = length;
 153
 154  reset:
 155     /* Reset the object caches */
 156     if (unicode->defenc) {
 157         Py_DECREF(unicode->defenc);
 158         unicode->defenc = NULL;
 159     }
 160     unicode->hash = -1;
 161
 162     return 0;
 163 }
 164
 165 /* We allocate one more byte to make sure the string is
 166    Ux0000 terminated -- XXX is this needed ?
 167
 168    XXX This allocator could further be enhanced by assuring that the
 169        free list never reduces its size below 1.
 170
 171 */
 172
 173 static
 174 PyUnicodeObject *_PyUnicode_New(int length)
 175 {
 176     register PyUnicodeObject *unicode;
 177
 178     /* Optimization for empty strings */
 179     if (length == 0 && unicode_empty != NULL) {
 180         Py_INCREF(unicode_empty);
 181         return unicode_empty;
 182     }
 183
 184     /* Unicode freelist & memory allocation */
 185     if (unicode_freelist) {
 186         unicode = unicode_freelist;
 187         unicode_freelist = *(PyUnicodeObject **)unicode;
 188         unicode_freelist_size--;
 189         if (unicode->str) {
 190             /* Keep-Alive optimization: we only upsize the buffer,
 191                never downsize it. */
 192             if ((unicode->length < length) &&
 193                 unicode_resize(unicode, length)) {
 194                 PyMem_DEL(unicode->str);
 195                 goto onError;
 196             }
 197         }
 198         else {
 199             unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 200         }
 201         PyObject_INIT(unicode, &PyUnicode_Type);
 202     }
 203     else {
 204         unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
 205         if (unicode == NULL)
 206             return NULL;
 207         unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
 208     }
 209
 210     if (!unicode->str) {
 211         PyErr_NoMemory();
 212         goto onError;
 213     }
 214     unicode->str[length] = 0;
 215     unicode->length = length;
 216     unicode->hash = -1;
 217     unicode->defenc = NULL;
 218     return unicode;
 219
 220  onError:
 221     _Py_ForgetReference((PyObject *)unicode);
 222     PyObject_DEL(unicode);
 223     return NULL;
 224 }
 225
 226 static
 227 void unicode_dealloc(register PyUnicodeObject *unicode)
 228 {
 229     if (PyUnicode_CheckExact(unicode) &&
 230         unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
 231         /* Keep-Alive optimization */
 232         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 233             PyMem_DEL(unicode->str);
 234             unicode->str = NULL;
 235             unicode->length = 0;
 236         }
 237         if (unicode->defenc) {
 238             Py_DECREF(unicode->defenc);
 239             unicode->defenc = NULL;
 240         }
 241         /* Add to free list */
 242         *(PyUnicodeObject **)unicode = unicode_freelist;
 243         unicode_freelist = unicode;
 244         unicode_freelist_size++;
 245     }
 246     else {
 247         PyMem_DEL(unicode->str);
 248         Py_XDECREF(unicode->defenc);
 249         unicode->ob_type->tp_free((PyObject *)unicode);
 250     }
 251 }
 252
 253 int PyUnicode_Resize(PyObject **unicode,
 254                      int length)
 255 {
 256     register PyUnicodeObject *v;
 257
 258     /* Argument checks */
 259     if (unicode == NULL) {
 260         PyErr_BadInternalCall();
 261         return -1;
 262     }
 263     v = (PyUnicodeObject *)*unicode;
 264     if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
 265         PyErr_BadInternalCall();
 266         return -1;
 267     }
 268
 269     /* Resizing unicode_empty and single character objects is not
 270        possible since these are being shared. We simply return a fresh
 271        copy with the same Unicode content. */
 272     if (v->length != length &&
 273         (v == unicode_empty || v->length == 1)) {
 274         PyUnicodeObject *w = _PyUnicode_New(length);
 275         if (w == NULL)
 276             return -1;
 277         Py_UNICODE_COPY(w->str, v->str,
 278                         length < v->length ? length : v->length);
 279         *unicode = (PyObject *)w;
 280         return 0;
 281     }
 282
 283     /* Note that we don't have to modify *unicode for unshared Unicode
 284        objects, since we can modify them in-place. */
 285     return unicode_resize(v, length);
 286 }
 287
 288 /* Internal API for use in unicodeobject.c only ! */
 289 #define _PyUnicode_Resize(unicodevar, length) \
 290         PyUnicode_Resize(((PyObject **)(unicodevar)), length)
 291
 292 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 293                                 int size)
 294 {
 295     PyUnicodeObject *unicode;
 296
 297     /* If the Unicode data is known at construction time, we can apply
 298        some optimizations which share commonly used objects. */
 299     if (u != NULL) {
 300
 301         /* Optimization for empty strings */
 302         if (size == 0 && unicode_empty != NULL) {
 303             Py_INCREF(unicode_empty);
 304             return (PyObject *)unicode_empty;
 305         }
 306
 307         /* Single character Unicode objects in the Latin-1 range are
 308            shared when using this constructor */
 309         if (size == 1 && *u < 256) {
 310             unicode = unicode_latin1[*u];
 311             if (!unicode) {
 312                 unicode = _PyUnicode_New(1);
 313                 if (!unicode)
 314                     return NULL;
 315                 unicode->str[0] = *u;
 316                 unicode_latin1[*u] = unicode;
 317             }
 318             Py_INCREF(unicode);
 319             return (PyObject *)unicode;
 320         }
 321     }
 322
 323     unicode = _PyUnicode_New(size);
 324     if (!unicode)
 325         return NULL;
 326
 327     /* Copy the Unicode data into the new object */
 328     if (u != NULL)
 329         Py_UNICODE_COPY(unicode->str, u, size);
 330
 331     return (PyObject *)unicode;
 332 }
 333
 334 #ifdef HAVE_WCHAR_H
 335
 336 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 337                                  int size)
 338 {
 339     PyUnicodeObject *unicode;
 340
 341     if (w == NULL) {
 342         PyErr_BadInternalCall();
 343         return NULL;
 344     }
 345
 346     unicode = _PyUnicode_New(size);
 347     if (!unicode)
 348         return NULL;
 349
 350     /* Copy the wchar_t data into the new object */
 351 #ifdef HAVE_USABLE_WCHAR_T
 352     memcpy(unicode->str, w, size * sizeof(wchar_t));
 353 #else
 354     {
 355         register Py_UNICODE *u;
 356         register int i;
 357         u = PyUnicode_AS_UNICODE(unicode);
 358         for (i = size; i >= 0; i--)
 359             *u++ = *w++;
 360     }
 361 #endif
 362
 363     return (PyObject *)unicode;
 364 }
 365
 366 int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
 367                          register wchar_t *w,
 368                          int size)
 369 {
 370     if (unicode == NULL) {
 371         PyErr_BadInternalCall();
 372         return -1;
 373     }
 374     if (size > PyUnicode_GET_SIZE(unicode))
 375         size = PyUnicode_GET_SIZE(unicode);
 376 #ifdef HAVE_USABLE_WCHAR_T
 377     memcpy(w, unicode->str, size * sizeof(wchar_t));
 378 #else
 379     {
 380         register Py_UNICODE *u;
 381         register int i;
 382         u = PyUnicode_AS_UNICODE(unicode);
 383         for (i = size; i >= 0; i--)
 384             *w++ = *u++;
 385     }
 386 #endif
 387
 388     return size;
 389 }
 390
 391 #endif
 392
 393 PyObject *PyUnicode_FromObject(register PyObject *obj)
 394 {
 395     /* XXX Perhaps we should make this API an alias of
 396            PyObject_Unicode() instead ?! */
 397     if (PyUnicode_CheckExact(obj)) {
 398         Py_INCREF(obj);
 399         return obj;
 400     }
 401     if (PyUnicode_Check(obj)) {
 402         /* For a Unicode subtype that's not a Unicode object,
 403            return a true Unicode object with the same data. */
 404         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
 405                                      PyUnicode_GET_SIZE(obj));
 406     }
 407     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
 408 }
 409
 410 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
 411                                       const char *encoding,
 412                                       const char *errors)
 413 {
 414     const char *s = NULL;
 415     int len;
 416     int owned = 0;
 417     PyObject *v;
 418
 419     if (obj == NULL) {
 420         PyErr_BadInternalCall();
 421         return NULL;
 422     }
 423
 424 #if 0
 425     /* For b/w compatibility we also accept Unicode objects provided
 426        that no encodings is given and then redirect to
 427        PyObject_Unicode() which then applies the additional logic for
 428        Unicode subclasses.
 429
 430        NOTE: This API should really only be used for object which
 431              represent *encoded* Unicode !
 432
 433     */
 434         if (PyUnicode_Check(obj)) {
 435             if (encoding) {
 436                 PyErr_SetString(PyExc_TypeError,
 437                                 "decoding Unicode is not supported");
 438             return NULL;
 439             }
 440         return PyObject_Unicode(obj);
 441             }
 442 #else
 443     if (PyUnicode_Check(obj)) {
 444         PyErr_SetString(PyExc_TypeError,
 445                         "decoding Unicode is not supported");
 446         return NULL;
 447         }
 448 #endif
 449
 450     /* Coerce object */
 451     if (PyString_Check(obj)) {
 452             s = PyString_AS_STRING(obj);
 453             len = PyString_GET_SIZE(obj);
 454             }
 455     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
 456         /* Overwrite the error message with something more useful in
 457            case of a TypeError. */
 458         if (PyErr_ExceptionMatches(PyExc_TypeError))
 459         PyErr_Format(PyExc_TypeError,
 460                          "coercing to Unicode: need string or buffer, "
 461                          "%.80s found",
 462                      obj->ob_type->tp_name);
 463         goto onError;
 464     }
 465
 466     /* Convert to Unicode */
 467     if (len == 0) {
 468         Py_INCREF(unicode_empty);
 469         v = (PyObject *)unicode_empty;
 470     }
 471     else
 472         v = PyUnicode_Decode(s, len, encoding, errors);
 473
 474     if (owned) {
 475         Py_DECREF(obj);
 476     }
 477     return v;
 478
 479  onError:
 480     if (owned) {
 481         Py_DECREF(obj);
 482     }
 483     return NULL;
 484 }
 485
 486 PyObject *PyUnicode_Decode(const char *s,
 487                            int size,
 488                            const char *encoding,
 489                            const char *errors)
 490 {
 491     PyObject *buffer = NULL, *unicode;
 492
 493     if (encoding == NULL)
 494         encoding = PyUnicode_GetDefaultEncoding();
 495
 496     /* Shortcuts for common default encodings */
 497     if (strcmp(encoding, "utf-8") == 0)
 498         return PyUnicode_DecodeUTF8(s, size, errors);
 499     else if (strcmp(encoding, "latin-1") == 0)
 500         return PyUnicode_DecodeLatin1(s, size, errors);
 501     else if (strcmp(encoding, "ascii") == 0)
 502         return PyUnicode_DecodeASCII(s, size, errors);
 503
 504     /* Decode via the codec registry */
 505     buffer = PyBuffer_FromMemory((void *)s, size);
 506     if (buffer == NULL)
 507         goto onError;
 508     unicode = PyCodec_Decode(buffer, encoding, errors);
 509     if (unicode == NULL)
 510         goto onError;
 511     if (!PyUnicode_Check(unicode)) {
 512         PyErr_Format(PyExc_TypeError,
 513                      "decoder did not return an unicode object (type=%.400s)",
 514                      unicode->ob_type->tp_name);
 515         Py_DECREF(unicode);
 516         goto onError;
 517     }
 518     Py_DECREF(buffer);
 519     return unicode;
 520
 521  onError:
 522     Py_XDECREF(buffer);
 523     return NULL;
 524 }
 525
 526 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
 527                            int size,
 528                            const char *encoding,
 529                            const char *errors)
 530 {
 531     PyObject *v, *unicode;
 532
 533     unicode = PyUnicode_FromUnicode(s, size);
 534     if (unicode == NULL)
 535         return NULL;
 536     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
 537     Py_DECREF(unicode);
 538     return v;
 539 }
 540
 541 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
 542                                     const char *encoding,
 543                                     const char *errors)
 544 {
 545     PyObject *v;
 546
 547     if (!PyUnicode_Check(unicode)) {
 548         PyErr_BadArgument();
 549         goto onError;
 550     }
 551
 552     if (encoding == NULL)
 553         encoding = PyUnicode_GetDefaultEncoding();
 554
 555     /* Shortcuts for common default encodings */
 556     if (errors == NULL) {
 557         if (strcmp(encoding, "utf-8") == 0)
 558             return PyUnicode_AsUTF8String(unicode);
 559         else if (strcmp(encoding, "latin-1") == 0)
 560             return PyUnicode_AsLatin1String(unicode);
 561         else if (strcmp(encoding, "ascii") == 0)
 562             return PyUnicode_AsASCIIString(unicode);
 563     }
 564
 565     /* Encode via the codec registry */
 566     v = PyCodec_Encode(unicode, encoding, errors);
 567     if (v == NULL)
 568         goto onError;
 569     /* XXX Should we really enforce this ? */
 570     if (!PyString_Check(v)) {
 571         PyErr_Format(PyExc_TypeError,
 572                      "encoder did not return a string object (type=%.400s)",
 573                      v->ob_type->tp_name);
 574         Py_DECREF(v);
 575         goto onError;
 576     }
 577     return v;
 578
 579  onError:
 580     return NULL;
 581 }
 582
 583 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
 584                                             const char *errors)
 585 {
 586     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
 587
 588     if (v)
 589         return v;
 590     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
 591     if (v && errors == NULL)
 592         ((PyUnicodeObject *)unicode)->defenc = v;
 593     return v;
 594 }
 595
 596 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
 597 {
 598     if (!PyUnicode_Check(unicode)) {
 599         PyErr_BadArgument();
 600         goto onError;
 601     }
 602     return PyUnicode_AS_UNICODE(unicode);
 603
 604  onError:
 605     return NULL;
 606 }
 607
 608 int PyUnicode_GetSize(PyObject *unicode)
 609 {
 610     if (!PyUnicode_Check(unicode)) {
 611         PyErr_BadArgument();
 612         goto onError;
 613     }
 614     return PyUnicode_GET_SIZE(unicode);
 615
 616  onError:
 617     return -1;
 618 }
 619
 620 const char *PyUnicode_GetDefaultEncoding(void)
 621 {
 622     return unicode_default_encoding;
 623 }
 624
 625 int PyUnicode_SetDefaultEncoding(const char *encoding)
 626 {
 627     PyObject *v;
 628
 629     /* Make sure the encoding is valid. As side effect, this also
 630        loads the encoding into the codec registry cache. */
 631     v = _PyCodec_Lookup(encoding);
 632     if (v == NULL)
 633         goto onError;
 634     Py_DECREF(v);
 635     strncpy(unicode_default_encoding,
 636             encoding,
 637             sizeof(unicode_default_encoding));
 638     return 0;
 639
 640  onError:
 641     return -1;
 642 }
 643
 644 /* --- UTF-7 Codec -------------------------------------------------------- */
 645
 646 /* see RFC2152 for details */
 647
 648 static
 649 char utf7_special[128] = {
 650     /* indicate whether a UTF-7 character is special i.e. cannot be directly
 651        encoded:
 652            0 - not special
 653            1 - special
 654            2 - whitespace (optional)
 655            3 - RFC2152 Set O (optional) */
 656     1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
 657     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 658     2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
 659     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
 660     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 661     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
 662     3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 663     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
 664
 665 };
 666
 667 #define SPECIAL(c, encodeO, encodeWS) \
 668         (((c)>127 || utf7_special[(c)] == 1) || \
 669          (encodeWS && (utf7_special[(c)] == 2)) || \
 670      (encodeO && (utf7_special[(c)] == 3)))
 671
 672 #define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
 673 #define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
 674 #define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
 675                         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
 676
 677 #define ENCODE(out, ch, bits) \
 678     while (bits >= 6) { \
 679         *out++ = B64(ch >> (bits-6)); \
 680         bits -= 6; \
 681     }
 682
 683 #define DECODE(out, ch, bits, surrogate) \
 684     while (bits >= 16) { \
 685         Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
 686         bits -= 16; \
 687                 if (surrogate) { \
 688                         /* We have already generated an error for the high surrogate
 689                so let's not bother seeing if the low surrogate is correct or not */\
 690                         surrogate = 0; \
 691                 } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
 692             /* This is a surrogate pair. Unfortunately we can't represent \
 693                it in a 16-bit character */ \
 694                         surrogate = 1; \
 695             errmsg = "code pairs are not supported"; \
 696                 goto utf7Error; \
 697                 } else { \
 698                                 *out++ = outCh; \
 699                 } \
 700     } \
 701
 702 static
 703 int utf7_decoding_error(Py_UNICODE **dest,
 704                         const char *errors,
 705                         const char *details)
 706 {
 707     if ((errors == NULL) ||
 708         (strcmp(errors,"strict") == 0)) {
 709         PyErr_Format(PyExc_UnicodeError,
 710                      "UTF-7 decoding error: %.400s",
 711                      details);
 712         return -1;
 713     }
 714     else if (strcmp(errors,"ignore") == 0) {
 715         return 0;
 716     }
 717     else if (strcmp(errors,"replace") == 0) {
 718         if (dest != NULL) {
 719             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 720             (*dest)++;
 721         }
 722         return 0;
 723     }
 724     else {
 725         PyErr_Format(PyExc_ValueError,
 726                      "UTF-7 decoding error; unknown error handling code: %.400s",
 727                      errors);
 728         return -1;
 729     }
 730 }
 731
 732 PyObject *PyUnicode_DecodeUTF7(const char *s,
 733                                int size,
 734                                const char *errors)
 735 {
 736     const char *e;
 737     PyUnicodeObject *unicode;
 738     Py_UNICODE *p;
 739     const char *errmsg = "";
 740     int inShift = 0;
 741     unsigned int bitsleft = 0;
 742     unsigned long charsleft = 0;
 743         int surrogate = 0;
 744
 745     unicode = _PyUnicode_New(size);
 746     if (!unicode)
 747         return NULL;
 748     if (size == 0)
 749         return (PyObject *)unicode;
 750
 751     p = unicode->str;
 752     e = s + size;
 753
 754     while (s < e) {
 755         Py_UNICODE ch = *s;
 756
 757         if (inShift) {
 758             if ((ch == '-') || !B64CHAR(ch)) {
 759                 inShift = 0;
 760                 s++;
 761
 762                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 763                 if (bitsleft >= 6) {
 764                     /* The shift sequence has a partial character in it. If
 765                        bitsleft < 6 then we could just classify it as padding
 766                        but that is not the case here */
 767
 768                     errmsg = "partial character in shift sequence";
 769                     goto utf7Error;
 770                 }
 771                 /* According to RFC2152 the remaining bits should be zero. We
 772                    choose to signal an error/insert a replacement character
 773                    here so indicate the potential of a misencoded character. */
 774
 775                 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
 776                 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
 777                     errmsg = "non-zero padding bits in shift sequence";
 778                     goto utf7Error;
 779                 }
 780
 781                 if (ch == '-') {
 782                     if ((s < e) && (*(s) == '-')) {
 783                         *p++ = '-';
 784                         inShift = 1;
 785                     }
 786                 } else if (SPECIAL(ch,0,0)) {
 787                     errmsg = "unexpected special character";
 788                         goto utf7Error;
 789                 } else  {
 790                     *p++ = ch;
 791                 }
 792             } else {
 793                 charsleft = (charsleft << 6) | UB64(ch);
 794                 bitsleft += 6;
 795                 s++;
 796                 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
 797             }
 798         }
 799         else if ( ch == '+' ) {
 800             s++;
 801             if (s < e && *s == '-') {
 802                 s++;
 803                 *p++ = '+';
 804             } else
 805             {
 806                 inShift = 1;
 807                 bitsleft = 0;
 808             }
 809         }
 810         else if (SPECIAL(ch,0,0)) {
 811             errmsg = "unexpected special character";
 812             s++;
 813                 goto utf7Error;
 814         }
 815         else {
 816             *p++ = ch;
 817             s++;
 818         }
 819         continue;
 820     utf7Error:
 821       if (utf7_decoding_error(&p, errors, errmsg))
 822           goto onError;
 823     }
 824
 825     if (inShift) {
 826         if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
 827             goto onError;
 828     }
 829
 830     if (_PyUnicode_Resize(&unicode, p - unicode->str))
 831         goto onError;
 832
 833     return (PyObject *)unicode;
 834
 835 onError:
 836     Py_DECREF(unicode);
 837     return NULL;
 838 }
 839
 840
 841 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
 842                    int size,
 843                    int encodeSetO,
 844                    int encodeWhiteSpace,
 845                    const char *errors)
 846 {
 847     PyObject *v;
 848     /* It might be possible to tighten this worst case */
 849     unsigned int cbAllocated = 5 * size;
 850     int inShift = 0;
 851     int i = 0;
 852     unsigned int bitsleft = 0;
 853     unsigned long charsleft = 0;
 854     char * out;
 855     char * start;
 856
 857     if (size == 0)
 858                 return PyString_FromStringAndSize(NULL, 0);
 859
 860     v = PyString_FromStringAndSize(NULL, cbAllocated);
 861     if (v == NULL)
 862         return NULL;
 863
 864     start = out = PyString_AS_STRING(v);
 865     for (;i < size; ++i) {
 866         Py_UNICODE ch = s[i];
 867
 868         if (!inShift) {
 869                         if (ch == '+') {
 870                                 *out++ = '+';
 871                 *out++ = '-';
 872             } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 873                 charsleft = ch;
 874                 bitsleft = 16;
 875                 *out++ = '+';
 876                                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 877                 inShift = bitsleft > 0;
 878                         } else {
 879                                 *out++ = (char) ch;
 880                         }
 881                 } else {
 882             if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
 883                 *out++ = B64(charsleft << (6-bitsleft));
 884                 charsleft = 0;
 885                 bitsleft = 0;
 886                 /* Characters not in the BASE64 set implicitly unshift the sequence
 887                    so no '-' is required, except if the character is itself a '-' */
 888                 if (B64CHAR(ch) || ch == '-') {
 889                     *out++ = '-';
 890                 }
 891                 inShift = 0;
 892                 *out++ = (char) ch;
 893             } else {
 894                 bitsleft += 16;
 895                 charsleft = (charsleft << 16) | ch;
 896                 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
 897
 898                 /* If the next character is special then we dont' need to terminate
 899                    the shift sequence. If the next character is not a BASE64 character
 900                    or '-' then the shift sequence will be terminated implicitly and we
 901                    don't have to insert a '-'. */
 902
 903                 if (bitsleft == 0) {
 904                     if (i + 1 < size) {
 905                         Py_UNICODE ch2 = s[i+1];
 906
 907                         if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
 908
 909                         } else if (B64CHAR(ch2) || ch2 == '-') {
 910                             *out++ = '-';
 911                             inShift = 0;
 912                         } else {
 913                             inShift = 0;
 914                         }
 915
 916                     }
 917                     else {
 918                         *out++ = '-';
 919                         inShift = 0;
 920                     }
 921                 }
 922             }
 923         }
 924         }
 925     if (bitsleft) {
 926         *out++= B64(charsleft << (6-bitsleft) );
 927         *out++ = '-';
 928     }
 929
 930     if (_PyString_Resize(&v, out - start)) {
 931         Py_DECREF(v);
 932         return NULL;
 933     }
 934     return v;
 935 }
 936
 937 #undef SPECIAL
 938 #undef B64
 939 #undef B64CHAR
 940 #undef UB64
 941 #undef ENCODE
 942 #undef DECODE
 943
 944 /* --- UTF-8 Codec -------------------------------------------------------- */
 945
 946 static
 947 char utf8_code_length[256] = {
 948     /* Map UTF-8 encoded prefix byte to sequence length.  zero means
 949        illegal prefix.  see RFC 2279 for details */
 950     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 951     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 952     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 953     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 954     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 955     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 956     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 957     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 958     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 959     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 960     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 961     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 962     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 963     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 964     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 965     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
 966 };
 967
 968 static
 969 int utf8_decoding_error(const char **source,
 970                         Py_UNICODE **dest,
 971                         const char *errors,
 972                         const char *details)
 973 {
 974     if ((errors == NULL) ||
 975         (strcmp(errors,"strict") == 0)) {
 976         PyErr_Format(PyExc_UnicodeError,
 977                      "UTF-8 decoding error: %.400s",
 978                      details);
 979         return -1;
 980     }
 981     else if (strcmp(errors,"ignore") == 0) {
 982         (*source)++;
 983         return 0;
 984     }
 985     else if (strcmp(errors,"replace") == 0) {
 986         (*source)++;
 987         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
 988         (*dest)++;
 989         return 0;
 990     }
 991     else {
 992         PyErr_Format(PyExc_ValueError,
 993                      "UTF-8 decoding error; unknown error handling code: %.400s",
 994                      errors);
 995         return -1;
 996     }
 997 }
 998
 999 PyObject *PyUnicode_DecodeUTF8(const char *s,
1000                                int size,
1001                                const char *errors)
1002 {
1003     int n;
1004     const char *e;
1005     PyUnicodeObject *unicode;
1006     Py_UNICODE *p;
1007     const char *errmsg = "";
1008
1009     /* Note: size will always be longer than the resulting Unicode
1010        character count */
1011     unicode = _PyUnicode_New(size);
1012     if (!unicode)
1013         return NULL;
1014     if (size == 0)
1015         return (PyObject *)unicode;
1016
1017     /* Unpack UTF-8 encoded data */
1018     p = unicode->str;
1019     e = s + size;
1020
1021     while (s < e) {
1022         Py_UCS4 ch = (unsigned char)*s;
1023
1024         if (ch < 0x80) {
1025             *p++ = (Py_UNICODE)ch;
1026             s++;
1027             continue;
1028         }
1029
1030         n = utf8_code_length[ch];
1031
1032         if (s + n > e) {
1033             errmsg = "unexpected end of data";
1034             goto utf8Error;
1035         }
1036
1037         switch (n) {
1038
1039         case 0:
1040             errmsg = "unexpected code byte";
1041             goto utf8Error;
1042
1043         case 1:
1044             errmsg = "internal error";
1045             goto utf8Error;
1046
1047         case 2:
1048             if ((s[1] & 0xc0) != 0x80) {
1049                 errmsg = "invalid data";
1050                 goto utf8Error;
1051             }
1052             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1053             if (ch < 0x80) {
1054                 errmsg = "illegal encoding";
1055                 goto utf8Error;
1056             }
1057             else
1058                 *p++ = (Py_UNICODE)ch;
1059             break;
1060
1061         case 3:
1062             if ((s[1] & 0xc0) != 0x80 ||
1063                 (s[2] & 0xc0) != 0x80) {
1064                 errmsg = "invalid data";
1065                 goto utf8Error;
1066             }
1067             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1068             if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1069                 errmsg = "illegal encoding";
1070                 goto utf8Error;
1071             }
1072             else
1073                                 *p++ = (Py_UNICODE)ch;
1074             break;
1075
1076         case 4:
1077             if ((s[1] & 0xc0) != 0x80 ||
1078                 (s[2] & 0xc0) != 0x80 ||
1079                 (s[3] & 0xc0) != 0x80) {
1080                 errmsg = "invalid data";
1081                 goto utf8Error;
1082             }
1083             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1084                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1085             /* validate and convert to UTF-16 */
1086             if ((ch < 0x10000)        /* minimum value allowed for 4
1087                                        byte encoding */
1088                 || (ch > 0x10ffff))   /* maximum value allowed for
1089                                        UTF-16 */
1090             {
1091                 errmsg = "illegal encoding";
1092                 goto utf8Error;
1093             }
1094 #ifdef Py_UNICODE_WIDE
1095             *p++ = (Py_UNICODE)ch;
1096 #else
1097             /*  compute and append the two surrogates: */
1098
1099             /*  translate from 10000..10FFFF to 0..FFFF */
1100             ch -= 0x10000;
1101
1102             /*  high surrogate = top 10 bits added to D800 */
1103             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1104
1105             /*  low surrogate = bottom 10 bits added to DC00 */
1106             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
1107 #endif
1108             break;
1109
1110         default:
1111             /* Other sizes are only needed for UCS-4 */
1112             errmsg = "unsupported Unicode code range";
1113             goto utf8Error;
1114         }
1115         s += n;
1116         continue;
1117
1118     utf8Error:
1119       if (utf8_decoding_error(&s, &p, errors, errmsg))
1120           goto onError;
1121     }
1122
1123     /* Adjust length */
1124     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1125         goto onError;
1126
1127     return (PyObject *)unicode;
1128
1129 onError:
1130     Py_DECREF(unicode);
1131     return NULL;
1132 }
1133
1134 /* Not used anymore, now that the encoder supports UTF-16
1135    surrogates. */
1136 #if 0
1137 static
1138 int utf8_encoding_error(const Py_UNICODE **source,
1139                         char **dest,
1140                         const char *errors,
1141                         const char *details)
1142 {
1143     if ((errors == NULL) ||
1144         (strcmp(errors,"strict") == 0)) {
1145         PyErr_Format(PyExc_UnicodeError,
1146                      "UTF-8 encoding error: %.400s",
1147                      details);
1148         return -1;
1149     }
1150     else if (strcmp(errors,"ignore") == 0) {
1151         return 0;
1152     }
1153     else if (strcmp(errors,"replace") == 0) {
1154         **dest = '?';
1155         (*dest)++;
1156         return 0;
1157     }
1158     else {
1159         PyErr_Format(PyExc_ValueError,
1160                      "UTF-8 encoding error; "
1161                      "unknown error handling code: %.400s",
1162                      errors);
1163         return -1;
1164     }
1165 }
1166 #endif
1167
1168 PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1169                                int size,
1170                                const char *errors)
1171 {
1172     PyObject *v;
1173     char *p;
1174     char *q;
1175     Py_UCS4 ch2;
1176     unsigned int cbAllocated = 3 * size;
1177     int i = 0;
1178
1179     v = PyString_FromStringAndSize(NULL, cbAllocated);
1180     if (v == NULL)
1181         return NULL;
1182     if (size == 0)
1183         return v;
1184
1185     p = q = PyString_AS_STRING(v);
1186     while (i < size) {
1187         Py_UCS4 ch = s[i++];
1188         if (ch < 0x80)
1189             *p++ = (char) ch;
1190
1191         else if (ch < 0x0800) {
1192             *p++ = 0xc0 | (ch >> 6);
1193             *p++ = 0x80 | (ch & 0x3f);
1194         }
1195
1196         else if (ch < 0x10000) {
1197             /* Check for high surrogate */
1198             if (0xD800 <= ch && ch <= 0xDBFF) {
1199                 if (i != size) {
1200                     ch2 = s[i];
1201                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1202
1203                         if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) {
1204                             /* Provide enough room for some more
1205                                surrogates */
1206                             cbAllocated += 4*10;
1207                             if (_PyString_Resize(&v, cbAllocated))
1208                                 goto onError;
1209                             p = PyString_AS_STRING(v) + (p - q);
1210                             q = PyString_AS_STRING(v);
1211                         }
1212
1213                         /* combine the two values */
1214                         ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1215
1216                         *p++ = (char)((ch >> 18) | 0xf0);
1217                         *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1218                         i++;
1219                     }
1220                 }
1221             }
1222             else
1223                 *p++ = (char)(0xe0 | (ch >> 12));
1224
1225             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1226             *p++ = (char)(0x80 | (ch & 0x3f));
1227
1228         } else {
1229             if ((Py_uintptr_t)(p - q) >= (cbAllocated - 4)) {
1230                 /* Provide enough room for some more
1231                    surrogates */
1232                 cbAllocated += 4*10;
1233                 if (_PyString_Resize(&v, cbAllocated))
1234                     goto onError;
1235                 p = PyString_AS_STRING(v) + (p - q);
1236                 q = PyString_AS_STRING(v);
1237             }
1238
1239             *p++ = 0xf0 | (ch>>18);
1240             *p++ = 0x80 | ((ch>>12) & 0x3f);
1241             *p++ = 0x80 | ((ch>>6) & 0x3f);
1242             *p++ = 0x80 | (ch & 0x3f);
1243         }
1244     }
1245     *p = '\0';
1246     if (_PyString_Resize(&v, p - q))
1247         goto onError;
1248     return v;
1249
1250  onError:
1251     Py_XDECREF(v);
1252     return NULL;
1253 }
1254
1255 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1256 {
1257     if (!PyUnicode_Check(unicode)) {
1258         PyErr_BadArgument();
1259         return NULL;
1260     }
1261     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1262                                 PyUnicode_GET_SIZE(unicode),
1263                                 NULL);
1264 }
1265
1266 /* --- UTF-16 Codec ------------------------------------------------------- */
1267
1268 static
1269 int utf16_decoding_error(Py_UNICODE **dest,
1270                          const char *errors,
1271                          const char *details)
1272 {
1273     if ((errors == NULL) ||
1274         (strcmp(errors,"strict") == 0)) {
1275         PyErr_Format(PyExc_UnicodeError,
1276                      "UTF-16 decoding error: %.400s",
1277                      details);
1278         return -1;
1279     }
1280     else if (strcmp(errors,"ignore") == 0) {
1281         return 0;
1282     }
1283     else if (strcmp(errors,"replace") == 0) {
1284         if (dest) {
1285             **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1286             (*dest)++;
1287         }
1288         return 0;
1289     }
1290     else {
1291         PyErr_Format(PyExc_ValueError,
1292                      "UTF-16 decoding error; "
1293                      "unknown error handling code: %.400s",
1294                      errors);
1295         return -1;
1296     }
1297 }
1298
1299 PyObject *
1300 PyUnicode_DecodeUTF16(const char *s,
1301                       int size,
1302                       const char *errors,
1303                       int *byteorder)
1304 {
1305     PyUnicodeObject *unicode;
1306     Py_UNICODE *p;
1307     const unsigned char *q, *e;
1308     int bo = 0;       /* assume native ordering by default */
1309     const char *errmsg = "";
1310     /* Offsets from q for retrieving byte pairs in the right order. */
1311 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1312     int ihi = 1, ilo = 0;
1313 #else
1314     int ihi = 0, ilo = 1;
1315 #endif
1316
1317     /* size should be an even number */
1318     if (size & 1) {
1319         if (utf16_decoding_error(NULL, errors, "truncated data"))
1320             return NULL;
1321         --size;  /* else ignore the oddball byte */
1322     }
1323
1324     /* Note: size will always be longer than the resulting Unicode
1325        character count */
1326     unicode = _PyUnicode_New(size);
1327     if (!unicode)
1328         return NULL;
1329     if (size == 0)
1330         return (PyObject *)unicode;
1331
1332     /* Unpack UTF-16 encoded data */
1333     p = unicode->str;
1334     q = (unsigned char *)s;
1335     e = q + size;
1336
1337     if (byteorder)
1338         bo = *byteorder;
1339
1340     /* Check for BOM marks (U+FEFF) in the input and adjust current
1341        byte order setting accordingly. In native mode, the leading BOM
1342        mark is skipped, in all other modes, it is copied to the output
1343        stream as-is (giving a ZWNBSP character). */
1344     if (bo == 0) {
1345         const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
1346 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1347         if (bom == 0xFEFF) {
1348             q += 2;
1349             bo = -1;
1350         }
1351         else if (bom == 0xFFFE) {
1352             q += 2;
1353             bo = 1;
1354         }
1355 #else
1356         if (bom == 0xFEFF) {
1357             q += 2;
1358             bo = 1;
1359         }
1360         else if (bom == 0xFFFE) {
1361             q += 2;
1362             bo = -1;
1363         }
1364 #endif
1365     }
1366
1367     if (bo == -1) {
1368         /* force LE */
1369         ihi = 1;
1370         ilo = 0;
1371     }
1372     else if (bo == 1) {
1373         /* force BE */
1374         ihi = 0;
1375         ilo = 1;
1376     }
1377
1378     while (q < e) {
1379         Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1380         q += 2;
1381
1382         if (ch < 0xD800 || ch > 0xDFFF) {
1383             *p++ = ch;
1384             continue;
1385         }
1386
1387         /* UTF-16 code pair: */
1388         if (q >= e) {
1389             errmsg = "unexpected end of data";
1390             goto utf16Error;
1391         }
1392         if (0xD800 <= ch && ch <= 0xDBFF) {
1393             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1394             q += 2;
1395             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1396 #ifndef Py_UNICODE_WIDE
1397                 *p++ = ch;
1398                 *p++ = ch2;
1399 #else
1400                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
1401 #endif
1402                 continue;
1403             }
1404             else {
1405                 errmsg = "illegal UTF-16 surrogate";
1406                 goto utf16Error;
1407             }
1408
1409         }
1410         errmsg = "illegal encoding";
1411         /* Fall through to report the error */
1412
1413     utf16Error:
1414         if (utf16_decoding_error(&p, errors, errmsg))
1415             goto onError;
1416     }
1417
1418     if (byteorder)
1419         *byteorder = bo;
1420
1421     /* Adjust length */
1422     if (_PyUnicode_Resize(&unicode, p - unicode->str))
1423         goto onError;
1424
1425     return (PyObject *)unicode;
1426
1427 onError:
1428     Py_DECREF(unicode);
1429     return NULL;
1430 }
1431
1432 PyObject *
1433 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1434                       int size,
1435                       const char *errors,
1436                       int byteorder)
1437 {
1438     PyObject *v;
1439     unsigned char *p;
1440     int i, pairs;
1441     /* Offsets from p for storing byte pairs in the right order. */
1442 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
1443     int ihi = 1, ilo = 0;
1444 #else
1445     int ihi = 0, ilo = 1;
1446 #endif
1447
1448 #define STORECHAR(CH)                   \
1449     do {                                \
1450         p[ihi] = ((CH) >> 8) & 0xff;    \
1451         p[ilo] = (CH) & 0xff;           \
1452         p += 2;                         \
1453     } while(0)
1454
1455     for (i = pairs = 0; i < size; i++)
1456         if (s[i] >= 0x10000)
1457             pairs++;
1458     v = PyString_FromStringAndSize(NULL,
1459                   2 * (size + pairs + (byteorder == 0)));
1460     if (v == NULL)
1461         return NULL;
1462
1463     p = (unsigned char *)PyString_AS_STRING(v);
1464     if (byteorder == 0)
1465         STORECHAR(0xFEFF);
1466     if (size == 0)
1467         return v;
1468
1469     if (byteorder == -1) {
1470         /* force LE */
1471         ihi = 1;
1472         ilo = 0;
1473     }
1474     else if (byteorder == 1) {
1475         /* force BE */
1476         ihi = 0;
1477         ilo = 1;
1478     }
1479
1480     while (size-- > 0) {
1481         Py_UNICODE ch = *s++;
1482         Py_UNICODE ch2 = 0;
1483         if (ch >= 0x10000) {
1484             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1485             ch  = 0xD800 | ((ch-0x10000) >> 10);
1486         }
1487         STORECHAR(ch);
1488         if (ch2)
1489             STORECHAR(ch2);
1490     }
1491     return v;
1492 #undef STORECHAR
1493 }
1494
1495 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1496 {
1497     if (!PyUnicode_Check(unicode)) {
1498         PyErr_BadArgument();
1499         return NULL;
1500     }
1501     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1502                                  PyUnicode_GET_SIZE(unicode),
1503                                  NULL,
1504                                  0);
1505 }
1506
1507 /* --- Unicode Escape Codec ----------------------------------------------- */
1508
1509 static
1510 int unicodeescape_decoding_error(Py_UNICODE **x,
1511                                  const char *errors,
1512                                  const char *details)
1513 {
1514     if ((errors == NULL) ||
1515         (strcmp(errors,"strict") == 0)) {
1516         PyErr_Format(PyExc_UnicodeError,
1517                      "Unicode-Escape decoding error: %.400s",
1518                      details);
1519         return -1;
1520     }
1521     else if (strcmp(errors,"ignore") == 0) {
1522         return 0;
1523     }
1524     else if (strcmp(errors,"replace") == 0) {
1525         **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1526         (*x)++;
1527         return 0;
1528     }
1529     else {
1530         PyErr_Format(PyExc_ValueError,
1531                      "Unicode-Escape decoding error; "
1532                      "unknown error handling code: %.400s",
1533                      errors);
1534         return -1;
1535     }
1536 }
1537
1538 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
1539
1540 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1541                                         int size,
1542                                         const char *errors)
1543 {
1544     PyUnicodeObject *v;
1545     Py_UNICODE *p, *buf;
1546     const char *end;
1547     char* message;
1548     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1549
1550     /* Escaped strings will always be longer than the resulting
1551        Unicode string, so we start with size here and then reduce the
1552        length after conversion to the true value. */
1553     v = _PyUnicode_New(size);
1554     if (v == NULL)
1555         goto onError;
1556     if (size == 0)
1557         return (PyObject *)v;
1558
1559     p = buf = PyUnicode_AS_UNICODE(v);
1560     end = s + size;
1561
1562     while (s < end) {
1563         unsigned char c;
1564         Py_UNICODE x;
1565         int i, digits;
1566
1567         /* Non-escape characters are interpreted as Unicode ordinals */
1568         if (*s != '\\') {
1569             *p++ = (unsigned char) *s++;
1570             continue;
1571         }
1572
1573         /* \ - Escapes */
1574         s++;
1575         switch (*s++) {
1576
1577         /* \x escapes */
1578         case '\n': break;
1579         case '\\': *p++ = '\\'; break;
1580         case '\'': *p++ = '\''; break;
1581         case '\"': *p++ = '\"'; break;
1582         case 'b': *p++ = '\b'; break;
1583         case 'f': *p++ = '\014'; break; /* FF */
1584         case 't': *p++ = '\t'; break;
1585         case 'n': *p++ = '\n'; break;
1586         case 'r': *p++ = '\r'; break;
1587         case 'v': *p++ = '\013'; break; /* VT */
1588         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1589
1590         /* \OOO (octal) escapes */
1591         case '0': case '1': case '2': case '3':
1592         case '4': case '5': case '6': case '7':
1593             x = s[-1] - '0';
1594             if ('0' <= *s && *s <= '7') {
1595                 x = (x<<3) + *s++ - '0';
1596                 if ('0' <= *s && *s <= '7')
1597                     x = (x<<3) + *s++ - '0';
1598             }
1599             *p++ = x;
1600             break;
1601
1602         /* hex escapes */
1603         /* \xXX */
1604         case 'x':
1605             digits = 2;
1606             message = "truncated \\xXX escape";
1607             goto hexescape;
1608
1609         /* \uXXXX */
1610         case 'u':
1611             digits = 4;
1612             message = "truncated \\uXXXX escape";
1613             goto hexescape;
1614
1615         /* \UXXXXXXXX */
1616         case 'U':
1617             digits = 8;
1618             message = "truncated \\UXXXXXXXX escape";
1619         hexescape:
1620             chr = 0;
1621             for (i = 0; i < digits; i++) {
1622                 c = (unsigned char) s[i];
1623                 if (!isxdigit(c)) {
1624                     if (unicodeescape_decoding_error(&p, errors, message))
1625                         goto onError;
1626                     chr = 0xffffffff;
1627                     i++;
1628                     break;
1629                 }
1630                 chr = (chr<<4) & ~0xF;
1631                 if (c >= '0' && c <= '9')
1632                     chr += c - '0';
1633                 else if (c >= 'a' && c <= 'f')
1634                     chr += 10 + c - 'a';
1635                 else
1636                     chr += 10 + c - 'A';
1637             }
1638             s += i;
1639             if (chr == 0xffffffff)
1640                     /* _decoding_error will have already written into the
1641                        target buffer. */
1642                     break;
1643         store:
1644             /* when we get here, chr is a 32-bit unicode character */
1645             if (chr <= 0xffff)
1646                 /* UCS-2 character */
1647                 *p++ = (Py_UNICODE) chr;
1648             else if (chr <= 0x10ffff) {
1649                 /* UCS-4 character. Either store directly, or as
1650                    surrogate pair. */
1651 #ifdef Py_UNICODE_WIDE
1652                 *p++ = chr;
1653 #else
1654                 chr -= 0x10000L;
1655                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1656                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
1657 #endif
1658             } else {
1659                 if (unicodeescape_decoding_error(
1660                     &p, errors,
1661                     "illegal Unicode character")
1662                     )
1663                     goto onError;
1664             }
1665             break;
1666
1667         /* \N{name} */
1668         case 'N':
1669             message = "malformed \\N character escape";
1670             if (ucnhash_CAPI == NULL) {
1671                 /* load the unicode data module */
1672                 PyObject *m, *v;
1673                 m = PyImport_ImportModule("unicodedata");
1674                 if (m == NULL)
1675                     goto ucnhashError;
1676                 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1677                 Py_DECREF(m);
1678                 if (v == NULL)
1679                     goto ucnhashError;
1680                 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1681                 Py_DECREF(v);
1682                 if (ucnhash_CAPI == NULL)
1683                     goto ucnhashError;
1684             }
1685             if (*s == '{') {
1686                 const char *start = s+1;
1687                 /* look for the closing brace */
1688                 while (*s != '}' && s < end)
1689                     s++;
1690                 if (s > start && s < end && *s == '}') {
1691                     /* found a name.  look it up in the unicode database */
1692                     message = "unknown Unicode character name";
1693                     s++;
1694                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1695                         goto store;
1696                 }
1697             }
1698             if (unicodeescape_decoding_error(&p, errors, message))
1699                 goto onError;
1700             break;
1701
1702         default:
1703             if (s > end) {
1704                 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1705                     goto onError;
1706             }
1707             else {
1708                 *p++ = '\\';
1709                 *p++ = (unsigned char)s[-1];
1710             }
1711             break;
1712         }
1713     }
1714     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1715                 goto onError;
1716     return (PyObject *)v;
1717
1718 ucnhashError:
1719     PyErr_SetString(
1720         PyExc_UnicodeError,
1721         "\\N escapes not supported (can't load unicodedata module)"
1722         );
1723     return NULL;
1724
1725 onError:
1726     Py_XDECREF(v);
1727     return NULL;
1728 }
1729
1730 /* Return a Unicode-Escape string version of the Unicode object.
1731
1732    If quotes is true, the string is enclosed in u"" or u'' quotes as
1733    appropriate.
1734
1735 */
1736
1737 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1738                                   int size,
1739                                   Py_UNICODE ch);
1740
1741 static
1742 PyObject *unicodeescape_string(const Py_UNICODE *s,
1743                                int size,
1744                                int quotes)
1745 {
1746     PyObject *repr;
1747     char *p;
1748
1749     static const char *hexdigit = "0123456789abcdef";
1750
1751     repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1752     if (repr == NULL)
1753         return NULL;
1754
1755     p = PyString_AS_STRING(repr);
1756
1757     if (quotes) {
1758         *p++ = 'u';
1759         *p++ = (findchar(s, size, '\'') &&
1760                 !findchar(s, size, '"')) ? '"' : '\'';
1761     }
1762     while (size-- > 0) {
1763         Py_UNICODE ch = *s++;
1764
1765         /* Escape quotes */
1766         if (quotes &&
1767             (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
1768             *p++ = '\\';
1769             *p++ = (char) ch;
1770             continue;
1771         }
1772
1773 #ifdef Py_UNICODE_WIDE
1774         /* Map 21-bit characters to '\U00xxxxxx' */
1775         else if (ch >= 0x10000) {
1776             int offset = p - PyString_AS_STRING(repr);
1777
1778             /* Resize the string if necessary */
1779             if (offset + 12 > PyString_GET_SIZE(repr)) {
1780                 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1781                     goto onError;
1782                 p = PyString_AS_STRING(repr) + offset;
1783             }
1784
1785             *p++ = '\\';
1786             *p++ = 'U';
1787             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1788             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1789             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1790             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1791             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1792             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1793             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
1794             *p++ = hexdigit[ch & 0x0000000F];
1795             continue;
1796         }
1797 #endif
1798         /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1799         else if (ch >= 0xD800 && ch < 0xDC00) {
1800             Py_UNICODE ch2;
1801             Py_UCS4 ucs;
1802
1803             ch2 = *s++;
1804             size--;
1805             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1806                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1807                 *p++ = '\\';
1808                 *p++ = 'U';
1809                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1810                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1811                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1812                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1813                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1814                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1815                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1816                 *p++ = hexdigit[ucs & 0x0000000F];
1817                 continue;
1818             }
1819             /* Fall through: isolated surrogates are copied as-is */
1820             s--;
1821             size++;
1822         }
1823
1824         /* Map 16-bit characters to '\uxxxx' */
1825         if (ch >= 256) {
1826             *p++ = '\\';
1827             *p++ = 'u';
1828             *p++ = hexdigit[(ch >> 12) & 0x000F];
1829             *p++ = hexdigit[(ch >> 8) & 0x000F];
1830             *p++ = hexdigit[(ch >> 4) & 0x000F];
1831             *p++ = hexdigit[ch & 0x000F];
1832         }
1833
1834         /* Map special whitespace to '\t', \n', '\r' */
1835         else if (ch == '\t') {
1836             *p++ = '\\';
1837             *p++ = 't';
1838         }
1839         else if (ch == '\n') {
1840             *p++ = '\\';
1841             *p++ = 'n';
1842         }
1843         else if (ch == '\r') {
1844             *p++ = '\\';
1845             *p++ = 'r';
1846         }
1847
1848         /* Map non-printable US ASCII to '\xhh' */
1849         else if (ch < ' ' || ch >= 0x7F) {
1850             *p++ = '\\';
1851             *p++ = 'x';
1852             *p++ = hexdigit[(ch >> 4) & 0x000F];
1853             *p++ = hexdigit[ch & 0x000F];
1854         }
1855
1856         /* Copy everything else as-is */
1857         else
1858             *p++ = (char) ch;
1859     }
1860     if (quotes)
1861         *p++ = PyString_AS_STRING(repr)[1];
1862
1863     *p = '\0';
1864     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
1865         goto onError;
1866
1867     return repr;
1868
1869  onError:
1870     Py_DECREF(repr);
1871     return NULL;
1872 }
1873
1874 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1875                                         int size)
1876 {
1877     return unicodeescape_string(s, size, 0);
1878 }
1879
1880 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1881 {
1882     if (!PyUnicode_Check(unicode)) {
1883         PyErr_BadArgument();
1884         return NULL;
1885     }
1886     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1887                                          PyUnicode_GET_SIZE(unicode));
1888 }
1889
1890 /* --- Raw Unicode Escape Codec ------------------------------------------- */
1891
1892 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1893                                            int size,
1894                                            const char *errors)
1895 {
1896     PyUnicodeObject *v;
1897     Py_UNICODE *p, *buf;
1898     const char *end;
1899     const char *bs;
1900
1901     /* Escaped strings will always be longer than the resulting
1902        Unicode string, so we start with size here and then reduce the
1903        length after conversion to the true value. */
1904     v = _PyUnicode_New(size);
1905     if (v == NULL)
1906         goto onError;
1907     if (size == 0)
1908         return (PyObject *)v;
1909     p = buf = PyUnicode_AS_UNICODE(v);
1910     end = s + size;
1911     while (s < end) {
1912         unsigned char c;
1913         Py_UCS4 x;
1914         int i;
1915
1916         /* Non-escape characters are interpreted as Unicode ordinals */
1917         if (*s != '\\') {
1918             *p++ = (unsigned char)*s++;
1919             continue;
1920         }
1921
1922         /* \u-escapes are only interpreted iff the number of leading
1923            backslashes if odd */
1924         bs = s;
1925         for (;s < end;) {
1926             if (*s != '\\')
1927                 break;
1928             *p++ = (unsigned char)*s++;
1929         }
1930         if (((s - bs) & 1) == 0 ||
1931             s >= end ||
1932             *s != 'u') {
1933             continue;
1934         }
1935         p--;
1936         s++;
1937
1938         /* \uXXXX with 4 hex digits */
1939         for (x = 0, i = 0; i < 4; i++) {
1940             c = (unsigned char)s[i];
1941             if (!isxdigit(c)) {
1942                 if (unicodeescape_decoding_error(&p, errors,
1943                                                  "truncated \\uXXXX"))
1944                     goto onError;
1945                 x = 0xffffffff;
1946                 i++;
1947                 break;
1948             }
1949             x = (x<<4) & ~0xF;
1950             if (c >= '0' && c <= '9')
1951                 x += c - '0';
1952             else if (c >= 'a' && c <= 'f')
1953                 x += 10 + c - 'a';
1954             else
1955                 x += 10 + c - 'A';
1956         }
1957         s += i;
1958         if (x != 0xffffffff)
1959                 *p++ = x;
1960     }
1961     if (_PyUnicode_Resize(&v, (int)(p - buf)))
1962         goto onError;
1963     return (PyObject *)v;
1964
1965  onError:
1966     Py_XDECREF(v);
1967     return NULL;
1968 }
1969
1970 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1971                                            int size)
1972 {
1973     PyObject *repr;
1974     char *p;
1975     char *q;
1976
1977     static const char *hexdigit = "0123456789abcdef";
1978
1979     repr = PyString_FromStringAndSize(NULL, 6 * size);
1980     if (repr == NULL)
1981         return NULL;
1982     if (size == 0)
1983         return repr;
1984
1985     p = q = PyString_AS_STRING(repr);
1986     while (size-- > 0) {
1987         Py_UNICODE ch = *s++;
1988         /* Map 16-bit characters to '\uxxxx' */
1989         if (ch >= 256) {
1990             *p++ = '\\';
1991             *p++ = 'u';
1992             *p++ = hexdigit[(ch >> 12) & 0xf];
1993             *p++ = hexdigit[(ch >> 8) & 0xf];
1994             *p++ = hexdigit[(ch >> 4) & 0xf];
1995             *p++ = hexdigit[ch & 15];
1996         }
1997         /* Copy everything else as-is */
1998         else
1999             *p++ = (char) ch;
2000     }
2001     *p = '\0';
2002     if (_PyString_Resize(&repr, p - q))
2003         goto onError;
2004
2005     return repr;
2006
2007  onError:
2008     Py_DECREF(repr);
2009     return NULL;
2010 }
2011
2012 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2013 {
2014     if (!PyUnicode_Check(unicode)) {
2015         PyErr_BadArgument();
2016         return NULL;
2017     }
2018     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2019                                             PyUnicode_GET_SIZE(unicode));
2020 }
2021
2022 /* --- Latin-1 Codec ------------------------------------------------------ */
2023
2024 PyObject *PyUnicode_DecodeLatin1(const char *s,
2025                                  int size,
2026                                  const char *errors)
2027 {
2028     PyUnicodeObject *v;
2029     Py_UNICODE *p;
2030
2031     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
2032     if (size == 1 && *(unsigned char*)s < 256) {
2033         Py_UNICODE r = *(unsigned char*)s;
2034         return PyUnicode_FromUnicode(&r, 1);
2035     }
2036
2037     v = _PyUnicode_New(size);
2038     if (v == NULL)
2039         goto onError;
2040     if (size == 0)
2041         return (PyObject *)v;
2042     p = PyUnicode_AS_UNICODE(v);
2043     while (size-- > 0)
2044         *p++ = (unsigned char)*s++;
2045     return (PyObject *)v;
2046
2047  onError:
2048     Py_XDECREF(v);
2049     return NULL;
2050 }
2051
2052 static
2053 int latin1_encoding_error(const Py_UNICODE **source,
2054                           char **dest,
2055                           const char *errors,
2056                           const char *details)
2057 {
2058     if ((errors == NULL) ||
2059         (strcmp(errors,"strict") == 0)) {
2060         PyErr_Format(PyExc_UnicodeError,
2061                      "Latin-1 encoding error: %.400s",
2062                      details);
2063         return -1;
2064     }
2065     else if (strcmp(errors,"ignore") == 0) {
2066         return 0;
2067     }
2068     else if (strcmp(errors,"replace") == 0) {
2069         **dest = '?';
2070         (*dest)++;
2071         return 0;
2072     }
2073     else {
2074         PyErr_Format(PyExc_ValueError,
2075                      "Latin-1 encoding error; "
2076                      "unknown error handling code: %.400s",
2077                      errors);
2078         return -1;
2079     }
2080 }
2081
2082 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2083                                  int size,
2084                                  const char *errors)
2085 {
2086     PyObject *repr;
2087     char *s, *start;
2088
2089     repr = PyString_FromStringAndSize(NULL, size);
2090     if (repr == NULL)
2091         return NULL;
2092     if (size == 0)
2093         return repr;
2094
2095     s = PyString_AS_STRING(repr);
2096     start = s;
2097     while (size-- > 0) {
2098         Py_UNICODE ch = *p++;
2099         if (ch >= 256) {
2100             if (latin1_encoding_error(&p, &s, errors,
2101                                       "ordinal not in range(256)"))
2102                 goto onError;
2103         }
2104         else
2105             *s++ = (char)ch;
2106     }
2107     /* Resize if error handling skipped some characters */
2108     if (s - start < PyString_GET_SIZE(repr))
2109         if (_PyString_Resize(&repr, s - start))
2110             goto onError;
2111     return repr;
2112
2113  onError:
2114     Py_DECREF(repr);
2115     return NULL;
2116 }
2117
2118 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2119 {
2120     if (!PyUnicode_Check(unicode)) {
2121         PyErr_BadArgument();
2122         return NULL;
2123     }
2124     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2125                                   PyUnicode_GET_SIZE(unicode),
2126                                   NULL);
2127 }
2128
2129 /* --- 7-bit ASCII Codec -------------------------------------------------- */
2130
2131 static
2132 int ascii_decoding_error(const char **source,
2133                          Py_UNICODE **dest,
2134                          const char *errors,
2135                          const char *details)
2136 {
2137     if ((errors == NULL) ||
2138         (strcmp(errors,"strict") == 0)) {
2139         PyErr_Format(PyExc_UnicodeError,
2140                      "ASCII decoding error: %.400s",
2141                      details);
2142         return -1;
2143     }
2144     else if (strcmp(errors,"ignore") == 0) {
2145         return 0;
2146     }
2147     else if (strcmp(errors,"replace") == 0) {
2148         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2149         (*dest)++;
2150         return 0;
2151     }
2152     else {
2153         PyErr_Format(PyExc_ValueError,
2154                      "ASCII decoding error; "
2155                      "unknown error handling code: %.400s",
2156                      errors);
2157         return -1;
2158     }
2159 }
2160
2161 PyObject *PyUnicode_DecodeASCII(const char *s,
2162                                 int size,
2163                                 const char *errors)
2164 {
2165     PyUnicodeObject *v;
2166     Py_UNICODE *p;
2167
2168     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
2169     if (size == 1 && *(unsigned char*)s < 128) {
2170         Py_UNICODE r = *(unsigned char*)s;
2171         return PyUnicode_FromUnicode(&r, 1);
2172     }
2173
2174     v = _PyUnicode_New(size);
2175     if (v == NULL)
2176         goto onError;
2177     if (size == 0)
2178         return (PyObject *)v;
2179     p = PyUnicode_AS_UNICODE(v);
2180     while (size-- > 0) {
2181         register unsigned char c;
2182
2183         c = (unsigned char)*s++;
2184         if (c < 128)
2185             *p++ = c;
2186         else if (ascii_decoding_error(&s, &p, errors,
2187                                       "ordinal not in range(128)"))
2188                 goto onError;
2189     }
2190     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
2191         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2192             goto onError;
2193     return (PyObject *)v;
2194
2195  onError:
2196     Py_XDECREF(v);
2197     return NULL;
2198 }
2199
2200 static
2201 int ascii_encoding_error(const Py_UNICODE **source,
2202                          char **dest,
2203                          const char *errors,
2204                          const char *details)
2205 {
2206     if ((errors == NULL) ||
2207         (strcmp(errors,"strict") == 0)) {
2208         PyErr_Format(PyExc_UnicodeError,
2209                      "ASCII encoding error: %.400s",
2210                      details);
2211         return -1;
2212     }
2213     else if (strcmp(errors,"ignore") == 0) {
2214         return 0;
2215     }
2216     else if (strcmp(errors,"replace") == 0) {
2217         **dest = '?';
2218         (*dest)++;
2219         return 0;
2220     }
2221     else {
2222         PyErr_Format(PyExc_ValueError,
2223                      "ASCII encoding error; "
2224                      "unknown error handling code: %.400s",
2225                      errors);
2226         return -1;
2227     }
2228 }
2229
2230 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2231                                 int size,
2232                                 const char *errors)
2233 {
2234     PyObject *repr;
2235     char *s, *start;
2236
2237     repr = PyString_FromStringAndSize(NULL, size);
2238     if (repr == NULL)
2239         return NULL;
2240     if (size == 0)
2241         return repr;
2242
2243     s = PyString_AS_STRING(repr);
2244     start = s;
2245     while (size-- > 0) {
2246         Py_UNICODE ch = *p++;
2247         if (ch >= 128) {
2248             if (ascii_encoding_error(&p, &s, errors,
2249                                       "ordinal not in range(128)"))
2250                 goto onError;
2251         }
2252         else
2253             *s++ = (char)ch;
2254     }
2255     /* Resize if error handling skipped some characters */
2256     if (s - start < PyString_GET_SIZE(repr))
2257         if (_PyString_Resize(&repr, s - start))
2258             goto onError;
2259     return repr;
2260
2261  onError:
2262     Py_DECREF(repr);
2263     return NULL;
2264 }
2265
2266 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2267 {
2268     if (!PyUnicode_Check(unicode)) {
2269         PyErr_BadArgument();
2270         return NULL;
2271     }
2272     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2273                                  PyUnicode_GET_SIZE(unicode),
2274                                  NULL);
2275 }
2276
2277 #if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
2278
2279 /* --- MBCS codecs for Windows -------------------------------------------- */
2280
2281 PyObject *PyUnicode_DecodeMBCS(const char *s,
2282                                 int size,
2283                                 const char *errors)
2284 {
2285     PyUnicodeObject *v;
2286     Py_UNICODE *p;
2287
2288     /* First get the size of the result */
2289     DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
2290     if (size > 0 && usize==0)
2291         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2292
2293     v = _PyUnicode_New(usize);
2294     if (v == NULL)
2295         return NULL;
2296     if (usize == 0)
2297         return (PyObject *)v;
2298     p = PyUnicode_AS_UNICODE(v);
2299     if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2300         Py_DECREF(v);
2301         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2302     }
2303
2304     return (PyObject *)v;
2305 }
2306
2307 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2308                                 int size,
2309                                 const char *errors)
2310 {
2311     PyObject *repr;
2312     char *s;
2313     DWORD mbcssize;
2314
2315     /* If there are no characters, bail now! */
2316     if (size==0)
2317             return PyString_FromString("");
2318
2319     /* First get the size of the result */
2320     mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
2321     if (mbcssize==0)
2322         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2323
2324     repr = PyString_FromStringAndSize(NULL, mbcssize);
2325     if (repr == NULL)
2326         return NULL;
2327     if (mbcssize == 0)
2328         return repr;
2329
2330     /* Do the conversion */
2331     s = PyString_AS_STRING(repr);
2332     if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2333         Py_DECREF(repr);
2334         return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2335     }
2336     return repr;
2337 }
2338
2339 #endif /* MS_WIN32 */
2340
2341 /* --- Character Mapping Codec -------------------------------------------- */
2342
2343 static
2344 int charmap_decoding_error(const char **source,
2345                          Py_UNICODE **dest,
2346                          const char *errors,
2347                          const char *details)
2348 {
2349     if ((errors == NULL) ||
2350         (strcmp(errors,"strict") == 0)) {
2351         PyErr_Format(PyExc_UnicodeError,
2352                      "charmap decoding error: %.400s",
2353                      details);
2354         return -1;
2355     }
2356     else if (strcmp(errors,"ignore") == 0) {
2357         return 0;
2358     }
2359     else if (strcmp(errors,"replace") == 0) {
2360         **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2361         (*dest)++;
2362         return 0;
2363     }
2364     else {
2365         PyErr_Format(PyExc_ValueError,
2366                      "charmap decoding error; "
2367                      "unknown error handling code: %.400s",
2368                      errors);
2369         return -1;
2370     }
2371 }
2372
2373 PyObject *PyUnicode_DecodeCharmap(const char *s,
2374                                   int size,
2375                                   PyObject *mapping,
2376                                   const char *errors)
2377 {
2378     PyUnicodeObject *v;
2379     Py_UNICODE *p;
2380     int extrachars = 0;
2381
2382     /* Default to Latin-1 */
2383     if (mapping == NULL)
2384         return PyUnicode_DecodeLatin1(s, size, errors);
2385
2386     v = _PyUnicode_New(size);
2387     if (v == NULL)
2388         goto onError;
2389     if (size == 0)
2390         return (PyObject *)v;
2391     p = PyUnicode_AS_UNICODE(v);
2392     while (size-- > 0) {
2393         unsigned char ch = *s++;
2394         PyObject *w, *x;
2395
2396         /* Get mapping (char ordinal -> integer, Unicode char or None) */
2397         w = PyInt_FromLong((long)ch);
2398         if (w == NULL)
2399             goto onError;
2400         x = PyObject_GetItem(mapping, w);
2401         Py_DECREF(w);
2402         if (x == NULL) {
2403             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2404                 /* No mapping found means: mapping is undefined. */
2405                 PyErr_Clear();
2406                 x = Py_None;
2407                 Py_INCREF(x);
2408             } else
2409                 goto onError;
2410         }
2411
2412         /* Apply mapping */
2413         if (PyInt_Check(x)) {
2414             long value = PyInt_AS_LONG(x);
2415             if (value < 0 || value > 65535) {
2416                 PyErr_SetString(PyExc_TypeError,
2417                                 "character mapping must be in range(65536)");
2418                 Py_DECREF(x);
2419                 goto onError;
2420             }
2421             *p++ = (Py_UNICODE)value;
2422         }
2423         else if (x == Py_None) {
2424             /* undefined mapping */
2425             if (charmap_decoding_error(&s, &p, errors,
2426                                        "character maps to <undefined>")) {
2427                 Py_DECREF(x);
2428                 goto onError;
2429             }
2430         }
2431         else if (PyUnicode_Check(x)) {
2432             int targetsize = PyUnicode_GET_SIZE(x);
2433
2434             if (targetsize == 1)
2435                 /* 1-1 mapping */
2436                 *p++ = *PyUnicode_AS_UNICODE(x);
2437
2438             else if (targetsize > 1) {
2439                 /* 1-n mapping */
2440                 if (targetsize > extrachars) {
2441                     /* resize first */
2442                     int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2443                     int needed = (targetsize - extrachars) + \
2444                                  (targetsize << 2);
2445                     extrachars += needed;
2446                     if (_PyUnicode_Resize(&v,
2447                                          PyUnicode_GET_SIZE(v) + needed)) {
2448                         Py_DECREF(x);
2449                         goto onError;
2450                     }
2451                     p = PyUnicode_AS_UNICODE(v) + oldpos;
2452                 }
2453                 Py_UNICODE_COPY(p,
2454                                 PyUnicode_AS_UNICODE(x),
2455                                 targetsize);
2456                 p += targetsize;
2457                 extrachars -= targetsize;
2458             }
2459             /* 1-0 mapping: skip the character */
2460         }
2461         else {
2462             /* wrong return value */
2463             PyErr_SetString(PyExc_TypeError,
2464                   "character mapping must return integer, None or unicode");
2465             Py_DECREF(x);
2466             goto onError;
2467         }
2468         Py_DECREF(x);
2469     }
2470     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2471         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2472             goto onError;
2473     return (PyObject *)v;
2474
2475  onError:
2476     Py_XDECREF(v);
2477     return NULL;
2478 }
2479
2480 static
2481 int charmap_encoding_error(const Py_UNICODE **source,
2482                            char **dest,
2483                            const char *errors,
2484                            const char *details)
2485 {
2486     if ((errors == NULL) ||
2487         (strcmp(errors,"strict") == 0)) {
2488         PyErr_Format(PyExc_UnicodeError,
2489                      "charmap encoding error: %.400s",
2490                      details);
2491         return -1;
2492     }
2493     else if (strcmp(errors,"ignore") == 0) {
2494         return 0;
2495     }
2496     else if (strcmp(errors,"replace") == 0) {
2497         **dest = '?';
2498         (*dest)++;
2499         return 0;
2500     }
2501     else {
2502         PyErr_Format(PyExc_ValueError,
2503                      "charmap encoding error; "
2504                      "unknown error handling code: %.400s",
2505                      errors);
2506         return -1;
2507     }
2508 }
2509
2510 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2511                                   int size,
2512                                   PyObject *mapping,
2513                                   const char *errors)
2514 {
2515     PyObject *v;
2516     char *s;
2517     int extrachars = 0;
2518
2519     /* Default to Latin-1 */
2520     if (mapping == NULL)
2521         return PyUnicode_EncodeLatin1(p, size, errors);
2522
2523     v = PyString_FromStringAndSize(NULL, size);
2524     if (v == NULL)
2525         return NULL;
2526     if (size == 0)
2527         return v;
2528     s = PyString_AS_STRING(v);
2529     while (size-- > 0) {
2530         Py_UNICODE ch = *p++;
2531         PyObject *w, *x;
2532
2533         /* Get mapping (Unicode ordinal -> string char, integer or None) */
2534         w = PyInt_FromLong((long)ch);
2535         if (w == NULL)
2536             goto onError;
2537         x = PyObject_GetItem(mapping, w);
2538         Py_DECREF(w);
2539         if (x == NULL) {
2540             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2541                 /* No mapping found means: mapping is undefined. */
2542                 PyErr_Clear();
2543                 x = Py_None;
2544                 Py_INCREF(x);
2545             } else
2546                 goto onError;
2547         }
2548
2549         /* Apply mapping */
2550         if (PyInt_Check(x)) {
2551             long value = PyInt_AS_LONG(x);
2552             if (value < 0 || value > 255) {
2553                 PyErr_SetString(PyExc_TypeError,
2554                                 "character mapping must be in range(256)");
2555                 Py_DECREF(x);
2556                 goto onError;
2557             }
2558             *s++ = (char)value;
2559         }
2560         else if (x == Py_None) {
2561             /* undefined mapping */
2562             if (charmap_encoding_error(&p, &s, errors,
2563                                        "character maps to <undefined>")) {
2564                 Py_DECREF(x);
2565                 goto onError;
2566             }
2567         }
2568         else if (PyString_Check(x)) {
2569             int targetsize = PyString_GET_SIZE(x);
2570
2571             if (targetsize == 1)
2572                 /* 1-1 mapping */
2573                 *s++ = *PyString_AS_STRING(x);
2574
2575             else if (targetsize > 1) {
2576                 /* 1-n mapping */
2577                 if (targetsize > extrachars) {
2578                     /* resize first */
2579                     int oldpos = (int)(s - PyString_AS_STRING(v));
2580                     int needed = (targetsize - extrachars) + \
2581                                  (targetsize << 2);
2582                     extrachars += needed;
2583                     if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
2584                         Py_DECREF(x);
2585                         goto onError;
2586                     }
2587                     s = PyString_AS_STRING(v) + oldpos;
2588                 }
2589                 memcpy(s, PyString_AS_STRING(x), targetsize);
2590                 s += targetsize;
2591                 extrachars -= targetsize;
2592             }
2593             /* 1-0 mapping: skip the character */
2594         }
2595         else {
2596             /* wrong return value */
2597             PyErr_SetString(PyExc_TypeError,
2598                   "character mapping must return integer, None or unicode");
2599             Py_DECREF(x);
2600             goto onError;
2601         }
2602         Py_DECREF(x);
2603     }
2604     if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2605         if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2606             goto onError;
2607     return v;
2608
2609  onError:
2610     Py_DECREF(v);
2611     return NULL;
2612 }
2613
2614 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2615                                     PyObject *mapping)
2616 {
2617     if (!PyUnicode_Check(unicode) || mapping == NULL) {
2618         PyErr_BadArgument();
2619         return NULL;
2620     }
2621     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2622                                    PyUnicode_GET_SIZE(unicode),
2623                                    mapping,
2624                                    NULL);
2625 }
2626
2627 static
2628 int translate_error(const Py_UNICODE **source,
2629                     Py_UNICODE **dest,
2630                     const char *errors,
2631                     const char *details)
2632 {
2633     if ((errors == NULL) ||
2634         (strcmp(errors,"strict") == 0)) {
2635         PyErr_Format(PyExc_UnicodeError,
2636                      "translate error: %.400s",
2637                      details);
2638         return -1;
2639     }
2640     else if (strcmp(errors,"ignore") == 0) {
2641         return 0;
2642     }
2643     else if (strcmp(errors,"replace") == 0) {
2644         **dest = '?';
2645         (*dest)++;
2646         return 0;
2647     }
2648     else {
2649         PyErr_Format(PyExc_ValueError,
2650                      "translate error; "
2651                      "unknown error handling code: %.400s",
2652                      errors);
2653         return -1;
2654     }
2655 }
2656
2657 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2658                                      int size,
2659                                      PyObject *mapping,
2660                                      const char *errors)
2661 {
2662     PyUnicodeObject *v;
2663     Py_UNICODE *p;
2664
2665     if (mapping == NULL) {
2666         PyErr_BadArgument();
2667         return NULL;
2668     }
2669
2670     /* Output will never be longer than input */
2671     v = _PyUnicode_New(size);
2672     if (v == NULL)
2673         goto onError;
2674     if (size == 0)
2675         goto done;
2676     p = PyUnicode_AS_UNICODE(v);
2677     while (size-- > 0) {
2678         Py_UNICODE ch = *s++;
2679         PyObject *w, *x;
2680
2681         /* Get mapping */
2682         w = PyInt_FromLong(ch);
2683         if (w == NULL)
2684             goto onError;
2685         x = PyObject_GetItem(mapping, w);
2686         Py_DECREF(w);
2687         if (x == NULL) {
2688             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2689                 /* No mapping found: default to 1-1 mapping */
2690                 PyErr_Clear();
2691                 *p++ = ch;
2692                 continue;
2693             }
2694             goto onError;
2695         }
2696
2697         /* Apply mapping */
2698         if (PyInt_Check(x))
2699             *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2700         else if (x == Py_None) {
2701             /* undefined mapping */
2702             if (translate_error(&s, &p, errors,
2703                                 "character maps to <undefined>")) {
2704                 Py_DECREF(x);
2705                 goto onError;
2706             }
2707         }
2708         else if (PyUnicode_Check(x)) {
2709             if (PyUnicode_GET_SIZE(x) != 1) {
2710                 /* 1-n mapping */
2711                 PyErr_SetString(PyExc_NotImplementedError,
2712                                 "1-n mappings are currently not implemented");
2713                 Py_DECREF(x);
2714                 goto onError;
2715             }
2716             *p++ = *PyUnicode_AS_UNICODE(x);
2717         }
2718         else {
2719             /* wrong return value */
2720             PyErr_SetString(PyExc_TypeError,
2721                   "translate mapping must return integer, None or unicode");
2722             Py_DECREF(x);
2723             goto onError;
2724         }
2725         Py_DECREF(x);
2726     }
2727     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2728         if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
2729             goto onError;
2730
2731  done:
2732     return (PyObject *)v;
2733
2734  onError:
2735     Py_XDECREF(v);
2736     return NULL;
2737 }
2738
2739 PyObject *PyUnicode_Translate(PyObject *str,
2740                               PyObject *mapping,
2741                               const char *errors)
2742 {
2743     PyObject *result;
2744
2745     str = PyUnicode_FromObject(str);
2746     if (str == NULL)
2747         goto onError;
2748     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2749                                         PyUnicode_GET_SIZE(str),
2750                                         mapping,
2751                                         errors);
2752     Py_DECREF(str);
2753     return result;
2754
2755  onError:
2756     Py_XDECREF(str);
2757     return NULL;
2758 }
2759
2760 /* --- Decimal Encoder ---------------------------------------------------- */
2761
2762 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2763                             int length,
2764                             char *output,
2765                             const char *errors)
2766 {
2767     Py_UNICODE *p, *end;
2768
2769     if (output == NULL) {
2770         PyErr_BadArgument();
2771         return -1;
2772     }
2773
2774     p = s;
2775     end = s + length;
2776     while (p < end) {
2777         register Py_UNICODE ch = *p++;
2778         int decimal;
2779
2780         if (Py_UNICODE_ISSPACE(ch)) {
2781             *output++ = ' ';
2782             continue;
2783         }
2784         decimal = Py_UNICODE_TODECIMAL(ch);
2785         if (decimal >= 0) {
2786             *output++ = '0' + decimal;
2787             continue;
2788         }
2789         if (0 < ch && ch < 256) {
2790             *output++ = (char)ch;
2791             continue;
2792         }
2793         /* All other characters are considered invalid */
2794         if (errors == NULL || strcmp(errors, "strict") == 0) {
2795             PyErr_SetString(PyExc_ValueError,
2796                             "invalid decimal Unicode string");
2797             goto onError;
2798         }
2799         else if (strcmp(errors, "ignore") == 0)
2800             continue;
2801         else if (strcmp(errors, "replace") == 0) {
2802             *output++ = '?';
2803             continue;
2804         }
2805     }
2806     /* 0-terminate the output string */
2807     *output++ = '\0';
2808     return 0;
2809
2810  onError:
2811     return -1;
2812 }
2813
2814 /* --- Helpers ------------------------------------------------------------ */
2815
2816 static
2817 int count(PyUnicodeObject *self,
2818           int start,
2819           int end,
2820           PyUnicodeObject *substring)
2821 {
2822     int count = 0;
2823
2824     if (start < 0)
2825         start += self->length;
2826     if (start < 0)
2827         start = 0;
2828     if (end > self->length)
2829         end = self->length;
2830     if (end < 0)
2831         end += self->length;
2832     if (end < 0)
2833         end = 0;
2834
2835     if (substring->length == 0)
2836         return (end - start + 1);
2837
2838     end -= substring->length;
2839
2840     while (start <= end)
2841         if (Py_UNICODE_MATCH(self, start, substring)) {
2842             count++;
2843             start += substring->length;
2844         } else
2845             start++;
2846
2847     return count;
2848 }
2849
2850 int PyUnicode_Count(PyObject *str,
2851                     PyObject *substr,
2852                     int start,
2853                     int end)
2854 {
2855     int result;
2856
2857     str = PyUnicode_FromObject(str);
2858     if (str == NULL)
2859         return -1;
2860     substr = PyUnicode_FromObject(substr);
2861     if (substr == NULL) {
2862         Py_DECREF(str);
2863         return -1;
2864     }
2865
2866     result = count((PyUnicodeObject *)str,
2867                    start, end,
2868                    (PyUnicodeObject *)substr);
2869
2870     Py_DECREF(str);
2871     Py_DECREF(substr);
2872     return result;
2873 }
2874
2875 static
2876 int findstring(PyUnicodeObject *self,
2877                PyUnicodeObject *substring,
2878                int start,
2879                int end,
2880                int direction)
2881 {
2882     if (start < 0)
2883         start += self->length;
2884     if (start < 0)
2885         start = 0;
2886
2887     if (substring->length == 0)
2888         return start;
2889
2890     if (end > self->length)
2891         end = self->length;
2892     if (end < 0)
2893         end += self->length;
2894     if (end < 0)
2895         end = 0;
2896
2897     end -= substring->length;
2898
2899     if (direction < 0) {
2900         for (; end >= start; end--)
2901             if (Py_UNICODE_MATCH(self, end, substring))
2902                 return end;
2903     } else {
2904         for (; start <= end; start++)
2905             if (Py_UNICODE_MATCH(self, start, substring))
2906                 return start;
2907     }
2908
2909     return -1;
2910 }
2911
2912 int PyUnicode_Find(PyObject *str,
2913                    PyObject *substr,
2914                    int start,
2915                    int end,
2916                    int direction)
2917 {
2918     int result;
2919
2920     str = PyUnicode_FromObject(str);
2921     if (str == NULL)
2922         return -1;
2923     substr = PyUnicode_FromObject(substr);
2924     if (substr == NULL) {
2925         Py_DECREF(substr);
2926         return -1;
2927     }
2928
2929     result = findstring((PyUnicodeObject *)str,
2930                         (PyUnicodeObject *)substr,
2931                         start, end, direction);
2932     Py_DECREF(str);
2933     Py_DECREF(substr);
2934     return result;
2935 }
2936
2937 static
2938 int tailmatch(PyUnicodeObject *self,
2939               PyUnicodeObject *substring,
2940               int start,
2941               int end,
2942               int direction)
2943 {
2944     if (start < 0)
2945         start += self->length;
2946     if (start < 0)
2947         start = 0;
2948
2949     if (substring->length == 0)
2950         return 1;
2951
2952     if (end > self->length)
2953         end = self->length;
2954     if (end < 0)
2955         end += self->length;
2956     if (end < 0)
2957         end = 0;
2958
2959     end -= substring->length;
2960     if (end < start)
2961         return 0;
2962
2963     if (direction > 0) {
2964         if (Py_UNICODE_MATCH(self, end, substring))
2965             return 1;
2966     } else {
2967         if (Py_UNICODE_MATCH(self, start, substring))
2968             return 1;
2969     }
2970
2971     return 0;
2972 }
2973
2974 int PyUnicode_Tailmatch(PyObject *str,
2975                         PyObject *substr,
2976                         int start,
2977                         int end,
2978                         int direction)
2979 {
2980     int result;
2981
2982     str = PyUnicode_FromObject(str);
2983     if (str == NULL)
2984         return -1;
2985     substr = PyUnicode_FromObject(substr);
2986     if (substr == NULL) {
2987         Py_DECREF(substr);
2988         return -1;
2989     }
2990
2991     result = tailmatch((PyUnicodeObject *)str,
2992                        (PyUnicodeObject *)substr,
2993                        start, end, direction);
2994     Py_DECREF(str);
2995     Py_DECREF(substr);
2996     return result;
2997 }
2998
2999 static
3000 const Py_UNICODE *findchar(const Py_UNICODE *s,
3001                      int size,
3002                      Py_UNICODE ch)
3003 {
3004     /* like wcschr, but doesn't stop at NULL characters */
3005
3006     while (size-- > 0) {
3007         if (*s == ch)
3008             return s;
3009         s++;
3010     }
3011
3012     return NULL;
3013 }
3014
3015 /* Apply fixfct filter to the Unicode object self and return a
3016    reference to the modified object */
3017
3018 static
3019 PyObject *fixup(PyUnicodeObject *self,
3020                 int (*fixfct)(PyUnicodeObject *s))
3021 {
3022
3023     PyUnicodeObject *u;
3024
3025     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
3026     if (u == NULL)
3027         return NULL;
3028
3029     Py_UNICODE_COPY(u->str, self->str, self->length);
3030
3031     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
3032         /* fixfct should return TRUE if it modified the buffer. If
3033            FALSE, return a reference to the original buffer instead
3034            (to save space, not time) */
3035         Py_INCREF(self);
3036         Py_DECREF(u);
3037         return (PyObject*) self;
3038     }
3039     return (PyObject*) u;
3040 }
3041
3042 static
3043 int fixupper(PyUnicodeObject *self)
3044 {
3045     int len = self->length;
3046     Py_UNICODE *s = self->str;
3047     int status = 0;
3048
3049     while (len-- > 0) {
3050         register Py_UNICODE ch;
3051
3052         ch = Py_UNICODE_TOUPPER(*s);
3053         if (ch != *s) {
3054             status = 1;
3055             *s = ch;
3056         }
3057         s++;
3058     }
3059
3060     return status;
3061 }
3062
3063 static
3064 int fixlower(PyUnicodeObject *self)
3065 {
3066     int len = self->length;
3067     Py_UNICODE *s = self->str;
3068     int status = 0;
3069
3070     while (len-- > 0) {
3071         register Py_UNICODE ch;
3072
3073         ch = Py_UNICODE_TOLOWER(*s);
3074         if (ch != *s) {
3075             status = 1;
3076             *s = ch;
3077         }
3078         s++;
3079     }
3080
3081     return status;
3082 }
3083
3084 static
3085 int fixswapcase(PyUnicodeObject *self)
3086 {
3087     int len = self->length;
3088     Py_UNICODE *s = self->str;
3089     int status = 0;
3090
3091     while (len-- > 0) {
3092         if (Py_UNICODE_ISUPPER(*s)) {
3093             *s = Py_UNICODE_TOLOWER(*s);
3094             status = 1;
3095         } else if (Py_UNICODE_ISLOWER(*s)) {
3096             *s = Py_UNICODE_TOUPPER(*s);
3097             status = 1;
3098         }
3099         s++;
3100     }
3101
3102     return status;
3103 }
3104
3105 static
3106 int fixcapitalize(PyUnicodeObject *self)
3107 {
3108     int len = self->length;
3109     Py_UNICODE *s = self->str;
3110     int status = 0;
3111
3112     if (len == 0)
3113         return 0;
3114     if (Py_UNICODE_ISLOWER(*s)) {
3115         *s = Py_UNICODE_TOUPPER(*s);
3116         status = 1;
3117     }
3118     s++;
3119     while (--len > 0) {
3120         if (Py_UNICODE_ISUPPER(*s)) {
3121             *s = Py_UNICODE_TOLOWER(*s);
3122             status = 1;
3123         }
3124         s++;
3125     }
3126     return status;
3127 }
3128
3129 static
3130 int fixtitle(PyUnicodeObject *self)
3131 {
3132     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3133     register Py_UNICODE *e;
3134     int previous_is_cased;
3135
3136     /* Shortcut for single character strings */
3137     if (PyUnicode_GET_SIZE(self) == 1) {
3138         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3139         if (*p != ch) {
3140             *p = ch;
3141             return 1;
3142         }
3143         else
3144             return 0;
3145     }
3146
3147     e = p + PyUnicode_GET_SIZE(self);
3148     previous_is_cased = 0;
3149     for (; p < e; p++) {
3150         register const Py_UNICODE ch = *p;
3151
3152         if (previous_is_cased)
3153             *p = Py_UNICODE_TOLOWER(ch);
3154         else
3155             *p = Py_UNICODE_TOTITLE(ch);
3156
3157         if (Py_UNICODE_ISLOWER(ch) ||
3158             Py_UNICODE_ISUPPER(ch) ||
3159             Py_UNICODE_ISTITLE(ch))
3160             previous_is_cased = 1;
3161         else
3162             previous_is_cased = 0;
3163     }
3164     return 1;
3165 }
3166
3167 PyObject *PyUnicode_Join(PyObject *separator,
3168                          PyObject *seq)
3169 {
3170     Py_UNICODE *sep;
3171     int seplen;
3172     PyUnicodeObject *res = NULL;
3173     int reslen = 0;
3174     Py_UNICODE *p;
3175     int sz = 100;
3176     int i;
3177     PyObject *it;
3178
3179     it = PyObject_GetIter(seq);
3180     if (it == NULL)
3181         return NULL;
3182
3183     if (separator == NULL) {
3184         Py_UNICODE blank = ' ';
3185         sep = &blank;
3186         seplen = 1;
3187     }
3188     else {
3189         separator = PyUnicode_FromObject(separator);
3190         if (separator == NULL)
3191             goto onError;
3192         sep = PyUnicode_AS_UNICODE(separator);
3193         seplen = PyUnicode_GET_SIZE(separator);
3194     }
3195
3196     res = _PyUnicode_New(sz);
3197     if (res == NULL)
3198         goto onError;
3199     p = PyUnicode_AS_UNICODE(res);
3200     reslen = 0;
3201
3202     for (i = 0; ; ++i) {
3203         int itemlen;
3204         PyObject *item = PyIter_Next(it);
3205         if (item == NULL) {
3206             if (PyErr_Occurred())
3207                 goto onError;
3208             break;
3209         }
3210         if (!PyUnicode_Check(item)) {
3211             PyObject *v;
3212             if (!PyString_Check(item)) {
3213                 PyErr_Format(PyExc_TypeError,
3214                              "sequence item %i: expected string or Unicode,"
3215                              " %.80s found",
3216                              i, item->ob_type->tp_name);
3217                 Py_DECREF(item);
3218                 goto onError;
3219             }
3220             v = PyUnicode_FromObject(item);
3221             Py_DECREF(item);
3222             item = v;
3223             if (item == NULL)
3224                 goto onError;
3225         }
3226         itemlen = PyUnicode_GET_SIZE(item);
3227         while (reslen + itemlen + seplen >= sz) {
3228             if (_PyUnicode_Resize(&res, sz*2)) {
3229                 Py_DECREF(item);
3230                 goto onError;
3231             }
3232             sz *= 2;
3233             p = PyUnicode_AS_UNICODE(res) + reslen;
3234         }
3235         if (i > 0) {
3236             Py_UNICODE_COPY(p, sep, seplen);
3237             p += seplen;
3238             reslen += seplen;
3239         }
3240         Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
3241         p += itemlen;
3242         reslen += itemlen;
3243         Py_DECREF(item);
3244     }
3245     if (_PyUnicode_Resize(&res, reslen))
3246         goto onError;
3247
3248     Py_XDECREF(separator);
3249     Py_DECREF(it);
3250     return (PyObject *)res;
3251
3252  onError:
3253     Py_XDECREF(separator);
3254     Py_XDECREF(res);
3255     Py_DECREF(it);
3256     return NULL;
3257 }
3258
3259 static
3260 PyUnicodeObject *pad(PyUnicodeObject *self,
3261                      int left,
3262                      int right,
3263                      Py_UNICODE fill)
3264 {
3265     PyUnicodeObject *u;
3266
3267     if (left < 0)
3268         left = 0;
3269     if (right < 0)
3270         right = 0;
3271
3272     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
3273         Py_INCREF(self);
3274         return self;
3275     }
3276
3277     u = _PyUnicode_New(left + self->length + right);
3278     if (u) {
3279         if (left)
3280             Py_UNICODE_FILL(u->str, fill, left);
3281         Py_UNICODE_COPY(u->str + left, self->str, self->length);
3282         if (right)
3283             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3284     }
3285
3286     return u;
3287 }
3288
/* Append the slice data[left:right] to list as a new Unicode object.

   The expansion relies on the surrounding function providing a
   PyObject *str scratch variable, the PyObject *list being filled and
   an onError label, which is jumped to if creating or appending the
   slice fails.  The temporary reference held in str is released in
   either case.

   The arguments are parenthesised and the body is wrapped in
   do { ... } while (0) so that an invocation followed by a semicolon
   is exactly one statement, whatever expressions are passed in. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3299
3300 static
3301 PyObject *split_whitespace(PyUnicodeObject *self,
3302                            PyObject *list,
3303                            int maxcount)
3304 {
3305     register int i;
3306     register int j;
3307     int len = self->length;
3308     PyObject *str;
3309
3310     for (i = j = 0; i < len; ) {
3311         /* find a token */
3312         while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3313             i++;
3314         j = i;
3315         while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3316             i++;
3317         if (j < i) {
3318             if (maxcount-- <= 0)
3319                 break;
3320             SPLIT_APPEND(self->str, j, i);
3321             while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3322                 i++;
3323             j = i;
3324         }
3325     }
3326     if (j < len) {
3327         SPLIT_APPEND(self->str, j, len);
3328     }
3329     return list;
3330
3331  onError:
3332     Py_DECREF(list);
3333     return NULL;
3334 }
3335
3336 PyObject *PyUnicode_Splitlines(PyObject *string,
3337                                int keepends)
3338 {
3339     register int i;
3340     register int j;
3341     int len;
3342     PyObject *list;
3343     PyObject *str;
3344     Py_UNICODE *data;
3345
3346     string = PyUnicode_FromObject(string);
3347     if (string == NULL)
3348         return NULL;
3349     data = PyUnicode_AS_UNICODE(string);
3350     len = PyUnicode_GET_SIZE(string);
3351
3352     list = PyList_New(0);
3353     if (!list)
3354         goto onError;
3355
3356     for (i = j = 0; i < len; ) {
3357         int eol;
3358
3359         /* Find a line and append it */
3360         while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3361             i++;
3362
3363         /* Skip the line break reading CRLF as one line break */
3364         eol = i;
3365         if (i < len) {
3366             if (data[i] == '\r' && i + 1 < len &&
3367                 data[i+1] == '\n')
3368                 i += 2;
3369             else
3370                 i++;
3371             if (keepends)
3372                 eol = i;
3373         }
3374         SPLIT_APPEND(data, j, eol);
3375         j = i;
3376     }
3377     if (j < len) {
3378         SPLIT_APPEND(data, j, len);
3379     }
3380
3381     Py_DECREF(string);
3382     return list;
3383
3384  onError:
3385     Py_DECREF(list);
3386     Py_DECREF(string);
3387     return NULL;
3388 }
3389
3390 static
3391 PyObject *split_char(PyUnicodeObject *self,
3392                      PyObject *list,
3393                      Py_UNICODE ch,
3394                      int maxcount)
3395 {
3396     register int i;
3397     register int j;
3398     int len = self->length;
3399     PyObject *str;
3400
3401     for (i = j = 0; i < len; ) {
3402         if (self->str[i] == ch) {
3403             if (maxcount-- <= 0)
3404                 break;
3405             SPLIT_APPEND(self->str, j, i);
3406             i = j = i + 1;
3407         } else
3408             i++;
3409     }
3410     if (j <= len) {
3411         SPLIT_APPEND(self->str, j, len);
3412     }
3413     return list;
3414
3415  onError:
3416     Py_DECREF(list);
3417     return NULL;
3418 }
3419
3420 static
3421 PyObject *split_substring(PyUnicodeObject *self,
3422                           PyObject *list,
3423                           PyUnicodeObject *substring,
3424                           int maxcount)
3425 {
3426     register int i;
3427     register int j;
3428     int len = self->length;
3429     int sublen = substring->length;
3430     PyObject *str;
3431
3432     for (i = j = 0; i <= len - sublen; ) {
3433         if (Py_UNICODE_MATCH(self, i, substring)) {
3434             if (maxcount-- <= 0)
3435                 break;
3436             SPLIT_APPEND(self->str, j, i);
3437             i = j = i + sublen;
3438         } else
3439             i++;
3440     }
3441     if (j <= len) {
3442         SPLIT_APPEND(self->str, j, len);
3443     }
3444     return list;
3445
3446  onError:
3447     Py_DECREF(list);
3448     return NULL;
3449 }
3450
3451 #undef SPLIT_APPEND
3452
3453 static
3454 PyObject *split(PyUnicodeObject *self,
3455                 PyUnicodeObject *substring,
3456                 int maxcount)
3457 {
3458     PyObject *list;
3459
3460     if (maxcount < 0)
3461         maxcount = INT_MAX;
3462
3463     list = PyList_New(0);
3464     if (!list)
3465         return NULL;
3466
3467     if (substring == NULL)
3468         return split_whitespace(self,list,maxcount);
3469
3470     else if (substring->length == 1)
3471         return split_char(self,list,substring->str[0],maxcount);
3472
3473     else if (substring->length == 0) {
3474         Py_DECREF(list);
3475         PyErr_SetString(PyExc_ValueError, "empty separator");
3476         return NULL;
3477     }
3478     else
3479         return split_substring(self,list,substring,maxcount);
3480 }
3481
3482 static
3483 PyObject *strip(PyUnicodeObject *self,
3484                 int left,
3485                 int right)
3486 {
3487     Py_UNICODE *p = self->str;
3488     int start = 0;
3489     int end = self->length;
3490
3491     if (left)
3492         while (start < end && Py_UNICODE_ISSPACE(p[start]))
3493             start++;
3494
3495     if (right)
3496         while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3497             end--;
3498
3499     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
3500         /* couldn't strip anything off, return original string */
3501         Py_INCREF(self);
3502         return (PyObject*) self;
3503     }
3504
3505     return (PyObject*) PyUnicode_FromUnicode(
3506         self->str + start,
3507         end - start
3508         );
3509 }
3510
3511 static
3512 PyObject *replace(PyUnicodeObject *self,
3513                   PyUnicodeObject *str1,
3514                   PyUnicodeObject *str2,
3515                   int maxcount)
3516 {
3517     PyUnicodeObject *u;
3518
3519     if (maxcount < 0)
3520         maxcount = INT_MAX;
3521
3522     if (str1->length == 1 && str2->length == 1) {
3523         int i;
3524
3525         /* replace characters */
3526         if (!findchar(self->str, self->length, str1->str[0]) &&
3527             PyUnicode_CheckExact(self)) {
3528             /* nothing to replace, return original string */
3529             Py_INCREF(self);
3530             u = self;
3531         } else {
3532             Py_UNICODE u1 = str1->str[0];
3533             Py_UNICODE u2 = str2->str[0];
3534
3535             u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3536                 NULL,
3537                 self->length
3538                 );
3539             if (u != NULL) {
3540                 Py_UNICODE_COPY(u->str, self->str,
3541                                 self->length);
3542                 for (i = 0; i < u->length; i++)
3543                     if (u->str[i] == u1) {
3544                         if (--maxcount < 0)
3545                             break;
3546                         u->str[i] = u2;
3547                     }
3548         }
3549         }
3550
3551     } else {
3552         int n, i;
3553         Py_UNICODE *p;
3554
3555         /* replace strings */
3556         n = count(self, 0, self->length, str1);
3557         if (n > maxcount)
3558             n = maxcount;
3559         if (n == 0 && PyUnicode_CheckExact(self)) {
3560             /* nothing to replace, return original string */
3561             Py_INCREF(self);
3562             u = self;
3563         } else {
3564             u = _PyUnicode_New(
3565                 self->length + n * (str2->length - str1->length));
3566             if (u) {
3567                 i = 0;
3568                 p = u->str;
3569                 while (i <= self->length - str1->length)
3570                     if (Py_UNICODE_MATCH(self, i, str1)) {
3571                         /* replace string segment */
3572                         Py_UNICODE_COPY(p, str2->str, str2->length);
3573                         p += str2->length;
3574                         i += str1->length;
3575                         if (--n <= 0) {
3576                             /* copy remaining part */
3577                             Py_UNICODE_COPY(p, self->str+i, self->length-i);
3578                             break;
3579                         }
3580                     } else
3581                         *p++ = self->str[i++];
3582             }
3583         }
3584     }
3585
3586     return (PyObject *) u;
3587 }
3588
3589 /* --- Unicode Object Methods --------------------------------------------- */
3590
3591 static char title__doc__[] =
3592 "S.title() -> unicode\n\
3593 \n\
3594 Return a titlecased version of S, i.e. words start with title case\n\
3595 characters, all remaining cased characters have lower case.";
3596
3597 static PyObject*
3598 unicode_title(PyUnicodeObject *self)
3599 {
3600     return fixup(self, fixtitle);
3601 }
3602
3603 static char capitalize__doc__[] =
3604 "S.capitalize() -> unicode\n\
3605 \n\
3606 Return a capitalized version of S, i.e. make the first character\n\
3607 have upper case.";
3608
3609 static PyObject*
3610 unicode_capitalize(PyUnicodeObject *self)
3611 {
3612     return fixup(self, fixcapitalize);
3613 }
3614
#if 0
/* NOTE: everything up to the matching #endif is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): split self at whitespace, capitalize every word with
   the fixcapitalize filter and join the words again.

   Returns the joined string, or NULL if splitting, capitalizing or
   joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    int i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the list's reference to the old word before the slot is
           overwritten with the capitalized one */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    /* (a NULL separator makes PyUnicode_Join use a single space) */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit: reached with item == NULL when fixup() failed above,
       otherwise with the result of the join */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
3652
3653 static char center__doc__[] =
3654 "S.center(width) -> unicode\n\
3655 \n\
3656 Return S centered in a Unicode string of length width. Padding is done\n\
3657 using spaces.";
3658
3659 static PyObject *
3660 unicode_center(PyUnicodeObject *self, PyObject *args)
3661 {
3662     int marg, left;
3663     int width;
3664
3665     if (!PyArg_ParseTuple(args, "i:center", &width))
3666         return NULL;
3667
3668     if (self->length >= width && PyUnicode_CheckExact(self)) {
3669         Py_INCREF(self);
3670         return (PyObject*) self;
3671     }
3672
3673     marg = width - self->length;
3674     left = marg / 2 + (marg & width & 1);
3675
3676     return (PyObject*) pad(self, left, marg - left, ' ');
3677 }
3678
3679 #if 0
3680
3681 /* This code should go into some future Unicode collation support
3682    module. The basic comparison should compare ordinals on a naive
3683    basis (this is what Java does and thus JPython too). */
3684
3685 /* speedy UTF-16 code point order comparison */
3686 /* gleaned from: */
3687 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3688
/* Adjustment table for the (compiled out) UTF-16 aware
   unicode_compare() below, indexed by the top five bits of a 16-bit
   code unit (c >> 11).  Only the last five entries are non-zero:
   index 27 (code units 0xD800-0xDFFF) is moved up by 0x2000 and
   indices 28-31 (0xE000-0xFFFF) are moved down by 0x800. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
3696
/* Compiled-out variant of unicode_compare(): compare str1 and str2
   character by character after adjusting each character with
   utf16Fixup.

   Returns -1, 0 or 1 if str1 is less than, equal to or greater than
   str2.  If one string is a prefix of the other, the shorter one is
   the smaller. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    int len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* (1<<11) * 26 is 0xD000; the table is only consulted for
           characters above it */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* All compared characters were equal: the remaining lengths decide */
    return (len1 < len2) ? -1 : (len1 != len2);
}
3728
3729 #else
3730
3731 static int
3732 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3733 {
3734     register int len1, len2;
3735
3736     Py_UNICODE *s1 = str1->str;
3737     Py_UNICODE *s2 = str2->str;
3738
3739     len1 = str1->length;
3740     len2 = str2->length;
3741
3742     while (len1 > 0 && len2 > 0) {
3743         Py_UNICODE c1, c2;
3744
3745         c1 = *s1++;
3746         c2 = *s2++;
3747
3748         if (c1 != c2)
3749             return (c1 < c2) ? -1 : 1;
3750
3751         len1--; len2--;
3752     }
3753
3754     return (len1 < len2) ? -1 : (len1 != len2);
3755 }
3756
3757 #endif
3758
3759 int PyUnicode_Compare(PyObject *left,
3760                       PyObject *right)
3761 {
3762     PyUnicodeObject *u = NULL, *v = NULL;
3763     int result;
3764
3765     /* Coerce the two arguments */
3766     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3767     if (u == NULL)
3768         goto onError;
3769     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3770     if (v == NULL)
3771         goto onError;
3772
3773     /* Shortcut for empty or interned objects */
3774     if (v == u) {
3775         Py_DECREF(u);
3776         Py_DECREF(v);
3777         return 0;
3778     }
3779
3780     result = unicode_compare(u, v);
3781
3782     Py_DECREF(u);
3783     Py_DECREF(v);
3784     return result;
3785
3786 onError:
3787     Py_XDECREF(u);
3788     Py_XDECREF(v);
3789     return -1;
3790 }
3791
3792 int PyUnicode_Contains(PyObject *container,
3793                        PyObject *element)
3794 {
3795     PyUnicodeObject *u = NULL, *v = NULL;
3796     int result;
3797     register const Py_UNICODE *p, *e;
3798     register Py_UNICODE ch;
3799
3800     /* Coerce the two arguments */
3801     v = (PyUnicodeObject *)PyUnicode_FromObject(element);
3802     if (v == NULL) {
3803         PyErr_SetString(PyExc_TypeError,
3804             "'in <string>' requires character as left operand");
3805         goto onError;
3806     }
3807     u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3808     if (u == NULL) {
3809         Py_DECREF(v);
3810         goto onError;
3811     }
3812
3813     /* Check v in u */
3814     if (PyUnicode_GET_SIZE(v) != 1) {
3815         PyErr_SetString(PyExc_TypeError,
3816             "'in <string>' requires character as left operand");
3817         goto onError;
3818     }
3819     ch = *PyUnicode_AS_UNICODE(v);
3820     p = PyUnicode_AS_UNICODE(u);
3821     e = p + PyUnicode_GET_SIZE(u);
3822     result = 0;
3823     while (p < e) {
3824         if (*p++ == ch) {
3825             result = 1;
3826             break;
3827         }
3828     }
3829
3830     Py_DECREF(u);
3831     Py_DECREF(v);
3832     return result;
3833
3834 onError:
3835     Py_XDECREF(u);
3836     Py_XDECREF(v);
3837     return -1;
3838 }
3839
3840 /* Concat to string or Unicode object giving a new Unicode object. */
3841
3842 PyObject *PyUnicode_Concat(PyObject *left,
3843                            PyObject *right)
3844 {
3845     PyUnicodeObject *u = NULL, *v = NULL, *w;
3846
3847     /* Coerce the two arguments */
3848     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3849     if (u == NULL)
3850         goto onError;
3851     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3852     if (v == NULL)
3853         goto onError;
3854
3855     /* Shortcuts */
3856     if (v == unicode_empty) {
3857         Py_DECREF(v);
3858         return (PyObject *)u;
3859     }
3860     if (u == unicode_empty) {
3861         Py_DECREF(u);
3862         return (PyObject *)v;
3863     }
3864
3865     /* Concat the two Unicode strings */
3866     w = _PyUnicode_New(u->length + v->length);
3867     if (w == NULL)
3868         goto onError;
3869     Py_UNICODE_COPY(w->str, u->str, u->length);
3870     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3871
3872     Py_DECREF(u);
3873     Py_DECREF(v);
3874     return (PyObject *)w;
3875
3876 onError:
3877     Py_XDECREF(u);
3878     Py_XDECREF(v);
3879     return NULL;
3880 }
3881
3882 static char count__doc__[] =
3883 "S.count(sub[, start[, end]]) -> int\n\
3884 \n\
3885 Return the number of occurrences of substring sub in Unicode string\n\
3886 S[start:end].  Optional arguments start and end are\n\
3887 interpreted as in slice notation.";
3888
3889 static PyObject *
3890 unicode_count(PyUnicodeObject *self, PyObject *args)
3891 {
3892     PyUnicodeObject *substring;
3893     int start = 0;
3894     int end = INT_MAX;
3895     PyObject *result;
3896
3897     if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3898                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
3899         return NULL;
3900
3901     substring = (PyUnicodeObject *)PyUnicode_FromObject(
3902                                                 (PyObject *)substring);
3903     if (substring == NULL)
3904         return NULL;
3905
3906     if (start < 0)
3907         start += self->length;
3908     if (start < 0)
3909         start = 0;
3910     if (end > self->length)
3911         end = self->length;
3912     if (end < 0)
3913         end += self->length;
3914     if (end < 0)
3915         end = 0;
3916
3917     result = PyInt_FromLong((long) count(self, start, end, substring));
3918
3919     Py_DECREF(substring);
3920     return result;
3921 }
3922
3923 static char encode__doc__[] =
3924 "S.encode([encoding[,errors]]) -> string\n\
3925 \n\
3926 Return an encoded string version of S. Default encoding is the current\n\
3927 default string encoding. errors may be given to set a different error\n\
3928 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3929 a ValueError. Other possible values are 'ignore' and 'replace'.";
3930
3931 static PyObject *
3932 unicode_encode(PyUnicodeObject *self, PyObject *args)
3933 {
3934     char *encoding = NULL;
3935     char *errors = NULL;
3936     if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3937         return NULL;
3938     return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3939 }
3940
3941 static char expandtabs__doc__[] =
3942 "S.expandtabs([tabsize]) -> unicode\n\
3943 \n\
3944 Return a copy of S where all tab characters are expanded using spaces.\n\
3945 If tabsize is not given, a tab size of 8 characters is assumed.";
3946
3947 static PyObject*
3948 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3949 {
3950     Py_UNICODE *e;
3951     Py_UNICODE *p;
3952     Py_UNICODE *q;
3953     int i, j;
3954     PyUnicodeObject *u;
3955     int tabsize = 8;
3956
3957     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3958         return NULL;
3959
3960     /* First pass: determine size of output string */
3961     i = j = 0;
3962     e = self->str + self->length;
3963     for (p = self->str; p < e; p++)
3964         if (*p == '\t') {
3965             if (tabsize > 0)
3966                 j += tabsize - (j % tabsize);
3967         }
3968         else {
3969             j++;
3970             if (*p == '\n' || *p == '\r') {
3971                 i += j;
3972                 j = 0;
3973             }
3974         }
3975
3976     /* Second pass: create output string and fill it */
3977     u = _PyUnicode_New(i + j);
3978     if (!u)
3979         return NULL;
3980
3981     j = 0;
3982     q = u->str;
3983
3984     for (p = self->str; p < e; p++)
3985         if (*p == '\t') {
3986             if (tabsize > 0) {
3987                 i = tabsize - (j % tabsize);
3988                 j += i;
3989                 while (i--)
3990                     *q++ = ' ';
3991             }
3992         }
3993         else {
3994             j++;
3995             *q++ = *p;
3996             if (*p == '\n' || *p == '\r')
3997                 j = 0;
3998         }
3999
4000     return (PyObject*) u;
4001 }
4002
4003 static char find__doc__[] =
4004 "S.find(sub [,start [,end]]) -> int\n\
4005 \n\
4006 Return the lowest index in S where substring sub is found,\n\
4007 such that sub is contained within s[start,end].  Optional\n\
4008 arguments start and end are interpreted as in slice notation.\n\
4009 \n\
4010 Return -1 on failure.";
4011
4012 static PyObject *
4013 unicode_find(PyUnicodeObject *self, PyObject *args)
4014 {
4015     PyUnicodeObject *substring;
4016     int start = 0;
4017     int end = INT_MAX;
4018     PyObject *result;
4019
4020     if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4021                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4022         return NULL;
4023     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4024                                                 (PyObject *)substring);
4025     if (substring == NULL)
4026         return NULL;
4027
4028     result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4029
4030     Py_DECREF(substring);
4031     return result;
4032 }
4033
4034 static PyObject *
4035 unicode_getitem(PyUnicodeObject *self, int index)
4036 {
4037     if (index < 0 || index >= self->length) {
4038         PyErr_SetString(PyExc_IndexError, "string index out of range");
4039         return NULL;
4040     }
4041
4042     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4043 }
4044
4045 static long
4046 unicode_hash(PyUnicodeObject *self)
4047 {
4048     /* Since Unicode objects compare equal to their ASCII string
4049        counterparts, they should use the individual character values
4050        as basis for their hash value.  This is needed to assure that
4051        strings and Unicode objects behave in the same way as
4052        dictionary keys. */
4053
4054     register int len;
4055     register Py_UNICODE *p;
4056     register long x;
4057
4058     if (self->hash != -1)
4059         return self->hash;
4060     len = PyUnicode_GET_SIZE(self);
4061     p = PyUnicode_AS_UNICODE(self);
4062     x = *p << 7;
4063     while (--len >= 0)
4064         x = (1000003*x) ^ *p++;
4065     x ^= PyUnicode_GET_SIZE(self);
4066     if (x == -1)
4067         x = -2;
4068     self->hash = x;
4069     return x;
4070 }
4071
4072 static char index__doc__[] =
4073 "S.index(sub [,start [,end]]) -> int\n\
4074 \n\
4075 Like S.find() but raise ValueError when the substring is not found.";
4076
4077 static PyObject *
4078 unicode_index(PyUnicodeObject *self, PyObject *args)
4079 {
4080     int result;
4081     PyUnicodeObject *substring;
4082     int start = 0;
4083     int end = INT_MAX;
4084
4085     if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4086                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4087         return NULL;
4088
4089     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4090                                                 (PyObject *)substring);
4091     if (substring == NULL)
4092         return NULL;
4093
4094     result = findstring(self, substring, start, end, 1);
4095
4096     Py_DECREF(substring);
4097     if (result < 0) {
4098         PyErr_SetString(PyExc_ValueError, "substring not found");
4099         return NULL;
4100     }
4101     return PyInt_FromLong(result);
4102 }
4103
4104 static char islower__doc__[] =
4105 "S.islower() -> int\n\
4106 \n\
4107 Return 1 if  all cased characters in S are lowercase and there is\n\
4108 at least one cased character in S, 0 otherwise.";
4109
4110 static PyObject*
4111 unicode_islower(PyUnicodeObject *self)
4112 {
4113     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4114     register const Py_UNICODE *e;
4115     int cased;
4116
4117     /* Shortcut for single character strings */
4118     if (PyUnicode_GET_SIZE(self) == 1)
4119         return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4120
4121     /* Special case for empty strings */
4122     if (PyString_GET_SIZE(self) == 0)
4123         return PyInt_FromLong(0);
4124
4125     e = p + PyUnicode_GET_SIZE(self);
4126     cased = 0;
4127     for (; p < e; p++) {
4128         register const Py_UNICODE ch = *p;
4129
4130         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4131             return PyInt_FromLong(0);
4132         else if (!cased && Py_UNICODE_ISLOWER(ch))
4133             cased = 1;
4134     }
4135     return PyInt_FromLong(cased);
4136 }
4137
4138 static char isupper__doc__[] =
4139 "S.isupper() -> int\n\
4140 \n\
4141 Return 1 if  all cased characters in S are uppercase and there is\n\
4142 at least one cased character in S, 0 otherwise.";
4143
4144 static PyObject*
4145 unicode_isupper(PyUnicodeObject *self)
4146 {
4147     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4148     register const Py_UNICODE *e;
4149     int cased;
4150
4151     /* Shortcut for single character strings */
4152     if (PyUnicode_GET_SIZE(self) == 1)
4153         return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4154
4155     /* Special case for empty strings */
4156     if (PyString_GET_SIZE(self) == 0)
4157         return PyInt_FromLong(0);
4158
4159     e = p + PyUnicode_GET_SIZE(self);
4160     cased = 0;
4161     for (; p < e; p++) {
4162         register const Py_UNICODE ch = *p;
4163
4164         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4165             return PyInt_FromLong(0);
4166         else if (!cased && Py_UNICODE_ISUPPER(ch))
4167             cased = 1;
4168     }
4169     return PyInt_FromLong(cased);
4170 }
4171
4172 static char istitle__doc__[] =
4173 "S.istitle() -> int\n\
4174 \n\
4175 Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4176 may only follow uncased characters and lowercase characters only cased\n\
4177 ones. Return 0 otherwise.";
4178
4179 static PyObject*
4180 unicode_istitle(PyUnicodeObject *self)
4181 {
4182     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4183     register const Py_UNICODE *e;
4184     int cased, previous_is_cased;
4185
4186     /* Shortcut for single character strings */
4187     if (PyUnicode_GET_SIZE(self) == 1)
4188         return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4189                               (Py_UNICODE_ISUPPER(*p) != 0));
4190
4191     /* Special case for empty strings */
4192     if (PyString_GET_SIZE(self) == 0)
4193         return PyInt_FromLong(0);
4194
4195     e = p + PyUnicode_GET_SIZE(self);
4196     cased = 0;
4197     previous_is_cased = 0;
4198     for (; p < e; p++) {
4199         register const Py_UNICODE ch = *p;
4200
4201         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4202             if (previous_is_cased)
4203                 return PyInt_FromLong(0);
4204             previous_is_cased = 1;
4205             cased = 1;
4206         }
4207         else if (Py_UNICODE_ISLOWER(ch)) {
4208             if (!previous_is_cased)
4209                 return PyInt_FromLong(0);
4210             previous_is_cased = 1;
4211             cased = 1;
4212         }
4213         else
4214             previous_is_cased = 0;
4215     }
4216     return PyInt_FromLong(cased);
4217 }
4218
4219 static char isspace__doc__[] =
4220 "S.isspace() -> int\n\
4221 \n\
4222 Return 1 if there are only whitespace characters in S,\n\
4223 0 otherwise.";
4224
4225 static PyObject*
4226 unicode_isspace(PyUnicodeObject *self)
4227 {
4228     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4229     register const Py_UNICODE *e;
4230
4231     /* Shortcut for single character strings */
4232     if (PyUnicode_GET_SIZE(self) == 1 &&
4233         Py_UNICODE_ISSPACE(*p))
4234         return PyInt_FromLong(1);
4235
4236     /* Special case for empty strings */
4237     if (PyString_GET_SIZE(self) == 0)
4238         return PyInt_FromLong(0);
4239
4240     e = p + PyUnicode_GET_SIZE(self);
4241     for (; p < e; p++) {
4242         if (!Py_UNICODE_ISSPACE(*p))
4243             return PyInt_FromLong(0);
4244     }
4245     return PyInt_FromLong(1);
4246 }
4247
4248 static char isalpha__doc__[] =
4249 "S.isalpha() -> int\n\
4250 \n\
4251 Return 1 if  all characters in S are alphabetic\n\
4252 and there is at least one character in S, 0 otherwise.";
4253
4254 static PyObject*
4255 unicode_isalpha(PyUnicodeObject *self)
4256 {
4257     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4258     register const Py_UNICODE *e;
4259
4260     /* Shortcut for single character strings */
4261     if (PyUnicode_GET_SIZE(self) == 1 &&
4262         Py_UNICODE_ISALPHA(*p))
4263         return PyInt_FromLong(1);
4264
4265     /* Special case for empty strings */
4266     if (PyString_GET_SIZE(self) == 0)
4267         return PyInt_FromLong(0);
4268
4269     e = p + PyUnicode_GET_SIZE(self);
4270     for (; p < e; p++) {
4271         if (!Py_UNICODE_ISALPHA(*p))
4272             return PyInt_FromLong(0);
4273     }
4274     return PyInt_FromLong(1);
4275 }
4276
4277 static char isalnum__doc__[] =
4278 "S.isalnum() -> int\n\
4279 \n\
4280 Return 1 if  all characters in S are alphanumeric\n\
4281 and there is at least one character in S, 0 otherwise.";
4282
4283 static PyObject*
4284 unicode_isalnum(PyUnicodeObject *self)
4285 {
4286     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4287     register const Py_UNICODE *e;
4288
4289     /* Shortcut for single character strings */
4290     if (PyUnicode_GET_SIZE(self) == 1 &&
4291         Py_UNICODE_ISALNUM(*p))
4292         return PyInt_FromLong(1);
4293
4294     /* Special case for empty strings */
4295     if (PyString_GET_SIZE(self) == 0)
4296         return PyInt_FromLong(0);
4297
4298     e = p + PyUnicode_GET_SIZE(self);
4299     for (; p < e; p++) {
4300         if (!Py_UNICODE_ISALNUM(*p))
4301             return PyInt_FromLong(0);
4302     }
4303     return PyInt_FromLong(1);
4304 }
4305
4306 static char isdecimal__doc__[] =
4307 "S.isdecimal() -> int\n\
4308 \n\
4309 Return 1 if there are only decimal characters in S,\n\
4310 0 otherwise.";
4311
4312 static PyObject*
4313 unicode_isdecimal(PyUnicodeObject *self)
4314 {
4315     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4316     register const Py_UNICODE *e;
4317
4318     /* Shortcut for single character strings */
4319     if (PyUnicode_GET_SIZE(self) == 1 &&
4320         Py_UNICODE_ISDECIMAL(*p))
4321         return PyInt_FromLong(1);
4322
4323     /* Special case for empty strings */
4324     if (PyString_GET_SIZE(self) == 0)
4325         return PyInt_FromLong(0);
4326
4327     e = p + PyUnicode_GET_SIZE(self);
4328     for (; p < e; p++) {
4329         if (!Py_UNICODE_ISDECIMAL(*p))
4330             return PyInt_FromLong(0);
4331     }
4332     return PyInt_FromLong(1);
4333 }
4334
4335 static char isdigit__doc__[] =
4336 "S.isdigit() -> int\n\
4337 \n\
4338 Return 1 if there are only digit characters in S,\n\
4339 0 otherwise.";
4340
4341 static PyObject*
4342 unicode_isdigit(PyUnicodeObject *self)
4343 {
4344     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4345     register const Py_UNICODE *e;
4346
4347     /* Shortcut for single character strings */
4348     if (PyUnicode_GET_SIZE(self) == 1 &&
4349         Py_UNICODE_ISDIGIT(*p))
4350         return PyInt_FromLong(1);
4351
4352     /* Special case for empty strings */
4353     if (PyString_GET_SIZE(self) == 0)
4354         return PyInt_FromLong(0);
4355
4356     e = p + PyUnicode_GET_SIZE(self);
4357     for (; p < e; p++) {
4358         if (!Py_UNICODE_ISDIGIT(*p))
4359             return PyInt_FromLong(0);
4360     }
4361     return PyInt_FromLong(1);
4362 }
4363
4364 static char isnumeric__doc__[] =
4365 "S.isnumeric() -> int\n\
4366 \n\
4367 Return 1 if there are only numeric characters in S,\n\
4368 0 otherwise.";
4369
4370 static PyObject*
4371 unicode_isnumeric(PyUnicodeObject *self)
4372 {
4373     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4374     register const Py_UNICODE *e;
4375
4376     /* Shortcut for single character strings */
4377     if (PyUnicode_GET_SIZE(self) == 1 &&
4378         Py_UNICODE_ISNUMERIC(*p))
4379         return PyInt_FromLong(1);
4380
4381     /* Special case for empty strings */
4382     if (PyString_GET_SIZE(self) == 0)
4383         return PyInt_FromLong(0);
4384
4385     e = p + PyUnicode_GET_SIZE(self);
4386     for (; p < e; p++) {
4387         if (!Py_UNICODE_ISNUMERIC(*p))
4388             return PyInt_FromLong(0);
4389     }
4390     return PyInt_FromLong(1);
4391 }
4392
4393 static char join__doc__[] =
4394 "S.join(sequence) -> unicode\n\
4395 \n\
4396 Return a string which is the concatenation of the strings in the\n\
4397 sequence.  The separator between elements is S.";
4398
4399 static PyObject*
4400 unicode_join(PyObject *self, PyObject *data)
4401 {
4402     return PyUnicode_Join(self, data);
4403 }
4404
4405 static int
4406 unicode_length(PyUnicodeObject *self)
4407 {
4408     return self->length;
4409 }
4410
4411 static char ljust__doc__[] =
4412 "S.ljust(width) -> unicode\n\
4413 \n\
4414 Return S left justified in a Unicode string of length width. Padding is\n\
4415 done using spaces.";
4416
4417 static PyObject *
4418 unicode_ljust(PyUnicodeObject *self, PyObject *args)
4419 {
4420     int width;
4421     if (!PyArg_ParseTuple(args, "i:ljust", &width))
4422         return NULL;
4423
4424     if (self->length >= width && PyUnicode_CheckExact(self)) {
4425         Py_INCREF(self);
4426         return (PyObject*) self;
4427     }
4428
4429     return (PyObject*) pad(self, 0, width - self->length, ' ');
4430 }
4431
4432 static char lower__doc__[] =
4433 "S.lower() -> unicode\n\
4434 \n\
4435 Return a copy of the string S converted to lowercase.";
4436
4437 static PyObject*
4438 unicode_lower(PyUnicodeObject *self)
4439 {
4440     return fixup(self, fixlower);
4441 }
4442
4443 static char lstrip__doc__[] =
4444 "S.lstrip() -> unicode\n\
4445 \n\
4446 Return a copy of the string S with leading whitespace removed.";
4447
4448 static PyObject *
4449 unicode_lstrip(PyUnicodeObject *self)
4450 {
4451     return strip(self, 1, 0);
4452 }
4453
4454 static PyObject*
4455 unicode_repeat(PyUnicodeObject *str, int len)
4456 {
4457     PyUnicodeObject *u;
4458     Py_UNICODE *p;
4459     int nchars;
4460     size_t nbytes;
4461
4462     if (len < 0)
4463         len = 0;
4464
4465     if (len == 1 && PyUnicode_CheckExact(str)) {
4466         /* no repeat, return original string */
4467         Py_INCREF(str);
4468         return (PyObject*) str;
4469     }
4470
4471     /* ensure # of chars needed doesn't overflow int and # of bytes
4472      * needed doesn't overflow size_t
4473      */
4474     nchars = len * str->length;
4475     if (len && nchars / len != str->length) {
4476         PyErr_SetString(PyExc_OverflowError,
4477                         "repeated string is too long");
4478         return NULL;
4479     }
4480     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4481     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4482         PyErr_SetString(PyExc_OverflowError,
4483                         "repeated string is too long");
4484         return NULL;
4485     }
4486     u = _PyUnicode_New(nchars);
4487     if (!u)
4488         return NULL;
4489
4490     p = u->str;
4491
4492     while (len-- > 0) {
4493         Py_UNICODE_COPY(p, str->str, str->length);
4494         p += str->length;
4495     }
4496
4497     return (PyObject*) u;
4498 }
4499
4500 PyObject *PyUnicode_Replace(PyObject *obj,
4501                             PyObject *subobj,
4502                             PyObject *replobj,
4503                             int maxcount)
4504 {
4505     PyObject *self;
4506     PyObject *str1;
4507     PyObject *str2;
4508     PyObject *result;
4509
4510     self = PyUnicode_FromObject(obj);
4511     if (self == NULL)
4512         return NULL;
4513     str1 = PyUnicode_FromObject(subobj);
4514     if (str1 == NULL) {
4515         Py_DECREF(self);
4516         return NULL;
4517     }
4518     str2 = PyUnicode_FromObject(replobj);
4519     if (str2 == NULL) {
4520         Py_DECREF(self);
4521         Py_DECREF(str1);
4522         return NULL;
4523     }
4524     result = replace((PyUnicodeObject *)self,
4525                      (PyUnicodeObject *)str1,
4526                      (PyUnicodeObject *)str2,
4527                      maxcount);
4528     Py_DECREF(self);
4529     Py_DECREF(str1);
4530     Py_DECREF(str2);
4531     return result;
4532 }
4533
4534 static char replace__doc__[] =
4535 "S.replace (old, new[, maxsplit]) -> unicode\n\
4536 \n\
4537 Return a copy of S with all occurrences of substring\n\
4538 old replaced by new.  If the optional argument maxsplit is\n\
4539 given, only the first maxsplit occurrences are replaced.";
4540
4541 static PyObject*
4542 unicode_replace(PyUnicodeObject *self, PyObject *args)
4543 {
4544     PyUnicodeObject *str1;
4545     PyUnicodeObject *str2;
4546     int maxcount = -1;
4547     PyObject *result;
4548
4549     if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4550         return NULL;
4551     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4552     if (str1 == NULL)
4553         return NULL;
4554     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4555     if (str2 == NULL)
4556         return NULL;
4557
4558     result = replace(self, str1, str2, maxcount);
4559
4560     Py_DECREF(str1);
4561     Py_DECREF(str2);
4562     return result;
4563 }
4564
4565 static
4566 PyObject *unicode_repr(PyObject *unicode)
4567 {
4568     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4569                                 PyUnicode_GET_SIZE(unicode),
4570                                 1);
4571 }
4572
4573 static char rfind__doc__[] =
4574 "S.rfind(sub [,start [,end]]) -> int\n\
4575 \n\
4576 Return the highest index in S where substring sub is found,\n\
4577 such that sub is contained within s[start,end].  Optional\n\
4578 arguments start and end are interpreted as in slice notation.\n\
4579 \n\
4580 Return -1 on failure.";
4581
4582 static PyObject *
4583 unicode_rfind(PyUnicodeObject *self, PyObject *args)
4584 {
4585     PyUnicodeObject *substring;
4586     int start = 0;
4587     int end = INT_MAX;
4588     PyObject *result;
4589
4590     if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4591                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4592         return NULL;
4593     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4594                                                 (PyObject *)substring);
4595     if (substring == NULL)
4596         return NULL;
4597
4598     result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4599
4600     Py_DECREF(substring);
4601     return result;
4602 }
4603
4604 static char rindex__doc__[] =
4605 "S.rindex(sub [,start [,end]]) -> int\n\
4606 \n\
4607 Like S.rfind() but raise ValueError when the substring is not found.";
4608
4609 static PyObject *
4610 unicode_rindex(PyUnicodeObject *self, PyObject *args)
4611 {
4612     int result;
4613     PyUnicodeObject *substring;
4614     int start = 0;
4615     int end = INT_MAX;
4616
4617     if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4618                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4619         return NULL;
4620     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4621                                                 (PyObject *)substring);
4622     if (substring == NULL)
4623         return NULL;
4624
4625     result = findstring(self, substring, start, end, -1);
4626
4627     Py_DECREF(substring);
4628     if (result < 0) {
4629         PyErr_SetString(PyExc_ValueError, "substring not found");
4630         return NULL;
4631     }
4632     return PyInt_FromLong(result);
4633 }
4634
4635 static char rjust__doc__[] =
4636 "S.rjust(width) -> unicode\n\
4637 \n\
4638 Return S right justified in a Unicode string of length width. Padding is\n\
4639 done using spaces.";
4640
4641 static PyObject *
4642 unicode_rjust(PyUnicodeObject *self, PyObject *args)
4643 {
4644     int width;
4645     if (!PyArg_ParseTuple(args, "i:rjust", &width))
4646         return NULL;
4647
4648     if (self->length >= width && PyUnicode_CheckExact(self)) {
4649         Py_INCREF(self);
4650         return (PyObject*) self;
4651     }
4652
4653     return (PyObject*) pad(self, width - self->length, 0, ' ');
4654 }
4655
4656 static char rstrip__doc__[] =
4657 "S.rstrip() -> unicode\n\
4658 \n\
4659 Return a copy of the string S with trailing whitespace removed.";
4660
4661 static PyObject *
4662 unicode_rstrip(PyUnicodeObject *self)
4663 {
4664     return strip(self, 0, 1);
4665 }
4666
4667 static PyObject*
4668 unicode_slice(PyUnicodeObject *self, int start, int end)
4669 {
4670     /* standard clamping */
4671     if (start < 0)
4672         start = 0;
4673     if (end < 0)
4674         end = 0;
4675     if (end > self->length)
4676         end = self->length;
4677     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
4678         /* full slice, return original string */
4679         Py_INCREF(self);
4680         return (PyObject*) self;
4681     }
4682     if (start > end)
4683         start = end;
4684     /* copy slice */
4685     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4686                                              end - start);
4687 }
4688
4689 PyObject *PyUnicode_Split(PyObject *s,
4690                           PyObject *sep,
4691                           int maxsplit)
4692 {
4693     PyObject *result;
4694
4695     s = PyUnicode_FromObject(s);
4696     if (s == NULL)
4697         return NULL;
4698     if (sep != NULL) {
4699         sep = PyUnicode_FromObject(sep);
4700         if (sep == NULL) {
4701             Py_DECREF(s);
4702             return NULL;
4703         }
4704     }
4705
4706     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4707
4708     Py_DECREF(s);
4709     Py_XDECREF(sep);
4710     return result;
4711 }
4712
4713 static char split__doc__[] =
4714 "S.split([sep [,maxsplit]]) -> list of strings\n\
4715 \n\
4716 Return a list of the words in S, using sep as the\n\
4717 delimiter string.  If maxsplit is given, at most maxsplit\n\
4718 splits are done. If sep is not specified, any whitespace string\n\
4719 is a separator.";
4720
4721 static PyObject*
4722 unicode_split(PyUnicodeObject *self, PyObject *args)
4723 {
4724     PyObject *substring = Py_None;
4725     int maxcount = -1;
4726
4727     if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4728         return NULL;
4729
4730     if (substring == Py_None)
4731         return split(self, NULL, maxcount);
4732     else if (PyUnicode_Check(substring))
4733         return split(self, (PyUnicodeObject *)substring, maxcount);
4734     else
4735         return PyUnicode_Split((PyObject *)self, substring, maxcount);
4736 }
4737
4738 static char splitlines__doc__[] =
4739 "S.splitlines([keepends]]) -> list of strings\n\
4740 \n\
4741 Return a list of the lines in S, breaking at line boundaries.\n\
4742 Line breaks are not included in the resulting list unless keepends\n\
4743 is given and true.";
4744
4745 static PyObject*
4746 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4747 {
4748     int keepends = 0;
4749
4750     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
4751         return NULL;
4752
4753     return PyUnicode_Splitlines((PyObject *)self, keepends);
4754 }
4755
4756 static
4757 PyObject *unicode_str(PyUnicodeObject *self)
4758 {
4759     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
4760 }
4761
4762 static char strip__doc__[] =
4763 "S.strip() -> unicode\n\
4764 \n\
4765 Return a copy of S with leading and trailing whitespace removed.";
4766
4767 static PyObject *
4768 unicode_strip(PyUnicodeObject *self)
4769 {
4770     return strip(self, 1, 1);
4771 }
4772
4773 static char swapcase__doc__[] =
4774 "S.swapcase() -> unicode\n\
4775 \n\
4776 Return a copy of S with uppercase characters converted to lowercase\n\
4777 and vice versa.";
4778
4779 static PyObject*
4780 unicode_swapcase(PyUnicodeObject *self)
4781 {
4782     return fixup(self, fixswapcase);
4783 }
4784
4785 static char translate__doc__[] =
4786 "S.translate(table) -> unicode\n\
4787 \n\
4788 Return a copy of the string S, where all characters have been mapped\n\
4789 through the given translation table, which must be a mapping of\n\
4790 Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4791 are left untouched. Characters mapped to None are deleted.";
4792
4793 static PyObject*
4794 unicode_translate(PyUnicodeObject *self, PyObject *table)
4795 {
4796     return PyUnicode_TranslateCharmap(self->str,
4797                                       self->length,
4798                                       table,
4799                                       "ignore");
4800 }
4801
4802 static char upper__doc__[] =
4803 "S.upper() -> unicode\n\
4804 \n\
4805 Return a copy of S converted to uppercase.";
4806
4807 static PyObject*
4808 unicode_upper(PyUnicodeObject *self)
4809 {
4810     return fixup(self, fixupper);
4811 }
4812
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of the unicode.zfill() method (currently compiled out).

   A string that is already at least `width` long is returned as is when
   it is an exact unicode instance, and as a plain unicode copy otherwise
   (mirroring the PyUnicode_CheckExact() test used by ljust/rjust).
   Shorter strings are padded on the left with '0' through pad(); a
   leading '+' or '-' is then moved in front of the zeros.
   Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        if (PyUnicode_CheckExact(self)) {
            Py_INCREF(self);
            return (PyObject*) self;
        }
        /* never hand back a subclass instance unchanged */
        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
                                     PyUnicode_GET_SIZE(self));
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4848
#if 0
/* Debugging aid (compiled out): expose the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj;

    size_obj = PyInt_FromLong(unicode_freelist_size);
    return size_obj;
}
#endif
4856
4857 static char startswith__doc__[] =
4858 "S.startswith(prefix[, start[, end]]) -> int\n\
4859 \n\
4860 Return 1 if S starts with the specified prefix, otherwise return 0.  With\n\
4861 optional start, test S beginning at that position.  With optional end, stop\n\
4862 comparing S at that position.";
4863
4864 static PyObject *
4865 unicode_startswith(PyUnicodeObject *self,
4866                    PyObject *args)
4867 {
4868     PyUnicodeObject *substring;
4869     int start = 0;
4870     int end = INT_MAX;
4871     PyObject *result;
4872
4873     if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4874                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4875         return NULL;
4876     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4877                                                 (PyObject *)substring);
4878     if (substring == NULL)
4879         return NULL;
4880
4881     result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4882
4883     Py_DECREF(substring);
4884     return result;
4885 }
4886
4887
4888 static char endswith__doc__[] =
4889 "S.endswith(suffix[, start[, end]]) -> int\n\
4890 \n\
4891 Return 1 if S ends with the specified suffix, otherwise return 0.  With\n\
4892 optional start, test S beginning at that position.  With optional end, stop\n\
4893 comparing S at that position.";
4894
4895 static PyObject *
4896 unicode_endswith(PyUnicodeObject *self,
4897                  PyObject *args)
4898 {
4899     PyUnicodeObject *substring;
4900     int start = 0;
4901     int end = INT_MAX;
4902     PyObject *result;
4903
4904     if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4905                 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
4906         return NULL;
4907     substring = (PyUnicodeObject *)PyUnicode_FromObject(
4908                                                 (PyObject *)substring);
4909     if (substring == NULL)
4910         return NULL;
4911
4912     result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4913
4914     Py_DECREF(substring);
4915     return result;
4916 }
4917
4918
/* Method table for unicode objects.

   Each entry gives the Python-level method name, the C function that
   implements it, its calling convention flag (METH_VARARGS, METH_NOARGS
   or METH_O) and its docstring.  Entries inside "#if 0" sections are
   compiled out.  The table ends with a {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    /* Disabled methods; unicode_zfill is likewise compiled out where it
       is defined. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* sentinel marking the end of the table */
    {NULL, NULL}
};
4971
4972 static PySequenceMethods unicode_as_sequence = {
4973     (inquiry) unicode_length,           /* sq_length */
4974     (binaryfunc) PyUnicode_Concat,      /* sq_concat */
4975     (intargfunc) unicode_repeat,        /* sq_repeat */
4976     (intargfunc) unicode_getitem,       /* sq_item */
4977     (intintargfunc) unicode_slice,      /* sq_slice */
4978     0,                                  /* sq_ass_item */
4979     0,                                  /* sq_ass_slice */
4980     (objobjproc)PyUnicode_Contains,     /*sq_contains*/
4981 };
4982
4983 static int
4984 unicode_buffer_getreadbuf(PyUnicodeObject *self,
4985                           int index,
4986                           const void **ptr)
4987 {
4988     if (index != 0) {
4989         PyErr_SetString(PyExc_SystemError,
4990                         "accessing non-existent unicode segment");
4991         return -1;
4992     }
4993     *ptr = (void *) self->str;
4994     return PyUnicode_GET_DATA_SIZE(self);
4995 }
4996
4997 static int
4998 unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4999                            const void **ptr)
5000 {
5001     PyErr_SetString(PyExc_TypeError,
5002                     "cannot use unicode as modifyable buffer");
5003     return -1;
5004 }
5005
5006 static int
5007 unicode_buffer_getsegcount(PyUnicodeObject *self,
5008                            int *lenp)
5009 {
5010     if (lenp)
5011         *lenp = PyUnicode_GET_DATA_SIZE(self);
5012     return 1;
5013 }
5014
5015 static int
5016 unicode_buffer_getcharbuf(PyUnicodeObject *self,
5017                           int index,
5018                           const void **ptr)
5019 {
5020     PyObject *str;
5021
5022     if (index != 0) {
5023         PyErr_SetString(PyExc_SystemError,
5024                         "accessing non-existent unicode segment");
5025         return -1;
5026     }
5027     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
5028     if (str == NULL)
5029         return -1;
5030     *ptr = (void *) PyString_AS_STRING(str);
5031     return PyString_GET_SIZE(str);
5032 }
5033
5034 /* Helpers for PyUnicode_Format() */
5035
5036 static PyObject *
5037 getnextarg(PyObject *args, int arglen, int *p_argidx)
5038 {
5039     int argidx = *p_argidx;
5040     if (argidx < arglen) {
5041         (*p_argidx)++;
5042         if (arglen < 0)
5043             return args;
5044         else
5045             return PyTuple_GetItem(args, argidx);
5046     }
5047     PyErr_SetString(PyExc_TypeError,
5048                     "not enough arguments for format string");
5049     return NULL;
5050 }
5051
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' : left-justify within the field */
#define F_SIGN  0x02    /* '+' : always emit a sign */
#define F_BLANK 0x04    /* ' ' : blank in place of a plus sign */
#define F_ALT   0x08    /* '#' : alternate form */
#define F_ZERO  0x10    /* '0' : pad with zeros */
5057
5058 static
5059 int usprintf(register Py_UNICODE *buffer, char *format, ...)
5060 {
5061     register int i;
5062     int len;
5063     va_list va;
5064     char *charbuffer;
5065     va_start(va, format);
5066
5067     /* First, format the string as char array, then expand to Py_UNICODE
5068        array. */
5069     charbuffer = (char *)buffer;
5070     len = vsprintf(charbuffer, format, va);
5071     for (i = len - 1; i >= 0; i--)
5072         buffer[i] = (Py_UNICODE) charbuffer[i];
5073
5074     va_end(va);
5075     return len;
5076 }
5077
5078 static int
5079 formatfloat(Py_UNICODE *buf,
5080             size_t buflen,
5081             int flags,
5082             int prec,
5083             int type,
5084             PyObject *v)
5085 {
5086     /* fmt = '%#.' + `prec` + `type`
5087        worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
5088     char fmt[20];
5089     double x;
5090
5091     x = PyFloat_AsDouble(v);
5092     if (x == -1.0 && PyErr_Occurred())
5093         return -1;
5094     if (prec < 0)
5095         prec = 6;
5096     if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5097         type = 'g';
5098     PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5099                   (flags & F_ALT) ? "#" : "", prec, type);
5100     /* worst case length calc to ensure no buffer overrun:
5101          fmt = %#.<prec>g
5102          buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5103             for any double rep.)
5104          len = 1 + prec + 1 + 2 + 5 = 9 + prec
5105        If prec=0 the effective precision is 1 (the leading digit is
5106        always given), therefore increase by one to 10+prec. */
5107     if (buflen <= (size_t)10 + (size_t)prec) {
5108         PyErr_SetString(PyExc_OverflowError,
5109             "formatted float is too long (precision too long?)");
5110         return -1;
5111     }
5112     return usprintf(buf, fmt, x);
5113 }
5114
5115 static PyObject*
5116 formatlong(PyObject *val, int flags, int prec, int type)
5117 {
5118         char *buf;
5119         int i, len;
5120         PyObject *str; /* temporary string object. */
5121         PyUnicodeObject *result;
5122
5123         str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5124         if (!str)
5125                 return NULL;
5126         result = _PyUnicode_New(len);
5127         for (i = 0; i < len; i++)
5128                 result->str[i] = buf[i];
5129         result->str[len] = 0;
5130         Py_DECREF(str);
5131         return (PyObject*)result;
5132 }
5133
5134 static int
5135 formatint(Py_UNICODE *buf,
5136           size_t buflen,
5137           int flags,
5138           int prec,
5139           int type,
5140           PyObject *v)
5141 {
5142     /* fmt = '%#.' + `prec` + 'l' + `type`
5143        worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5144        + 1 + 1 = 24*/
5145     char fmt[64]; /* plenty big enough! */
5146     long x;
5147     int use_native_c_format = 1;
5148
5149     x = PyInt_AsLong(v);
5150     if (x == -1 && PyErr_Occurred())
5151         return -1;
5152     if (prec < 0)
5153         prec = 1;
5154     /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5155        worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5156     if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5157         PyErr_SetString(PyExc_OverflowError,
5158             "formatted integer is too long (precision too long?)");
5159         return -1;
5160     }
5161     /* When converting 0 under %#x or %#X, C leaves off the base marker,
5162      * but we want it (for consistency with other %#x conversions, and
5163      * for consistency with Python's hex() function).
5164      * BUG 28-Apr-2001 tim:  At least two platform Cs (Metrowerks &
5165      * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5166      * So add it only if the platform doesn't already.
5167      */
5168     if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5169         /* Only way to know what the platform does is to try it. */
5170         PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
5171         if (fmt[1] != (char)type) {
5172             /* Supply our own leading 0x/0X -- needed under std C */
5173             use_native_c_format = 0;
5174             PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
5175         }
5176     }
5177     if (use_native_c_format)
5178          PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5179                        (flags & F_ALT) ? "#" : "", prec, type);
5180     return usprintf(buf, fmt, x);
5181 }
5182
5183 static int
5184 formatchar(Py_UNICODE *buf,
5185            size_t buflen,
5186            PyObject *v)
5187 {
5188     /* presume that the buffer is at least 2 characters long */
5189     if (PyUnicode_Check(v)) {
5190         if (PyUnicode_GET_SIZE(v) != 1)
5191             goto onError;
5192         buf[0] = PyUnicode_AS_UNICODE(v)[0];
5193     }
5194
5195     else if (PyString_Check(v)) {
5196         if (PyString_GET_SIZE(v) != 1)
5197             goto onError;
5198         buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5199     }
5200
5201     else {
5202         /* Integer input truncated to a character */
5203         long x;
5204         x = PyInt_AsLong(v);
5205         if (x == -1 && PyErr_Occurred())
5206             goto onError;
5207         buf[0] = (char) x;
5208     }
5209     buf[1] = '\0';
5210     return 1;
5211
5212  onError:
5213     PyErr_SetString(PyExc_TypeError,
5214                     "%c requires int or char");
5215     return -1;
5216 }
5217
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in elements) of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
5227
5228 PyObject *PyUnicode_Format(PyObject *format,
5229                            PyObject *args)
5230 {
5231     Py_UNICODE *fmt, *res;
5232     int fmtcnt, rescnt, reslen, arglen, argidx;
5233     int args_owned = 0;
5234     PyUnicodeObject *result = NULL;
5235     PyObject *dict = NULL;
5236     PyObject *uformat;
5237
5238     if (format == NULL || args == NULL) {
5239         PyErr_BadInternalCall();
5240         return NULL;
5241     }
5242     uformat = PyUnicode_FromObject(format);
5243     if (uformat == NULL)
5244         return NULL;
5245     fmt = PyUnicode_AS_UNICODE(uformat);
5246     fmtcnt = PyUnicode_GET_SIZE(uformat);
5247
5248     reslen = rescnt = fmtcnt + 100;
5249     result = _PyUnicode_New(reslen);
5250     if (result == NULL)
5251         goto onError;
5252     res = PyUnicode_AS_UNICODE(result);
5253
5254     if (PyTuple_Check(args)) {
5255         arglen = PyTuple_Size(args);
5256         argidx = 0;
5257     }
5258     else {
5259         arglen = -1;
5260         argidx = -2;
5261     }
5262     if (args->ob_type->tp_as_mapping)
5263         dict = args;
5264
5265     while (--fmtcnt >= 0) {
5266         if (*fmt != '%') {
5267             if (--rescnt < 0) {
5268                 rescnt = fmtcnt + 100;
5269                 reslen += rescnt;
5270                 if (_PyUnicode_Resize(&result, reslen) < 0)
5271                     return NULL;
5272                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5273                 --rescnt;
5274             }
5275             *res++ = *fmt++;
5276         }
5277         else {
5278             /* Got a format specifier */
5279             int flags = 0;
5280             int width = -1;
5281             int prec = -1;
5282             Py_UNICODE c = '\0';
5283             Py_UNICODE fill;
5284             PyObject *v = NULL;
5285             PyObject *temp = NULL;
5286             Py_UNICODE *pbuf;
5287             Py_UNICODE sign;
5288             int len;
5289             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
5290
5291             fmt++;
5292             if (*fmt == '(') {
5293                 Py_UNICODE *keystart;
5294                 int keylen;
5295                 PyObject *key;
5296                 int pcount = 1;
5297
5298                 if (dict == NULL) {
5299                     PyErr_SetString(PyExc_TypeError,
5300                                     "format requires a mapping");
5301                     goto onError;
5302                 }
5303                 ++fmt;
5304                 --fmtcnt;
5305                 keystart = fmt;
5306                 /* Skip over balanced parentheses */
5307                 while (pcount > 0 && --fmtcnt >= 0) {
5308                     if (*fmt == ')')
5309                         --pcount;
5310                     else if (*fmt == '(')
5311                         ++pcount;
5312                     fmt++;
5313                 }
5314                 keylen = fmt - keystart - 1;
5315                 if (fmtcnt < 0 || pcount > 0) {
5316                     PyErr_SetString(PyExc_ValueError,
5317                                     "incomplete format key");
5318                     goto onError;
5319                 }
5320 #if 0
5321                 /* keys are converted to strings using UTF-8 and
5322                    then looked up since Python uses strings to hold
5323                    variables names etc. in its namespaces and we
5324                    wouldn't want to break common idioms. */
5325                 key = PyUnicode_EncodeUTF8(keystart,
5326                                            keylen,
5327                                            NULL);
5328 #else
5329                 key = PyUnicode_FromUnicode(keystart, keylen);
5330 #endif
5331                 if (key == NULL)
5332                     goto onError;
5333                 if (args_owned) {
5334                     Py_DECREF(args);
5335                     args_owned = 0;
5336                 }
5337                 args = PyObject_GetItem(dict, key);
5338                 Py_DECREF(key);
5339                 if (args == NULL) {
5340                     goto onError;
5341                 }
5342                 args_owned = 1;
5343                 arglen = -1;
5344                 argidx = -2;
5345             }
5346             while (--fmtcnt >= 0) {
5347                 switch (c = *fmt++) {
5348                 case '-': flags |= F_LJUST; continue;
5349                 case '+': flags |= F_SIGN; continue;
5350                 case ' ': flags |= F_BLANK; continue;
5351                 case '#': flags |= F_ALT; continue;
5352                 case '0': flags |= F_ZERO; continue;
5353                 }
5354                 break;
5355             }
5356             if (c == '*') {
5357                 v = getnextarg(args, arglen, &argidx);
5358                 if (v == NULL)
5359                     goto onError;
5360                 if (!PyInt_Check(v)) {
5361                     PyErr_SetString(PyExc_TypeError,
5362                                     "* wants int");
5363                     goto onError;
5364                 }
5365                 width = PyInt_AsLong(v);
5366                 if (width < 0) {
5367                     flags |= F_LJUST;
5368                     width = -width;
5369                 }
5370                 if (--fmtcnt >= 0)
5371                     c = *fmt++;
5372             }
5373             else if (c >= '0' && c <= '9') {
5374                 width = c - '0';
5375                 while (--fmtcnt >= 0) {
5376                     c = *fmt++;
5377                     if (c < '0' || c > '9')
5378                         break;
5379                     if ((width*10) / 10 != width) {
5380                         PyErr_SetString(PyExc_ValueError,
5381                                         "width too big");
5382                         goto onError;
5383                     }
5384                     width = width*10 + (c - '0');
5385                 }
5386             }
5387             if (c == '.') {
5388                 prec = 0;
5389                 if (--fmtcnt >= 0)
5390                     c = *fmt++;
5391                 if (c == '*') {
5392                     v = getnextarg(args, arglen, &argidx);
5393                     if (v == NULL)
5394                         goto onError;
5395                     if (!PyInt_Check(v)) {
5396                         PyErr_SetString(PyExc_TypeError,
5397                                         "* wants int");
5398                         goto onError;
5399                     }
5400                     prec = PyInt_AsLong(v);
5401                     if (prec < 0)
5402                         prec = 0;
5403                     if (--fmtcnt >= 0)
5404                         c = *fmt++;
5405                 }
5406                 else if (c >= '0' && c <= '9') {
5407                     prec = c - '0';
5408                     while (--fmtcnt >= 0) {
5409                         c = Py_CHARMASK(*fmt++);
5410                         if (c < '0' || c > '9')
5411                             break;
5412                         if ((prec*10) / 10 != prec) {
5413                             PyErr_SetString(PyExc_ValueError,
5414                                             "prec too big");
5415                             goto onError;
5416                         }
5417                         prec = prec*10 + (c - '0');
5418                     }
5419                 }
5420             } /* prec */
5421             if (fmtcnt >= 0) {
5422                 if (c == 'h' || c == 'l' || c == 'L') {
5423                     if (--fmtcnt >= 0)
5424                         c = *fmt++;
5425                 }
5426             }
5427             if (fmtcnt < 0) {
5428                 PyErr_SetString(PyExc_ValueError,
5429                                 "incomplete format");
5430                 goto onError;
5431             }
5432             if (c != '%') {
5433                 v = getnextarg(args, arglen, &argidx);
5434                 if (v == NULL)
5435                     goto onError;
5436             }
5437             sign = 0;
5438             fill = ' ';
5439             switch (c) {
5440
5441             case '%':
5442                 pbuf = formatbuf;
5443                 /* presume that buffer length is at least 1 */
5444                 pbuf[0] = '%';
5445                 len = 1;
5446                 break;
5447
5448             case 's':
5449             case 'r':
5450                 if (PyUnicode_Check(v) && c == 's') {
5451                     temp = v;
5452                     Py_INCREF(temp);
5453                 }
5454                 else {
5455                     PyObject *unicode;
5456                     if (c == 's')
5457                         temp = PyObject_Str(v);
5458                     else
5459                         temp = PyObject_Repr(v);
5460                     if (temp == NULL)
5461                         goto onError;
5462                     if (!PyString_Check(temp)) {
5463                         /* XXX Note: this should never happen, since
5464                                PyObject_Repr() and PyObject_Str() assure
5465                                this */
5466                         Py_DECREF(temp);
5467                         PyErr_SetString(PyExc_TypeError,
5468                                         "%s argument has non-string str()");
5469                         goto onError;
5470                     }
5471                     unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
5472                                                    PyString_GET_SIZE(temp),
5473                                                NULL,
5474                                                    "strict");
5475                     Py_DECREF(temp);
5476                     temp = unicode;
5477                     if (temp == NULL)
5478                         goto onError;
5479                 }
5480                 pbuf = PyUnicode_AS_UNICODE(temp);
5481                 len = PyUnicode_GET_SIZE(temp);
5482                 if (prec >= 0 && len > prec)
5483                     len = prec;
5484                 break;
5485
5486             case 'i':
5487             case 'd':
5488             case 'u':
5489             case 'o':
5490             case 'x':
5491             case 'X':
5492                 if (c == 'i')
5493                     c = 'd';
5494                 if (PyLong_Check(v)) {
5495                     temp = formatlong(v, flags, prec, c);
5496                     if (!temp)
5497                         goto onError;
5498                     pbuf = PyUnicode_AS_UNICODE(temp);
5499                     len = PyUnicode_GET_SIZE(temp);
5500                     /* unbounded ints can always produce
5501                        a sign character! */
5502                     sign = 1;
5503                 }
5504                 else {
5505                     pbuf = formatbuf;
5506                     len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5507                                     flags, prec, c, v);
5508                     if (len < 0)
5509                         goto onError;
5510                     /* only d conversion is signed */
5511                     sign = c == 'd';
5512                 }
5513                 if (flags & F_ZERO)
5514                     fill = '0';
5515                 break;
5516
5517             case 'e':
5518             case 'E':
5519             case 'f':
5520             case 'g':
5521             case 'G':
5522                 pbuf = formatbuf;
5523                 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5524                         flags, prec, c, v);
5525                 if (len < 0)
5526                     goto onError;
5527                 sign = 1;
5528                 if (flags & F_ZERO)
5529                     fill = '0';
5530                 break;
5531
5532             case 'c':
5533                 pbuf = formatbuf;
5534                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
5535                 if (len < 0)
5536                     goto onError;
5537                 break;
5538
5539             default:
5540                 PyErr_Format(PyExc_ValueError,
5541                              "unsupported format character '%c' (0x%x) "
5542                              "at index %i",
5543                              (31<=c && c<=126) ? (int)c : '?',
5544                              (int)c, (fmt -1 - PyUnicode_AS_UNICODE(uformat)));
5545                 goto onError;
5546             }
5547             if (sign) {
5548                 if (*pbuf == '-' || *pbuf == '+') {
5549                     sign = *pbuf++;
5550                     len--;
5551                 }
5552                 else if (flags & F_SIGN)
5553                     sign = '+';
5554                 else if (flags & F_BLANK)
5555                     sign = ' ';
5556                 else
5557                     sign = 0;
5558             }
5559             if (width < len)
5560                 width = len;
5561             if (rescnt < width + (sign != 0)) {
5562                 reslen -= rescnt;
5563                 rescnt = width + fmtcnt + 100;
5564                 reslen += rescnt;
5565                 if (_PyUnicode_Resize(&result, reslen) < 0)
5566                     return NULL;
5567                 res = PyUnicode_AS_UNICODE(result)
5568                     + reslen - rescnt;
5569             }
5570             if (sign) {
5571                 if (fill != ' ')
5572                     *res++ = sign;
5573                 rescnt--;
5574                 if (width > len)
5575                     width--;
5576             }
5577             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5578                 assert(pbuf[0] == '0');
5579                 assert(pbuf[1] == c);
5580                 if (fill != ' ') {
5581                     *res++ = *pbuf++;
5582                     *res++ = *pbuf++;
5583                 }
5584                 rescnt -= 2;
5585                 width -= 2;
5586                 if (width < 0)
5587                     width = 0;
5588                 len -= 2;
5589             }
5590             if (width > len && !(flags & F_LJUST)) {
5591                 do {
5592                     --rescnt;
5593                     *res++ = fill;
5594                 } while (--width > len);
5595             }
5596             if (fill == ' ') {
5597                 if (sign)
5598                     *res++ = sign;
5599                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5600                     assert(pbuf[0] == '0');
5601                     assert(pbuf[1] == c);
5602                     *res++ = *pbuf++;
5603                     *res++ = *pbuf++;
5604                 }
5605             }
5606             Py_UNICODE_COPY(res, pbuf, len);
5607             res += len;
5608             rescnt -= len;
5609             while (--width >= len) {
5610                 --rescnt;
5611                 *res++ = ' ';
5612             }
5613             if (dict && (argidx < arglen) && c != '%') {
5614                 PyErr_SetString(PyExc_TypeError,
5615                                 "not all arguments converted");
5616                 goto onError;
5617             }
5618             Py_XDECREF(temp);
5619         } /* '%' */
5620     } /* until end */
5621     if (argidx < arglen && !dict) {
5622         PyErr_SetString(PyExc_TypeError,
5623                         "not all arguments converted");
5624         goto onError;
5625     }
5626
5627     if (args_owned) {
5628         Py_DECREF(args);
5629     }
5630     Py_DECREF(uformat);
5631     if (_PyUnicode_Resize(&result, reslen - rescnt))
5632         goto onError;
5633     return (PyObject *)result;
5634
5635  onError:
5636     Py_XDECREF(result);
5637     Py_DECREF(uformat);
5638     if (args_owned) {
5639         Py_DECREF(args);
5640     }
5641     return NULL;
5642 }
5643
5644 static PyBufferProcs unicode_as_buffer = {
5645     (getreadbufferproc) unicode_buffer_getreadbuf,
5646     (getwritebufferproc) unicode_buffer_getwritebuf,
5647     (getsegcountproc) unicode_buffer_getsegcount,
5648     (getcharbufferproc) unicode_buffer_getcharbuf,
5649 };
5650
5651 staticforward PyObject *
5652 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5653
5654 static PyObject *
5655 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5656 {
5657         PyObject *x = NULL;
5658         static char *kwlist[] = {"string", "encoding", "errors", 0};
5659         char *encoding = NULL;
5660         char *errors = NULL;
5661
5662         if (type != &PyUnicode_Type)
5663                 return unicode_subtype_new(type, args, kwds);
5664         if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5665                                           kwlist, &x, &encoding, &errors))
5666             return NULL;
5667         if (x == NULL)
5668                 return (PyObject *)_PyUnicode_New(0);
5669         if (encoding == NULL && errors == NULL)
5670             return PyObject_Unicode(x);
5671         else
5672         return PyUnicode_FromEncodedObject(x, encoding, errors);
5673 }
5674
5675 static PyObject *
5676 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5677 {
5678         PyUnicodeObject *tmp, *pnew;
5679         int n;
5680
5681         assert(PyType_IsSubtype(type, &PyUnicode_Type));
5682         tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5683         if (tmp == NULL)
5684                 return NULL;
5685         assert(PyUnicode_Check(tmp));
5686         pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5687         if (pnew == NULL)
5688                 return NULL;
5689         pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5690         if (pnew->str == NULL) {
5691                 _Py_ForgetReference((PyObject *)pnew);
5692                 PyObject_DEL(pnew);
5693                 return NULL;
5694         }
5695         Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5696         pnew->length = n;
5697         pnew->hash = tmp->hash;
5698         Py_DECREF(tmp);
5699         return (PyObject *)pnew;
5700 }
5701
/* Docstring of the unicode type (tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5708
5709 PyTypeObject PyUnicode_Type = {
5710     PyObject_HEAD_INIT(&PyType_Type)
5711     0,                                  /* ob_size */
5712     "unicode",                          /* tp_name */
5713     sizeof(PyUnicodeObject),            /* tp_size */
5714     0,                                  /* tp_itemsize */
5715     /* Slots */
5716     (destructor)unicode_dealloc,        /* tp_dealloc */
5717     0,                                  /* tp_print */
5718     0,                                  /* tp_getattr */
5719     0,                                  /* tp_setattr */
5720     (cmpfunc) unicode_compare,          /* tp_compare */
5721     (reprfunc) unicode_repr,            /* tp_repr */
5722     0,                                  /* tp_as_number */
5723     &unicode_as_sequence,               /* tp_as_sequence */
5724     0,                                  /* tp_as_mapping */
5725     (hashfunc) unicode_hash,            /* tp_hash*/
5726     0,                                  /* tp_call*/
5727     (reprfunc) unicode_str,             /* tp_str */
5728     PyObject_GenericGetAttr,            /* tp_getattro */
5729     0,                                  /* tp_setattro */
5730     &unicode_as_buffer,                 /* tp_as_buffer */
5731     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
5732     unicode_doc,                        /* tp_doc */
5733     0,                                  /* tp_traverse */
5734     0,                                  /* tp_clear */
5735     0,                                  /* tp_richcompare */
5736     0,                                  /* tp_weaklistoffset */
5737     0,                                  /* tp_iter */
5738     0,                                  /* tp_iternext */
5739     unicode_methods,                    /* tp_methods */
5740     0,                                  /* tp_members */
5741     0,                                  /* tp_getset */
5742     0,                                  /* tp_base */
5743     0,                                  /* tp_dict */
5744     0,                                  /* tp_descr_get */
5745     0,                                  /* tp_descr_set */
5746     0,                                  /* tp_dictoffset */
5747     0,                                  /* tp_init */
5748     0,                                  /* tp_alloc */
5749     unicode_new,                        /* tp_new */
5750     _PyObject_Del,                      /* tp_free */
5751 };
5752
5753 /* Initialize the Unicode implementation */
5754
5755 void _PyUnicode_Init(void)
5756 {
5757     int i;
5758
5759     /* Init the implementation */
5760     unicode_freelist = NULL;
5761     unicode_freelist_size = 0;
5762     unicode_empty = _PyUnicode_New(0);
5763     strcpy(unicode_default_encoding, "ascii");
5764     for (i = 0; i < 256; i++)
5765         unicode_latin1[i] = NULL;
5766 }
5767
5768 /* Finalize the Unicode implementation */
5769
5770 void
5771 _PyUnicode_Fini(void)
5772 {
5773     PyUnicodeObject *u;
5774     int i;
5775
5776     Py_XDECREF(unicode_empty);
5777     unicode_empty = NULL;
5778
5779     for (i = 0; i < 256; i++) {
5780         if (unicode_latin1[i]) {
5781             Py_DECREF(unicode_latin1[i]);
5782             unicode_latin1[i] = NULL;
5783         }
5784     }
5785
5786     for (u = unicode_freelist; u != NULL;) {
5787         PyUnicodeObject *v = u;
5788         u = *(PyUnicodeObject **)u;
5789         if (v->str)
5790             PyMem_DEL(v->str);
5791         Py_XDECREF(v->defenc);
5792         PyObject_DEL(v);
5793     }
5794     unicode_freelist = NULL;
5795     unicode_freelist_size = 0;
5796 }