Include/unicodeobject.h

   1 #ifndef Py_UNICODEOBJECT_H
   2 #define Py_UNICODEOBJECT_H
   3
   4 /*
   5
   6 Unicode implementation based on original code by Fredrik Lundh,
   7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
   8 Unicode Integration Proposal (see file Misc/unicode.txt).
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12
  13  Original header:
  14  --------------------------------------------------------------------
  15
  16  * Yet another Unicode string type for Python.  This type supports the
  17  * 16-bit Basic Multilingual Plane (BMP) only.
  18  *
  19  * Written by Fredrik Lundh, January 1999.
  20  *
  21  * Copyright (c) 1999 by Secret Labs AB.
  22  * Copyright (c) 1999 by Fredrik Lundh.
  23  *
  24  * fredrik@pythonware.com
  25  * http://www.pythonware.com
  26  *
  27  * --------------------------------------------------------------------
  28  * This Unicode String Type is
  29  *
  30  * Copyright (c) 1999 by Secret Labs AB
  31  * Copyright (c) 1999 by Fredrik Lundh
  32  *
  33  * By obtaining, using, and/or copying this software and/or its
  34  * associated documentation, you agree that you have read, understood,
  35  * and will comply with the following terms and conditions:
  36  *
  37  * Permission to use, copy, modify, and distribute this software and its
  38  * associated documentation for any purpose and without fee is hereby
  39  * granted, provided that the above copyright notice appears in all
  40  * copies, and that both that copyright notice and this permission notice
  41  * appear in supporting documentation, and that the name of Secret Labs
  42  * AB or the author not be used in advertising or publicity pertaining to
  43  * distribution of the software without specific, written prior
  44  * permission.
  45  *
  46  * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  47  * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  48  * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  49  * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  50  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  51  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  52  * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  53  * -------------------------------------------------------------------- */
  54
  55 #include "ctype.h"
  56
  57 /* === Internal API ======================================================= */
  58
  59 /* --- Internal Unicode Format -------------------------------------------- */
  60
  61 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
  62    wchar_t type is a 16-bit unsigned type.  Both are shipped commented
        out here; HAVE_USABLE_WCHAR_T may still get defined by the
        platform default below. */
  63 /* #define HAVE_WCHAR_H */
  64 /* #define HAVE_USABLE_WCHAR_T */
  65
  66 /* Defaults for various platforms; skipped entirely when
        HAVE_USABLE_WCHAR_T has already been defined. */
  67 #ifndef HAVE_USABLE_WCHAR_T
  68
  69 /* Windows has a usable wchar_t type */
  70 # if defined(MS_WIN32)
  71 #  define HAVE_USABLE_WCHAR_T
  72 # endif
  73
  74 #endif
  75
  76 /* If the compiler provides a wchar_t type we try to support it
  77    through the interface functions PyUnicode_FromWideChar() and
  78    PyUnicode_AsWideChar(). */
  79 /* A usable wchar_t implies that "wchar.h" can be included. */
  80 #ifdef HAVE_USABLE_WCHAR_T
  81 # ifndef HAVE_WCHAR_H
  82 #  define HAVE_WCHAR_H
  83 # endif
  84 #endif
  85
  86 #ifdef HAVE_WCHAR_H
  87 /* Work around a cosmetic bug in BSDI 4.x wchar.h by including
        <time.h> first; thanks to Thomas Wouters */
  88 # ifdef _HAVE_BSDI
  89 #  include <time.h>
  90 # endif
  91 # include "wchar.h"
  92 #endif
  93 /* Py_UNICODE is the element type of the raw Unicode buffers. */
  94 #ifdef HAVE_USABLE_WCHAR_T
  95
  96 /* If the compiler defines wchar_t as a 16-bit unsigned type we can
  97    use the compiler type directly.  Works fine with all modern Windows
  98    platforms. */
  99
 100 typedef wchar_t Py_UNICODE;
 101
 102 #else
 103
 104 /* Use if you have a standard ANSI compiler, without wchar_t support.
 105    If a short is not 16 bits on your platform, you have to fix the
 106    typedef below, or the module initialization code will complain. */
 107
 108 typedef unsigned short Py_UNICODE;
 109
 110 #endif
 111
 112 /*
 113  * Use this typedef when you need to represent a UTF-16 surrogate pair
 114  * as a single unsigned integer.  unsigned int is used when SIZEOF_INT
      * is at least 4, otherwise unsigned long when SIZEOF_LONG is at
      * least 4.
      *
      * NOTE(review): if neither test succeeds (for instance because the
      * SIZEOF_* macros are not defined when this header is read) no
      * Py_UCS4 typedef is emitted at all -- confirm that the
      * configuration header always precedes this one.
 115  */
 116 #if SIZEOF_INT >= 4
 117 typedef unsigned int Py_UCS4;
 118 #elif SIZEOF_LONG >= 4
 119 typedef unsigned long Py_UCS4;
 120 #endif
 121
 122
 123 /* --- Internal Unicode Operations ---------------------------------------- */
 124
 125 /* If you want Python to use the compiler's wctype.h functions instead
 126    of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
 127    configure Python using --with-ctype-functions.  This reduces the
 128    interpreter's code size.  The wctype.h variant is only selected
        when HAVE_USABLE_WCHAR_T is defined as well. */
 129
 130 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 131 /* wctype.h variant: whitespace, lower/upper case and alpha handling
        come from the C library; titlecase, linebreak and the
        decimal/digit/numeric macros still use the _PyUnicode_* helpers. */
 132 #include "wctype.h"
 133
 134 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
 135
 136 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
 137 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
 138 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 139 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 140
 141 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
 142 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
 143 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 144
 145 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 146 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 147 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 148
 149 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 150 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 151 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 152
 153 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
 154
 155 #else
 156 /* Default variant: every macro maps onto a _PyUnicode_* helper. */
 157 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
 158
 159 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
 160 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
 161 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
 162 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
 163
 164 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
 165 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
 166 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
 167
 168 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
 169 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
 170 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
 171
 172 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
 173 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
 174 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
 175
 176 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
 177
 178 #endif
 179 /* True if ch is alphabetic, decimal, digit or numeric.  ch appears
        four times in the expansion, so do not pass an expression with
        side effects. */
 180 #define Py_UNICODE_ISALNUM(ch) \
 181        (Py_UNICODE_ISALPHA(ch) || \
 182         Py_UNICODE_ISDECIMAL(ch) || \
 183         Py_UNICODE_ISDIGIT(ch) || \
 184         Py_UNICODE_ISNUMERIC(ch))
 185 /* Copy length Py_UNICODE elements (not bytes) from source to target
        with memcpy(); as with memcpy() itself the two regions must not
        overlap.  The expression yields memcpy()'s return value. */
 186 #define Py_UNICODE_COPY(target, source, length)\
 187     (memcpy((target), (source), (length)*sizeof(Py_UNICODE)))
 188
 189 #define Py_UNICODE_FILL(target, value, length) do\
 190     {int i; for (i = 0; i < (length); i++) (target)[i] = (value);}\
 191     while (0)
 192 /* Non-zero if substring occurs in string starting at index offset.
        Both arguments are accessed through ->str and ->length.  The
        first element is compared directly before memcmp() runs over
        substring->length elements, so substring->str[0] is read even
        when that length is 0.  No bounds checking is done here, and the
        arguments are expanded more than once. */
 193 #define Py_UNICODE_MATCH(string, offset, substring)\
 194     ((*((string)->str + (offset)) == *((substring)->str)) &&\
 195      !memcmp((string)->str + (offset), (substring)->str,\
 196              (substring)->length*sizeof(Py_UNICODE)))
 197 /* Give the declarations that follow C linkage when compiled as C++. */
 198 #ifdef __cplusplus
 199 extern "C" {
 200 #endif
 201
 202 /* --- Unicode Type ------------------------------------------------------- */
 203 /* In-memory layout of a Unicode string object. */
 204 typedef struct {
 205     PyObject_HEAD
 206     int length;                 /* Length of raw Unicode data in buffer,
                                        counted in Py_UNICODE elements */
 207     Py_UNICODE *str;            /* Raw Unicode buffer */
 208     long hash;                  /* Hash value; -1 if not set */
 209     PyObject *defenc;           /* (Default) Encoded version as Python
 210                                    string, or NULL; this is used for
 211                                    implementing the buffer protocol */
 212 } PyUnicodeObject;
 213 /* Type object for Unicode strings. */
 214 extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
 215 /* Exact type test: true only if op's ob_type is PyUnicode_Type itself. */
 216 #define PyUnicode_Check(op) (((op)->ob_type == &PyUnicode_Type))
 217
 218 /* Fast access macros.  Each one casts op to PyUnicodeObject * without
        any type check, so op must already be known to be a Unicode
        object.
          PyUnicode_GET_SIZE       length in Py_UNICODE elements
          PyUnicode_GET_DATA_SIZE  size of the buffer contents in bytes
          PyUnicode_AS_UNICODE     the buffer as Py_UNICODE *
          PyUnicode_AS_DATA        the same buffer as const char * */
 219 #define PyUnicode_GET_SIZE(op) \
 220         (((PyUnicodeObject *)(op))->length)
 221 #define PyUnicode_GET_DATA_SIZE(op) \
 222         (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
 223 #define PyUnicode_AS_UNICODE(op) \
 224         (((PyUnicodeObject *)(op))->str)
 225 #define PyUnicode_AS_DATA(op) \
 226         ((const char *)((PyUnicodeObject *)(op))->str)
 227
 228 /* --- Constants ---------------------------------------------------------- */
 229
 230 /* This Unicode character will be used as the replacement character
 231    during decoding if the errors argument is set to "replace". Note:
 232    the Unicode character U+FFFD is the official REPLACEMENT CHARACTER
 233    in Unicode 3.0.  The constant is already cast to Py_UNICODE. */
 234
 235 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
 236
 237 /* === Public API ========================================================= */
 238
 239 /* --- Plain Py_UNICODE --------------------------------------------------- */
 240
 241 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
 242    size. u may be NULL, in which case the contents of the new object
 243    are undefined and it is the caller's responsibility to fill in the
 244    needed data.
 245    Otherwise the buffer is copied into the new object. */
 246
 247 extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
 248     const Py_UNICODE *u,        /* Unicode buffer */
 249     int size                    /* size of buffer */
 250     );
 251
 252 /* Return a read-only pointer to the Unicode object's internal
 253    Py_UNICODE buffer.  The declared return type is not
        const-qualified, so "read-only" is a convention the compiler
        does not enforce. */
 254
 255 extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
 256     PyObject *unicode           /* Unicode object */
 257     );
 258
 259 /* Get the length of the Unicode object.
        NOTE(review): presumably counted in Py_UNICODE elements, like
        PyUnicode_GET_SIZE(); the error return for a non-Unicode argument
        is not documented here -- confirm against the implementation. */
 260
 261 extern DL_IMPORT(int) PyUnicode_GetSize(
 262     PyObject *unicode           /* Unicode object */
 263     );
 264
 265 /* Resize an already allocated Unicode object to the new length.
 266
 267    On success *unicode is modified to point to the new (resized)
 268    object and 0 is returned.
 269
 270    This API may only be called by the function which also called the
 271    Unicode constructor. The refcount on the object must be 1. Otherwise,
 272    an error is returned.
 273
 274    Error handling is implemented as follows: an exception is set, -1
 275    is returned and *unicode is left untouched.
 276
 277 */
 278
 279 extern DL_IMPORT(int) PyUnicode_Resize(
 280     PyObject **unicode,         /* Pointer to the Unicode object */
 281     int length                  /* New length */
 282     );
 283
 284 /* Coerce obj to a Unicode object and return a reference with
 285    *incremented* refcount.
 286
 287    Coercion is done in the following way:
 288
 289    1. Unicode objects are passed back as-is with incremented
 290       refcount.
 291
 292    2. String and other char buffer compatible objects are decoded
 293       using the encoding and errors arguments.  Passing NULL as
 294       encoding selects the current default encoding.
 295
 296    3. All other objects raise an exception.
 297
 298    The API returns NULL in case of an error. The caller is responsible
 299    for decref'ing the returned objects.
 300
 301 */
 302
 303 extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
 304     register PyObject *obj,     /* Object */
 305     const char *encoding,       /* encoding */
 306     const char *errors          /* error handling */
 307     );
 308
 309 /* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
 310    which results in using the default encoding as basis for
 311    decoding the object.
 312
 313    Coerces obj to a Unicode object and returns a reference with
 314    *incremented* refcount.
 315
 316    The API returns NULL in case of an error. The caller is responsible
 317    for decref'ing the returned objects.
 318
 319 */
 320
 321 extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
 322     register PyObject *obj      /* Object */
 323     );
 324
 325 /* --- wchar_t support for platforms which support it --------------------- */
 326 /* Both functions are only declared when HAVE_WCHAR_H is defined. */
 327 #ifdef HAVE_WCHAR_H
 328
 329 /* Create a Unicode Object from the wchar_t buffer w of the given
 330    size.
 331
 332    The buffer is copied into the new object. */
 333
 334 extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
 335     register const wchar_t *w,  /* wchar_t buffer */
 336     int size                    /* size of buffer */
 337     );
 338
 339 /* Copies the Unicode Object contents into the wchar_t buffer w.  At
 340    most size wchar_t characters are copied.
 341
 342    Returns the number of wchar_t characters copied or -1 in case of an
 343    error. */
 344
 345 extern DL_IMPORT(int) PyUnicode_AsWideChar(
 346     PyUnicodeObject *unicode,   /* Unicode object */
 347     register wchar_t *w,        /* wchar_t buffer */
 348     int size                    /* size of buffer */
 349     );
 350
 351 #endif
 352
 353 /* === Builtin Codecs =====================================================
 354
 355    Many of these APIs take two arguments encoding and errors. These
 356    parameters encoding and errors have the same semantics as the ones
 357    of the builtin unicode() API.
 358
 359    Setting encoding to NULL causes the default encoding to be used.
 360
 361    Error handling is set by errors which may also be set to NULL
 362    meaning to use the default handling defined for the codec. Default
 363    error handling for all builtin codecs is "strict" (ValueErrors are
 364    raised).
 365
 366    The codecs all use a similar interface. Only deviations from the
 367    generic interface are documented.
 368
 369 */
 370
 371 /* --- Manage the default encoding ---------------------------------------- */
 372
 373 /* Returns the currently active default encoding.
 374
 375    The default encoding is currently implemented as a run-time
 376    settable process global.  This may change in future versions of the
 377    interpreter to become a parameter which is managed on a per-thread
 378    basis.
 379
        The result is a const string; callers must not modify it.
 380  */
 381
 382 extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
 383
 384 /* Sets the currently active default encoding to the given encoding
 385    name.
 386    Returns 0 on success, -1 in case of an error.
 387
 388  */
 389
 390 extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
 391     const char *encoding        /* Encoding name in standard form */
 392     );
 393
 394 /* --- Generic Codecs ----------------------------------------------------- */
 395
 396 /* Create a Unicode object by decoding the encoded string s of the
 397    given size, using the named encoding and error handling. */
 398
 399 extern DL_IMPORT(PyObject*) PyUnicode_Decode(
 400     const char *s,              /* encoded string */
 401     int size,                   /* size of buffer */
 402     const char *encoding,       /* encoding */
 403     const char *errors          /* error handling */
 404     );
 405
 406 /* Encodes a Py_UNICODE buffer of the given size (counted in
 407    Py_UNICODE characters) and returns a Python string object. */
 408
 409 extern DL_IMPORT(PyObject*) PyUnicode_Encode(
 410     const Py_UNICODE *s,        /* Unicode char buffer */
 411     int size,                   /* number of Py_UNICODE chars to encode */
 412     const char *encoding,       /* encoding */
 413     const char *errors          /* error handling */
 414     );
 415
 416 /* Encodes a Unicode object and returns the result as a Python string
 417    object. */
 418
 419 extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
 420     PyObject *unicode,          /* Unicode object */
 421     const char *encoding,       /* encoding */
 422     const char *errors          /* error handling */
 423     );
 424
 425 /* --- UTF-8 Codecs ------------------------------------------------------- */
 426 /* Decode length bytes of UTF-8 data into a Unicode object. */
 427 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
 428     const char *string,         /* UTF-8 encoded string */
 429     int length,                 /* size of string */
 430     const char *errors          /* error handling */
 431     );
 432 /* Encode a Unicode object as UTF-8; returns a Python string object. */
 433 extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
 434     PyObject *unicode           /* Unicode object */
 435     );
 436 /* Encode a Py_UNICODE buffer as UTF-8; returns a Python string object. */
 437 extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
 438     const Py_UNICODE *data,     /* Unicode char buffer */
 439     int length,                 /* number of Py_UNICODE chars to encode */
 440     const char *errors          /* error handling */
 441     );
 442
 443 /* --- UTF-16 Codecs ------------------------------------------------------ */
 444
 445 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
 446    the corresponding Unicode object.
 447
 448    errors (if non-NULL) defines the error handling. It defaults
 449    to "strict".
 450
 451    If byteorder is non-NULL, the decoder starts decoding using the
 452    given byte order:
 453
 454         *byteorder == -1: little endian
 455         *byteorder == 0:  native order
 456         *byteorder == 1:  big endian
 457
 458    and then switches according to all BOM marks it finds in the input
 459    data. BOM marks are not copied into the resulting Unicode string.
 460    After completion, *byteorder is set to the current byte order at
 461    the end of input data, so it is an in/out parameter.
 462
 463    If byteorder is NULL, the codec starts in native order mode.
 464
 465 */
 466
 467 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
 468     const char *string,         /* UTF-16 encoded string */
 469     int length,                 /* size of string in bytes */
 470     const char *errors,         /* error handling */
 471     int *byteorder              /* pointer to byteorder to use
 472                                    0=native;-1=LE,1=BE; updated on
 473                                    exit */
 474     );
 475
 476 /* Returns a Python string holding the UTF-16 encoding of the Unicode
 477    object in native byte order. The string always starts with a BOM
        mark. */
 478
 479 extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
 480     PyObject *unicode           /* Unicode object */
 481     );
 482
 483 /* Returns a Python string object holding the UTF-16 encoded value of
 484    the Unicode data.
 485
 486    The output byte order is selected by byteorder:
 487
 488
 489    byteorder == -1: little endian
 490    byteorder == 0:  native byte order (writes a BOM mark)
 491    byteorder == 1:  big endian
 492
 493    If byteorder is 0, the output string will always start with the
 494    Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
 495    prepended.
 496
 497    Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
 498    UCS-2. This trick makes it possible to add full UTF-16 capabilities
 499    at a later point without compromising the APIs.
 500
 501 */
 502
 503 extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
 504     const Py_UNICODE *data,     /* Unicode char buffer */
 505     int length,                 /* number of Py_UNICODE chars to encode */
 506     const char *errors,         /* error handling */
 507     int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
 508     );
 509
 510 /* --- Unicode-Escape Codecs ---------------------------------------------- */
 511 /* Decode length bytes of Unicode-Escape data into a Unicode object. */
 512 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
 513     const char *string,         /* Unicode-Escape encoded string */
 514     int length,                 /* size of string */
 515     const char *errors          /* error handling */
 516     );
 517 /* Encode a Unicode object; note there is no errors argument. */
 518 extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
 519     PyObject *unicode           /* Unicode object */
 520     );
 521 /* Encode a Py_UNICODE buffer; note there is no errors argument. */
 522 extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
 523     const Py_UNICODE *data,     /* Unicode char buffer */
 524     int length                  /* Number of Py_UNICODE chars to encode */
 525     );
 526
 527 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
 528 /* Decode length bytes of Raw-Unicode-Escape data into a Unicode object. */
 529 extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
 530     const char *string,         /* Raw-Unicode-Escape encoded string */
 531     int length,                 /* size of string */
 532     const char *errors          /* error handling */
 533     );
 534 /* Encode a Unicode object; note there is no errors argument. */
 535 extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
 536     PyObject *unicode           /* Unicode object */
 537     );
 538 /* Encode a Py_UNICODE buffer; note there is no errors argument. */
 539 extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
 540     const Py_UNICODE *data,     /* Unicode char buffer */
 541     int length                  /* Number of Py_UNICODE chars to encode */
 542     );
 543
 544 /* --- Latin-1 Codecs -----------------------------------------------------
 545
 546    Note: Latin-1 corresponds to the first 256 Unicode ordinals.
 547
 548 */
 549 /* Decode length bytes of Latin-1 data into a Unicode object. */
 550 extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
 551     const char *string,         /* Latin-1 encoded string */
 552     int length,                 /* size of string */
 553     const char *errors          /* error handling */
 554     );
 555 /* Encode a Unicode object as Latin-1; returns a Python string object. */
 556 extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
 557     PyObject *unicode           /* Unicode object */
 558     );
 559 /* Encode a Py_UNICODE buffer as Latin-1; returns a Python string object. */
 560 extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
 561     const Py_UNICODE *data,     /* Unicode char buffer */
 562     int length,                 /* Number of Py_UNICODE chars to encode */
 563     const char *errors          /* error handling */
 564     );
 565
 566 /* --- ASCII Codecs -------------------------------------------------------
 567
 568    Only 7-bit ASCII data is accepted. All other codes generate errors.
 569
 570 */
 571 /* Decode length bytes of ASCII data into a Unicode object. */
 572 extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
 573     const char *string,         /* ASCII encoded string */
 574     int length,                 /* size of string */
 575     const char *errors          /* error handling */
 576     );
 577 /* Encode a Unicode object as ASCII; returns a Python string object. */
 578 extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
 579     PyObject *unicode           /* Unicode object */
 580     );
 581 /* Encode a Py_UNICODE buffer as ASCII; returns a Python string object. */
 582 extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
 583     const Py_UNICODE *data,     /* Unicode char buffer */
 584     int length,                 /* Number of Py_UNICODE chars to encode */
 585     const char *errors          /* error handling */
 586     );
 587
 588 /* --- Character Map Codecs -----------------------------------------------
 589
 590    This codec uses mappings to encode and decode characters.
 591
 592    Decoding mappings must map single string characters to single
 593    Unicode characters, integers (which are then interpreted as Unicode
 594    ordinals) or None (meaning "undefined mapping" and causing an
 595    error).
 596
 597    Encoding mappings must map single Unicode characters to single
 598    string characters, integers (which are then interpreted as Latin-1
 599    ordinals) or None (meaning "undefined mapping" and causing an
 600    error).
 601
 602    If a character lookup fails with a LookupError, the character is
 603    copied as-is, meaning that its ordinal value will be interpreted as
 604    a Unicode or Latin-1 ordinal respectively. Because of this,
 605    mappings only need to contain those entries which map characters
 606    to different code points.
 607
 608 */
 609 /* Decode length bytes of string through mapping into a Unicode object. */
 610 extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
 611     const char *string,         /* Encoded string */
 612     int length,                 /* size of string */
 613     PyObject *mapping,          /* character mapping
 614                                    (char ordinal -> unicode ordinal) */
 615     const char *errors          /* error handling */
 616     );
 617 /* Encode a Unicode object through mapping; there is no errors argument. */
 618 extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
 619     PyObject *unicode,          /* Unicode object */
 620     PyObject *mapping           /* character mapping
 621                                    (unicode ordinal -> char ordinal) */
 622     );
 623 /* Encode a Py_UNICODE buffer through mapping into a Python string. */
 624 extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
 625     const Py_UNICODE *data,     /* Unicode char buffer */
 626     int length,                 /* Number of Py_UNICODE chars to encode */
 627     PyObject *mapping,          /* character mapping
 628                                    (unicode ordinal -> char ordinal) */
 629     const char *errors          /* error handling */
 630     );
 631
 632 /* Translate a Py_UNICODE buffer of the given length by applying a
 633    character mapping table to it and return the resulting Unicode
 634    object.
 635
 636    The mapping table must map Unicode ordinal integers to Unicode
 637    ordinal integers or None (causing deletion of the character).
 638
 639    Mapping tables may be dictionaries or sequences. Unmapped character
 640    ordinals (ones which cause a LookupError) are left untouched and
 641    are copied as-is.
 642
 643 */
 644
 645 extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
 646     const Py_UNICODE *data,     /* Unicode char buffer */
 647     int length,                 /* Number of Py_UNICODE chars to translate */
 648     PyObject *table,            /* Translate table */
 649     const char *errors          /* error handling */
 650     );
 651
 652 #ifdef MS_WIN32
 653
 654 /* --- MBCS codecs for Windows -------------------------------------------- */
 655 /* Only declared when MS_WIN32 is defined.  Decode length bytes of MBCS. */
 656 extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
 657     const char *string,         /* MBCS encoded string */
 658     int length,                 /* size of string */
 659     const char *errors          /* error handling */
 660     );
 661 /* Encode a Unicode object as MBCS; there is no errors argument. */
 662 extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
 663     PyObject *unicode           /* Unicode object */
 664     );
 665 /* Encode a Py_UNICODE buffer as MBCS; returns a Python string object. */
 666 extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
 667     const Py_UNICODE *data,     /* Unicode char buffer */
 668     int length,                 /* Number of Py_UNICODE chars to encode */
 669     const char *errors          /* error handling */
 670     );
 671
 672 #endif /* MS_WIN32 */
 673
 674 /* --- Decimal Encoder ---------------------------------------------------- */
 675
 676 /* Takes a Unicode string holding a decimal value and writes it into
 677    an output buffer using standard ASCII digit codes.
 678
 679    The output buffer has to provide at least length+1 bytes of storage
 680    area. The output string is 0-terminated.
 681
 682    The encoder converts whitespace to ' ', decimal characters to their
 683    corresponding ASCII digit and all other Latin-1 characters except
 684    \0 as-is (Unicode ordinals 1-255). Characters outside this range
 685    are treated as errors. This includes embedded NUL characters.
 686
 687    Error handling is defined by the errors argument:
 688
 689       NULL or "strict": raise a ValueError
 690       "ignore": ignore the wrong characters (these are not copied to the
 691                 output buffer)
 692       "replace": replaces illegal characters with '?'
 693
 694    Returns 0 on success, -1 on failure.
 695
 696 */
 697
 698 extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
 699     Py_UNICODE *s,              /* Unicode buffer */
 700     int length,                 /* Number of Py_UNICODE chars to encode */
 701     char *output,               /* Output buffer; must have size >= length+1 */
 702     const char *errors          /* error handling */
 703     );
 704
 705 /* --- Methods & Slots ----------------------------------------------------
 706
 707    These are capable of handling Unicode objects and strings on input
 708    (we refer to them as strings in the descriptions) and return
 709    Unicode objects or integers as appropriate. */
 710
 711 /* Concatenate two strings (left, then right) giving a new Unicode
        string. */
 712
 713 extern DL_IMPORT(PyObject*) PyUnicode_Concat(
 714     PyObject *left,             /* Left string */
 715     PyObject *right             /* Right string */
 716     );
 717
 718 /* Split a string giving a list of Unicode strings.
 719
 720    If sep is NULL, splitting will be done at all whitespace
 721    substrings. Otherwise, splits occur at the given separator.
 722
 723    At most maxsplit splits will be done. If maxsplit is negative, no
 724    limit is set.
 725    Separators are not included in the resulting list.
 726
 727 */
 728
 729 extern DL_IMPORT(PyObject*) PyUnicode_Split(
 730     PyObject *s,                /* String to split */
 731     PyObject *sep,              /* String separator */
 732     int maxsplit                /* Maxsplit count */
 733     );
 734
 735 /* Split a string at line breaks, giving a list of Unicode strings.
 736
 737    CRLF is considered to be one line break. Line breaks are not
 738    included in the resulting list unless keepends is true. */
 739
 740 extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
 741     PyObject *s,                /* String to split */
 742     int keepends                /* If true, line end markers are included */
 743     );
 744
 745 /* Translate a string by applying a character mapping table to it and
 746    return the resulting Unicode object.  Unlike
        PyUnicode_TranslateCharmap() this takes a string object rather
        than a raw Py_UNICODE buffer.
 747
 748    The mapping table must map Unicode ordinal integers to Unicode
 749    ordinal integers or None (causing deletion of the character).
 750
 751    Mapping tables may be dictionaries or sequences. Unmapped character
 752    ordinals (ones which cause a LookupError) are left untouched and
 753    are copied as-is.
 754
 755 */
 756
 757 extern DL_IMPORT(PyObject *) PyUnicode_Translate(
 758     PyObject *str,              /* String */
 759     PyObject *table,            /* Translate table */
 760     const char *errors          /* error handling */
 761     );
 762
 763 /* Join the strings of the sequence seq, using separator as the
 764    separator, and return the resulting Unicode string. */
 765
 766 extern DL_IMPORT(PyObject*) PyUnicode_Join(
 767     PyObject *separator,        /* Separator string */
 768     PyObject *seq               /* Sequence object */
 769     );
 770
 771 /* Return 1 if substr matches str[start:end] at the given tail end, 0
 772    otherwise.  direction selects the end to test: -1 checks for a
        prefix, +1 for a suffix.
        NOTE(review): how an error (e.g. a failed coercion of an
        argument) is reported is not documented here -- confirm. */
 773
 774 extern DL_IMPORT(int) PyUnicode_Tailmatch(
 775     PyObject *str,              /* String */
 776     PyObject *substr,           /* Prefix or Suffix string */
 777     int start,                  /* Start index */
 778     int end,                    /* Stop index */
 779     int direction               /* Tail end: -1 prefix, +1 suffix */
 780     );
 781
 782 /* Return the first position of substr in str[start:end] using the
 783    given search direction (+1 forward, -1 backward), or -1 if not
        found.
        NOTE(review): the error return is not documented here and -1
        already means "not found" -- confirm how callers are meant to
        tell the two apart. */
 784
 785 extern DL_IMPORT(int) PyUnicode_Find(
 786     PyObject *str,              /* String */
 787     PyObject *substr,           /* Substring to find */
 788     int start,                  /* Start index */
 789     int end,                    /* Stop index */
 790     int direction               /* Find direction: +1 forward, -1 backward */
 791     );
 792
 793 /* Count the number of occurrences of substr in str[start:end].
        NOTE(review): the error return value is not documented here --
        confirm against the implementation. */
 794
 795 extern DL_IMPORT(int) PyUnicode_Count(
 796     PyObject *str,              /* String */
 797     PyObject *substr,           /* Substring to count */
 798     int start,                  /* Start index */
 799     int end                     /* Stop index */
 800     );
 801
 802 /* Replace at most maxcount occurrences of substr in str with replstr
 803    and return the resulting Unicode object. */
 804
 805 extern DL_IMPORT(PyObject *) PyUnicode_Replace(
 806     PyObject *str,              /* String */
 807     PyObject *substr,           /* Substring to find */
 808     PyObject *replstr,          /* Replacement for each occurrence */
 809     int maxcount                /* Max. number of replacements to apply;
 810                                    -1 = all */
 811     );
 812
 813 /* Compare two strings and return -1, 0, 1 for less than, equal,
 814    greater than respectively.
        NOTE(review): all three int results are valid comparison
        outcomes, so error reporting is not visible from this
        declaration -- confirm. */
 815
 816 extern DL_IMPORT(int) PyUnicode_Compare(
 817     PyObject *left,             /* Left string */
 818     PyObject *right             /* Right string */
 819     );
 820
 821 /* Apply an argument tuple or dictionary to a format string and return
 822    the resulting Unicode string. */
 823
 824 extern DL_IMPORT(PyObject *) PyUnicode_Format(
 825     PyObject *format,           /* Format string */
 826     PyObject *args              /* Argument tuple or dictionary */
 827     );
 828
 829 /* Checks whether element is contained in container and returns 1/0
 830    accordingly.
 831
 832    element has to coerce to a one-element Unicode string. -1 is
 833    returned in case of an error. */
 834
 835 extern DL_IMPORT(int) PyUnicode_Contains(
 836     PyObject *container,        /* Container string */
 837     PyObject *element           /* Element string */
 838     );
 839
 840 /* === Characters Type APIs =============================================== */
 841
 842 /* These should not be used directly. Use the Py_UNICODE_IS* and
 843    Py_UNICODE_TO* macros instead.
 844
 845    These APIs are implemented in Objects/unicodectype.c.
 846
        Each function takes a single Py_UNICODE character by value.  The
        _PyUnicode_Is* predicates return int; the To{Lower,Upper,Title}case
        conversions return Py_UNICODE; ToDecimalDigit and ToDigit return
        int; ToNumeric returns double.
 847 */
 848
 849 extern DL_IMPORT(int) _PyUnicode_IsLowercase(
 850     register const Py_UNICODE ch        /* Unicode character */
 851     );
 852
 853 extern DL_IMPORT(int) _PyUnicode_IsUppercase(
 854     register const Py_UNICODE ch        /* Unicode character */
 855     );
 856
 857 extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
 858     register const Py_UNICODE ch        /* Unicode character */
 859     );
 860
 861 extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
 862     register const Py_UNICODE ch        /* Unicode character */
 863     );
 864
 865 extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
 866     register const Py_UNICODE ch        /* Unicode character */
 867     );
 868
 869 extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
 870     register const Py_UNICODE ch        /* Unicode character */
 871     );
 872
 873 extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
 874     register const Py_UNICODE ch        /* Unicode character */
 875     );
 876
 877 extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
 878     register const Py_UNICODE ch        /* Unicode character */
 879     );
 880
 881 extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
 882     register const Py_UNICODE ch        /* Unicode character */
 883     );
 884
 885 extern DL_IMPORT(int) _PyUnicode_ToDigit(
 886     register const Py_UNICODE ch        /* Unicode character */
 887     );
 888
 889 extern DL_IMPORT(double) _PyUnicode_ToNumeric(
 890     register const Py_UNICODE ch        /* Unicode character */
 891     );
 892
 893 extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
 894     register const Py_UNICODE ch        /* Unicode character */
 895     );
 896
 897 extern DL_IMPORT(int) _PyUnicode_IsDigit(
 898     register const Py_UNICODE ch        /* Unicode character */
 899     );
 900
 901 extern DL_IMPORT(int) _PyUnicode_IsNumeric(
 902     register const Py_UNICODE ch        /* Unicode character */
 903     );
 904
 905 extern DL_IMPORT(int) _PyUnicode_IsAlpha(
 906     register const Py_UNICODE ch        /* Unicode character */
 907     );
 908 /* Close the extern "C" block opened earlier for C++ compilers. */
 909 #ifdef __cplusplus
 910 }
 911 #endif
 912 #endif /* !Py_UNICODEOBJECT_H */