Bump version to 0.9.1.
[python/dscho.git] / Include / unicodeobject.h
blobe88b8ed3399a32aaf16e455a24078fe6e90a5721
1 #ifndef Py_UNICODEOBJECT_H
2 #define Py_UNICODEOBJECT_H
4 /*
6 Unicode implementation based on original code by Fredrik Lundh,
7 modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
8 Unicode Integration Proposal (see file Misc/unicode.txt).
10 Copyright (c) Corporation for National Research Initiatives.
13 Original header:
14 --------------------------------------------------------------------
16 * Yet another Unicode string type for Python. This type supports the
17 * 16-bit Basic Multilingual Plane (BMP) only.
19 * Written by Fredrik Lundh, January 1999.
21 * Copyright (c) 1999 by Secret Labs AB.
22 * Copyright (c) 1999 by Fredrik Lundh.
24 * fredrik@pythonware.com
25 * http://www.pythonware.com
27 * --------------------------------------------------------------------
28 * This Unicode String Type is
30 * Copyright (c) 1999 by Secret Labs AB
31 * Copyright (c) 1999 by Fredrik Lundh
33 * By obtaining, using, and/or copying this software and/or its
34 * associated documentation, you agree that you have read, understood,
35 * and will comply with the following terms and conditions:
37 * Permission to use, copy, modify, and distribute this software and its
38 * associated documentation for any purpose and without fee is hereby
39 * granted, provided that the above copyright notice appears in all
40 * copies, and that both that copyright notice and this permission notice
41 * appear in supporting documentation, and that the name of Secret Labs
42 * AB or the author not be used in advertising or publicity pertaining to
43 * distribution of the software without specific, written prior
44 * permission.
46 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
47 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
48 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
49 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
50 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
51 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
52 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
53 * -------------------------------------------------------------------- */
55 #include "ctype.h"
57 /* === Internal API ======================================================= */
59 /* --- Internal Unicode Format -------------------------------------------- */
61 /* Set these flags if the platform has "wchar.h", "wctype.h" and the
62 wchar_t type is a 16-bit unsigned type */
63 /* #define HAVE_WCHAR_H */
64 /* #define HAVE_USABLE_WCHAR_T */
66 /* Defaults for various platforms */
67 #ifndef HAVE_USABLE_WCHAR_T
69 /* Windows has a usable wchar_t type */
70 # if defined(MS_WIN32)
71 # define HAVE_USABLE_WCHAR_T
72 # endif
74 #endif
76 /* If the compiler provides a wchar_t type we try to support it
77 through the interface functions PyUnicode_FromWideChar() and
78 PyUnicode_AsWideChar(). */
80 #ifdef HAVE_USABLE_WCHAR_T
81 # ifndef HAVE_WCHAR_H
82 # define HAVE_WCHAR_H
83 # endif
84 #endif
86 #ifdef HAVE_WCHAR_H
87 /* Work around a cosmetic bug in BSDI 4.x wchar.h; thanks to Thomas Wouters */
88 # ifdef _HAVE_BSDI
89 # include <time.h>
90 # endif
91 # include "wchar.h"
92 #endif
94 #ifdef HAVE_USABLE_WCHAR_T
96 /* If the compiler defines wchar_t as a 16-bit unsigned type we can
97 use the compiler type directly. Works fine with all modern Windows
98 platforms. */
100 typedef wchar_t Py_UNICODE;
102 #else
104 /* Use if you have a standard ANSI compiler, without wchar_t support.
105 If a short is not 16 bits on your platform, you have to fix the
106 typedef below, or the module initialization code will complain. */
108 typedef unsigned short Py_UNICODE;
110 #endif
113 * Use this typedef when you need to represent a UTF-16 surrogate pair
114 * as a single unsigned integer.
116 #if SIZEOF_INT >= 4
117 typedef unsigned int Py_UCS4;
118 #elif SIZEOF_LONG >= 4
119 typedef unsigned long Py_UCS4;
120 #endif
123 /* --- Internal Unicode Operations ---------------------------------------- */
125 /* If you want Python to use the compiler's wctype.h functions instead
126 of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
127 configure Python using --with-ctype-functions. This reduces the
128 interpreter's code size. */
130 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
132 #include "wctype.h"
134 #define Py_UNICODE_ISSPACE(ch) iswspace(ch)
136 #define Py_UNICODE_ISLOWER(ch) iswlower(ch)
137 #define Py_UNICODE_ISUPPER(ch) iswupper(ch)
138 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
139 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
141 #define Py_UNICODE_TOLOWER(ch) towlower(ch)
142 #define Py_UNICODE_TOUPPER(ch) towupper(ch)
143 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
145 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
146 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
147 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
149 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
150 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
151 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
153 #define Py_UNICODE_ISALPHA(ch) iswalpha(ch)
155 #else
157 #define Py_UNICODE_ISSPACE(ch) _PyUnicode_IsWhitespace(ch)
159 #define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
160 #define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
161 #define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
162 #define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
164 #define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
165 #define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
166 #define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
168 #define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
169 #define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
170 #define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
172 #define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
173 #define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
174 #define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
176 #define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
178 #endif
180 #define Py_UNICODE_ISALNUM(ch) \
181 (Py_UNICODE_ISALPHA(ch) || \
182 Py_UNICODE_ISDECIMAL(ch) || \
183 Py_UNICODE_ISDIGIT(ch) || \
184 Py_UNICODE_ISNUMERIC(ch))
/* Copy `length` Py_UNICODE elements from `source` into `target` using
   memcpy(); the byte count is the element count scaled by
   sizeof(Py_UNICODE).  The expression yields memcpy()'s return value. */
#define Py_UNICODE_COPY(target, source, length) \
    (memcpy((target), (source), sizeof(Py_UNICODE) * (length)))
/* Store `value` into the first `length` elements of the Py_UNICODE
   buffer `target`.

   Each argument is evaluated exactly once, and the loop state lives in
   trailing-underscore locals.  This keeps an argument expression that
   mentions a caller variable named `i` from being captured by the
   macro's own counter, and keeps side effects in the arguments from
   being repeated on every iteration. */
#define Py_UNICODE_FILL(target, value, length) do\
    {Py_UNICODE *t_ = (target); Py_UNICODE v_ = (value);\
     int n_ = (length); int i_;\
     for (i_ = 0; i_ < n_; i_++) t_[i_] = v_;}\
    while (0)
/* Evaluates to 1 when `substring` occurs in `string` starting at element
   index `offset`, and to 0 otherwise.  The first elements are compared
   directly; only if they agree is memcmp() run over substring->length
   elements.  No bounds checking is done here. */
#define Py_UNICODE_MATCH(string, offset, substring) \
    (((string)->str[(offset)] == (substring)->str[0]) && \
     memcmp(&(string)->str[(offset)], (substring)->str, \
            sizeof(Py_UNICODE) * (substring)->length) == 0)
198 #ifdef __cplusplus
199 extern "C" {
200 #endif
202 /* --- Unicode Type ------------------------------------------------------- */
204 typedef struct {
205 PyObject_HEAD
206 int length; /* Length of raw Unicode data in buffer */
207 Py_UNICODE *str; /* Raw Unicode buffer */
208 long hash; /* Hash value; -1 if not set */
209 PyObject *defenc; /* (Default) Encoded version as Python
210 string, or NULL; this is used for
211 implementing the buffer protocol */
212 } PyUnicodeObject;
214 extern DL_IMPORT(PyTypeObject) PyUnicode_Type;
/* Non-zero when op's ob_type pointer is exactly &PyUnicode_Type. */
#define PyUnicode_Check(op) \
    ((op)->ob_type == &PyUnicode_Type)
218 /* Fast access macros */
219 #define PyUnicode_GET_SIZE(op) \
220 (((PyUnicodeObject *)(op))->length)
221 #define PyUnicode_GET_DATA_SIZE(op) \
222 (((PyUnicodeObject *)(op))->length * sizeof(Py_UNICODE))
223 #define PyUnicode_AS_UNICODE(op) \
224 (((PyUnicodeObject *)(op))->str)
225 #define PyUnicode_AS_DATA(op) \
226 ((const char *)((PyUnicodeObject *)(op))->str)
228 /* --- Constants ---------------------------------------------------------- */
230 /* This Unicode character will be used as replacement character during
231 decoding if the errors argument is set to "replace". Note: the
232 Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
233 Unicode 3.0. */
235 #define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UNICODE) 0xFFFD)
237 /* === Public API ========================================================= */
239 /* --- Plain Py_UNICODE --------------------------------------------------- */
241 /* Create a Unicode Object from the Py_UNICODE buffer u of the given
242 size. u may be NULL which causes the contents to be undefined. It
243 is the user's responsibility to fill in the needed data.
245 The buffer is copied into the new object. */
247 extern DL_IMPORT(PyObject*) PyUnicode_FromUnicode(
248 const Py_UNICODE *u, /* Unicode buffer */
249 int size /* size of buffer */
252 /* Return a read-only pointer to the Unicode object's internal
253 Py_UNICODE buffer. */
255 extern DL_IMPORT(Py_UNICODE *) PyUnicode_AsUnicode(
256 PyObject *unicode /* Unicode object */
259 /* Get the length of the Unicode object. */
261 extern DL_IMPORT(int) PyUnicode_GetSize(
262 PyObject *unicode /* Unicode object */
265 /* Resize an already allocated Unicode object to the new size length.
267 *unicode is modified to point to the new (resized) object and 0
268 returned on success.
270 This API may only be called by the function which also called the
271 Unicode constructor. The refcount on the object must be 1. Otherwise,
272 an error is returned.
274 Error handling is implemented as follows: an exception is set, -1
275 is returned and *unicode left untouched.
279 extern DL_IMPORT(int) PyUnicode_Resize(
280 PyObject **unicode, /* Pointer to the Unicode object */
281 int length /* New length */
284 /* Coerce obj to a Unicode object and return a reference with
285 *incremented* refcount.
287 Coercion is done in the following way:
289 1. Unicode objects are passed back as-is with incremented
290 refcount.
292 2. String and other char buffer compatible objects are decoded
293 under the assumptions that they contain data using the current
294 default encoding. Decoding is done in "strict" mode.
296 3. All other objects raise an exception.
298 The API returns NULL in case of an error. The caller is responsible
299 for decref'ing the returned objects.
303 extern DL_IMPORT(PyObject*) PyUnicode_FromEncodedObject(
304 register PyObject *obj, /* Object */
305 const char *encoding, /* encoding */
306 const char *errors /* error handling */
309 /* Shortcut for PyUnicode_FromEncodedObject(obj, NULL, "strict");
310 which results in using the default encoding as basis for
311 decoding the object.
313 Coerces obj to a Unicode object and returns a reference with
314 *incremented* refcount.
316 The API returns NULL in case of an error. The caller is responsible
317 for decref'ing the returned objects.
321 extern DL_IMPORT(PyObject*) PyUnicode_FromObject(
322 register PyObject *obj /* Object */
325 /* --- wchar_t support for platforms which support it --------------------- */
327 #ifdef HAVE_WCHAR_H
329 /* Create a Unicode Object from the wchar_t buffer w of the given
330 size.
332 The buffer is copied into the new object. */
334 extern DL_IMPORT(PyObject*) PyUnicode_FromWideChar(
335 register const wchar_t *w, /* wchar_t buffer */
336 int size /* size of buffer */
339 /* Copies the Unicode Object contents into the wchar_t buffer w. At
340 most size wchar_t characters are copied.
342 Returns the number of wchar_t characters copied or -1 in case of an
343 error. */
345 extern DL_IMPORT(int) PyUnicode_AsWideChar(
346 PyUnicodeObject *unicode, /* Unicode object */
347 register wchar_t *w, /* wchar_t buffer */
348 int size /* size of buffer */
351 #endif
353 /* === Builtin Codecs =====================================================
355 Many of these APIs take two arguments encoding and errors. These
356 parameters encoding and errors have the same semantics as the ones
357 of the builtin unicode() API.
359 Setting encoding to NULL causes the default encoding to be used.
361 Error handling is set by errors which may also be set to NULL
362 meaning to use the default handling defined for the codec. Default
363 error handling for all builtin codecs is "strict" (ValueErrors are
364 raised).
366 The codecs all use a similar interface. Only deviations from the
367 generic ones are documented.
371 /* --- Manage the default encoding ---------------------------------------- */
373 /* Returns the currently active default encoding.
375 The default encoding is currently implemented as run-time settable
376 process global. This may change in future versions of the
377 interpreter to become a parameter which is managed on a per-thread
378 basis.
382 extern DL_IMPORT(const char*) PyUnicode_GetDefaultEncoding(void);
384 /* Sets the currently active default encoding.
386 Returns 0 on success, -1 in case of an error.
390 extern DL_IMPORT(int) PyUnicode_SetDefaultEncoding(
391 const char *encoding /* Encoding name in standard form */
394 /* --- Generic Codecs ----------------------------------------------------- */
396 /* Create a Unicode object by decoding the encoded string s of the
397 given size. */
399 extern DL_IMPORT(PyObject*) PyUnicode_Decode(
400 const char *s, /* encoded string */
401 int size, /* size of buffer */
402 const char *encoding, /* encoding */
403 const char *errors /* error handling */
406 /* Encodes a Py_UNICODE buffer of the given size and returns a
407 Python string object. */
409 extern DL_IMPORT(PyObject*) PyUnicode_Encode(
410 const Py_UNICODE *s, /* Unicode char buffer */
411 int size, /* number of Py_UNICODE chars to encode */
412 const char *encoding, /* encoding */
413 const char *errors /* error handling */
416 /* Encodes a Unicode object and returns the result as Python string
417 object. */
419 extern DL_IMPORT(PyObject*) PyUnicode_AsEncodedString(
420 PyObject *unicode, /* Unicode object */
421 const char *encoding, /* encoding */
422 const char *errors /* error handling */
425 /* --- UTF-8 Codecs ------------------------------------------------------- */
427 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF8(
428 const char *string, /* UTF-8 encoded string */
429 int length, /* size of string */
430 const char *errors /* error handling */
433 extern DL_IMPORT(PyObject*) PyUnicode_AsUTF8String(
434 PyObject *unicode /* Unicode object */
437 extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
438 const Py_UNICODE *data, /* Unicode char buffer */
439 int length, /* number of Py_UNICODE chars to encode */
440 const char *errors /* error handling */
443 /* --- UTF-16 Codecs ------------------------------------------------------ */
445 /* Decodes length bytes from a UTF-16 encoded buffer string and returns
446 the corresponding Unicode object.
448 errors (if non-NULL) defines the error handling. It defaults
449 to "strict".
451 If byteorder is non-NULL, the decoder starts decoding using the
452 given byte order:
454 *byteorder == -1: little endian
455 *byteorder == 0: native order
456 *byteorder == 1: big endian
458 and then switches according to all BOM marks it finds in the input
459 data. BOM marks are not copied into the resulting Unicode string.
460 After completion, *byteorder is set to the current byte order at
461 the end of input data.
463 If byteorder is NULL, the codec starts in native order mode.
467 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUTF16(
468 const char *string, /* UTF-16 encoded string */
469 int length, /* size of string */
470 const char *errors, /* error handling */
471 int *byteorder /* pointer to byteorder to use
472 0=native;-1=LE,1=BE; updated on
473 exit */
476 /* Returns a Python string using the UTF-16 encoding in native byte
477 order. The string always starts with a BOM mark. */
479 extern DL_IMPORT(PyObject*) PyUnicode_AsUTF16String(
480 PyObject *unicode /* Unicode object */
483 /* Returns a Python string object holding the UTF-16 encoded value of
484 the Unicode data.
486 If byteorder is not 0, output is written according to the following
487 byte order:
489 byteorder == -1: little endian
490 byteorder == 0: native byte order (writes a BOM mark)
491 byteorder == 1: big endian
493 If byteorder is 0, the output string will always start with the
494 Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
495 prepended.
497 Note that Py_UNICODE data is being interpreted as UTF-16 reduced to
498 UCS-2. This trick makes it possible to add full UTF-16 capabilities
499 at a later point without compromising the APIs.
503 extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF16(
504 const Py_UNICODE *data, /* Unicode char buffer */
505 int length, /* number of Py_UNICODE chars to encode */
506 const char *errors, /* error handling */
507 int byteorder /* byteorder to use 0=BOM+native;-1=LE,1=BE */
510 /* --- Unicode-Escape Codecs ---------------------------------------------- */
512 extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscape(
513 const char *string, /* Unicode-Escape encoded string */
514 int length, /* size of string */
515 const char *errors /* error handling */
518 extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString(
519 PyObject *unicode /* Unicode object */
522 extern DL_IMPORT(PyObject*) PyUnicode_EncodeUnicodeEscape(
523 const Py_UNICODE *data, /* Unicode char buffer */
524 int length /* Number of Py_UNICODE chars to encode */
527 /* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
529 extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
530 const char *string, /* Raw-Unicode-Escape encoded string */
531 int length, /* size of string */
532 const char *errors /* error handling */
535 extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
536 PyObject *unicode /* Unicode object */
539 extern DL_IMPORT(PyObject*) PyUnicode_EncodeRawUnicodeEscape(
540 const Py_UNICODE *data, /* Unicode char buffer */
541 int length /* Number of Py_UNICODE chars to encode */
544 /* --- Latin-1 Codecs -----------------------------------------------------
546 Note: Latin-1 corresponds to the first 256 Unicode ordinals.
550 extern DL_IMPORT(PyObject*) PyUnicode_DecodeLatin1(
551 const char *string, /* Latin-1 encoded string */
552 int length, /* size of string */
553 const char *errors /* error handling */
556 extern DL_IMPORT(PyObject*) PyUnicode_AsLatin1String(
557 PyObject *unicode /* Unicode object */
560 extern DL_IMPORT(PyObject*) PyUnicode_EncodeLatin1(
561 const Py_UNICODE *data, /* Unicode char buffer */
562 int length, /* Number of Py_UNICODE chars to encode */
563 const char *errors /* error handling */
566 /* --- ASCII Codecs -------------------------------------------------------
568 Only 7-bit ASCII data is accepted. All other codes generate errors.
572 extern DL_IMPORT(PyObject*) PyUnicode_DecodeASCII(
573 const char *string, /* ASCII encoded string */
574 int length, /* size of string */
575 const char *errors /* error handling */
578 extern DL_IMPORT(PyObject*) PyUnicode_AsASCIIString(
579 PyObject *unicode /* Unicode object */
582 extern DL_IMPORT(PyObject*) PyUnicode_EncodeASCII(
583 const Py_UNICODE *data, /* Unicode char buffer */
584 int length, /* Number of Py_UNICODE chars to encode */
585 const char *errors /* error handling */
588 /* --- Character Map Codecs -----------------------------------------------
590 This codec uses mappings to encode and decode characters.
592 Decoding mappings must map single string characters to single
593 Unicode characters, integers (which are then interpreted as Unicode
594 ordinals) or None (meaning "undefined mapping" and causing an
595 error).
597 Encoding mappings must map single Unicode characters to single
598 string characters, integers (which are then interpreted as Latin-1
599 ordinals) or None (meaning "undefined mapping" and causing an
600 error).
602 If a character lookup fails with a LookupError, the character is
603 copied as-is meaning that its ordinal value will be interpreted as
604 Unicode or Latin-1 ordinal resp. Because of this mappings only need
605 to contain those mappings which map characters to different code
606 points.
610 extern DL_IMPORT(PyObject*) PyUnicode_DecodeCharmap(
611 const char *string, /* Encoded string */
612 int length, /* size of string */
613 PyObject *mapping, /* character mapping
614 (char ordinal -> unicode ordinal) */
615 const char *errors /* error handling */
618 extern DL_IMPORT(PyObject*) PyUnicode_AsCharmapString(
619 PyObject *unicode, /* Unicode object */
620 PyObject *mapping /* character mapping
621 (unicode ordinal -> char ordinal) */
624 extern DL_IMPORT(PyObject*) PyUnicode_EncodeCharmap(
625 const Py_UNICODE *data, /* Unicode char buffer */
626 int length, /* Number of Py_UNICODE chars to encode */
627 PyObject *mapping, /* character mapping
628 (unicode ordinal -> char ordinal) */
629 const char *errors /* error handling */
632 /* Translate a Py_UNICODE buffer of the given length by applying a
633 character mapping table to it and return the resulting Unicode
634 object.
636 The mapping table must map Unicode ordinal integers to Unicode
637 ordinal integers or None (causing deletion of the character).
639 Mapping tables may be dictionaries or sequences. Unmapped character
640 ordinals (ones which cause a LookupError) are left untouched and
641 are copied as-is.
645 extern DL_IMPORT(PyObject *) PyUnicode_TranslateCharmap(
646 const Py_UNICODE *data, /* Unicode char buffer */
647 int length, /* Number of Py_UNICODE chars to encode */
648 PyObject *table, /* Translate table */
649 const char *errors /* error handling */
652 #ifdef MS_WIN32
654 /* --- MBCS codecs for Windows -------------------------------------------- */
656 extern DL_IMPORT(PyObject*) PyUnicode_DecodeMBCS(
657 const char *string, /* MBCS encoded string */
658 int length, /* size of string */
659 const char *errors /* error handling */
662 extern DL_IMPORT(PyObject*) PyUnicode_AsMBCSString(
663 PyObject *unicode /* Unicode object */
666 extern DL_IMPORT(PyObject*) PyUnicode_EncodeMBCS(
667 const Py_UNICODE *data, /* Unicode char buffer */
668 int length, /* Number of Py_UNICODE chars to encode */
669 const char *errors /* error handling */
672 #endif /* MS_WIN32 */
674 /* --- Decimal Encoder ---------------------------------------------------- */
676 /* Takes a Unicode string holding a decimal value and writes it into
677 an output buffer using standard ASCII digit codes.
679 The output buffer has to provide at least length+1 bytes of storage
680 area. The output string is 0-terminated.
682 The encoder converts whitespace to ' ', decimal characters to their
683 corresponding ASCII digit and all other Latin-1 characters except
684 \0 as-is. Characters outside this range (Unicode ordinals 1-255)
685 are treated as errors. This includes embedded NULL bytes.
687 Error handling is defined by the errors argument:
689 NULL or "strict": raise a ValueError
690 "ignore": ignore the wrong characters (these are not copied to the
691 output buffer)
692 "replace": replaces illegal characters with '?'
694 Returns 0 on success, -1 on failure.
698 extern DL_IMPORT(int) PyUnicode_EncodeDecimal(
699 Py_UNICODE *s, /* Unicode buffer */
700 int length, /* Number of Py_UNICODE chars to encode */
701 char *output, /* Output buffer; must have size >= length+1 (0-terminated) */
702 const char *errors /* error handling */
705 /* --- Methods & Slots ----------------------------------------------------
707 These are capable of handling Unicode objects and strings on input
708 (we refer to them as strings in the descriptions) and return
709 Unicode objects or integers as appropriate. */
711 /* Concat two strings giving a new Unicode string. */
713 extern DL_IMPORT(PyObject*) PyUnicode_Concat(
714 PyObject *left, /* Left string */
715 PyObject *right /* Right string */
718 /* Split a string giving a list of Unicode strings.
720 If sep is NULL, splitting will be done at all whitespace
721 substrings. Otherwise, splits occur at the given separator.
723 At most maxsplit splits will be done. If negative, no limit is set.
725 Separators are not included in the resulting list.
729 extern DL_IMPORT(PyObject*) PyUnicode_Split(
730 PyObject *s, /* String to split */
731 PyObject *sep, /* String separator */
732 int maxsplit /* Maxsplit count */
735 /* Ditto, but split at line breaks.
737 CRLF is considered to be one line break. Line breaks are not
738 included in the resulting list. */
740 extern DL_IMPORT(PyObject*) PyUnicode_Splitlines(
741 PyObject *s, /* String to split */
742 int keepends /* If true, line end markers are included */
745 /* Translate a string by applying a character mapping table to it and
746 return the resulting Unicode object.
748 The mapping table must map Unicode ordinal integers to Unicode
749 ordinal integers or None (causing deletion of the character).
751 Mapping tables may be dictionaries or sequences. Unmapped character
752 ordinals (ones which cause a LookupError) are left untouched and
753 are copied as-is.
757 extern DL_IMPORT(PyObject *) PyUnicode_Translate(
758 PyObject *str, /* String */
759 PyObject *table, /* Translate table */
760 const char *errors /* error handling */
763 /* Join a sequence of strings using the given separator and return
764 the resulting Unicode string. */
766 extern DL_IMPORT(PyObject*) PyUnicode_Join(
767 PyObject *separator, /* Separator string */
768 PyObject *seq /* Sequence object */
771 /* Return 1 if substr matches str[start:end] at the given tail end, 0
772 otherwise. */
774 extern DL_IMPORT(int) PyUnicode_Tailmatch(
775 PyObject *str, /* String */
776 PyObject *substr, /* Prefix or Suffix string */
777 int start, /* Start index */
778 int end, /* Stop index */
779 int direction /* Tail end: -1 prefix, +1 suffix */
782 /* Return the first position of substr in str[start:end] using the
783 given search direction or -1 if not found. */
785 extern DL_IMPORT(int) PyUnicode_Find(
786 PyObject *str, /* String */
787 PyObject *substr, /* Substring to find */
788 int start, /* Start index */
789 int end, /* Stop index */
790 int direction /* Find direction: +1 forward, -1 backward */
793 /* Count the number of occurrences of substr in str[start:end]. */
795 extern DL_IMPORT(int) PyUnicode_Count(
796 PyObject *str, /* String */
797 PyObject *substr, /* Substring to count */
798 int start, /* Start index */
799 int end /* Stop index */
802 /* Replace at most maxcount occurrences of substr in str with replstr
803 and return the resulting Unicode object. */
805 extern DL_IMPORT(PyObject *) PyUnicode_Replace(
806 PyObject *str, /* String */
807 PyObject *substr, /* Substring to find */
808 PyObject *replstr, /* Substring to replace */
809 int maxcount /* Max. number of replacements to apply;
810 -1 = all */
813 /* Compare two strings and return -1, 0, 1 for less than, equal,
814 greater than resp. */
816 extern DL_IMPORT(int) PyUnicode_Compare(
817 PyObject *left, /* Left string */
818 PyObject *right /* Right string */
821 /* Apply an argument tuple or dictionary to a format string and return
822 the resulting Unicode string. */
824 extern DL_IMPORT(PyObject *) PyUnicode_Format(
825 PyObject *format, /* Format string */
826 PyObject *args /* Argument tuple or dictionary */
829 /* Checks whether element is contained in container and return 1/0
830 accordingly.
832 element has to coerce to a one-element Unicode string. -1 is
833 returned in case of an error. */
835 extern DL_IMPORT(int) PyUnicode_Contains(
836 PyObject *container, /* Container string */
837 PyObject *element /* Element string */
840 /* === Characters Type APIs =============================================== */
842 /* These should not be used directly. Use the Py_UNICODE_IS* and
843 Py_UNICODE_TO* macros instead.
845 These APIs are implemented in Objects/unicodectype.c.
849 extern DL_IMPORT(int) _PyUnicode_IsLowercase(
850 register const Py_UNICODE ch /* Unicode character */
853 extern DL_IMPORT(int) _PyUnicode_IsUppercase(
854 register const Py_UNICODE ch /* Unicode character */
857 extern DL_IMPORT(int) _PyUnicode_IsTitlecase(
858 register const Py_UNICODE ch /* Unicode character */
861 extern DL_IMPORT(int) _PyUnicode_IsWhitespace(
862 register const Py_UNICODE ch /* Unicode character */
865 extern DL_IMPORT(int) _PyUnicode_IsLinebreak(
866 register const Py_UNICODE ch /* Unicode character */
869 extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToLowercase(
870 register const Py_UNICODE ch /* Unicode character */
873 extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToUppercase(
874 register const Py_UNICODE ch /* Unicode character */
877 extern DL_IMPORT(Py_UNICODE) _PyUnicode_ToTitlecase(
878 register const Py_UNICODE ch /* Unicode character */
881 extern DL_IMPORT(int) _PyUnicode_ToDecimalDigit(
882 register const Py_UNICODE ch /* Unicode character */
885 extern DL_IMPORT(int) _PyUnicode_ToDigit(
886 register const Py_UNICODE ch /* Unicode character */
889 extern DL_IMPORT(double) _PyUnicode_ToNumeric(
890 register const Py_UNICODE ch /* Unicode character */
893 extern DL_IMPORT(int) _PyUnicode_IsDecimalDigit(
894 register const Py_UNICODE ch /* Unicode character */
897 extern DL_IMPORT(int) _PyUnicode_IsDigit(
898 register const Py_UNICODE ch /* Unicode character */
901 extern DL_IMPORT(int) _PyUnicode_IsNumeric(
902 register const Py_UNICODE ch /* Unicode character */
905 extern DL_IMPORT(int) _PyUnicode_IsAlpha(
906 register const Py_UNICODE ch /* Unicode character */
909 #ifdef __cplusplus
911 #endif
912 #endif /* !Py_UNICODEOBJECT_H */