Move setting of ioready 'wait' earlier in call chain, to
[python/dscho.git] / Modules / unicodedata.c
blobd266ad7e0b49b8a98ad3455c2744e979d1e3b7a2
1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 3.2 data base.
5 Data was extracted from the Unicode 3.2 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
15 #include "Python.h"
16 #include "ucnhash.h"
18 /* character properties */
/* One row of per-character properties.  The generated database header
   supplies the table of these records. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
29 /* data file generated by Tools/unicode/makeunicodedata.py */
30 #include "unicodedata_db.h"
32 static const _PyUnicode_DatabaseRecord*
33 _getrecord_ex(Py_UCS4 code)
35 int index;
36 if (code >= 0x110000)
37 index = 0;
38 else {
39 index = index1[(code>>SHIFT)];
40 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
43 return &_PyUnicode_Database_Records[index];
46 static const _PyUnicode_DatabaseRecord*
47 _getrecord(PyUnicodeObject* v)
49 return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
52 /* --- Module API --------------------------------------------------------- */
54 static PyObject *
55 unicodedata_decimal(PyObject *self, PyObject *args)
57 PyUnicodeObject *v;
58 PyObject *defobj = NULL;
59 long rc;
61 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
62 return NULL;
63 if (PyUnicode_GET_SIZE(v) != 1) {
64 PyErr_SetString(PyExc_TypeError,
65 "need a single Unicode character as parameter");
66 return NULL;
68 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
69 if (rc < 0) {
70 if (defobj == NULL) {
71 PyErr_SetString(PyExc_ValueError,
72 "not a decimal");
73 return NULL;
75 else {
76 Py_INCREF(defobj);
77 return defobj;
80 return PyInt_FromLong(rc);
83 static PyObject *
84 unicodedata_digit(PyObject *self, PyObject *args)
86 PyUnicodeObject *v;
87 PyObject *defobj = NULL;
88 long rc;
90 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
91 return NULL;
92 if (PyUnicode_GET_SIZE(v) != 1) {
93 PyErr_SetString(PyExc_TypeError,
94 "need a single Unicode character as parameter");
95 return NULL;
97 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
98 if (rc < 0) {
99 if (defobj == NULL) {
100 PyErr_SetString(PyExc_ValueError, "not a digit");
101 return NULL;
103 else {
104 Py_INCREF(defobj);
105 return defobj;
108 return PyInt_FromLong(rc);
111 static PyObject *
112 unicodedata_numeric(PyObject *self, PyObject *args)
114 PyUnicodeObject *v;
115 PyObject *defobj = NULL;
116 double rc;
118 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
119 return NULL;
120 if (PyUnicode_GET_SIZE(v) != 1) {
121 PyErr_SetString(PyExc_TypeError,
122 "need a single Unicode character as parameter");
123 return NULL;
125 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
126 if (rc < 0) {
127 if (defobj == NULL) {
128 PyErr_SetString(PyExc_ValueError, "not a numeric character");
129 return NULL;
131 else {
132 Py_INCREF(defobj);
133 return defobj;
136 return PyFloat_FromDouble(rc);
139 static PyObject *
140 unicodedata_category(PyObject *self, PyObject *args)
142 PyUnicodeObject *v;
143 int index;
145 if (!PyArg_ParseTuple(args, "O!:category",
146 &PyUnicode_Type, &v))
147 return NULL;
148 if (PyUnicode_GET_SIZE(v) != 1) {
149 PyErr_SetString(PyExc_TypeError,
150 "need a single Unicode character as parameter");
151 return NULL;
153 index = (int) _getrecord(v)->category;
154 return PyString_FromString(_PyUnicode_CategoryNames[index]);
157 static PyObject *
158 unicodedata_bidirectional(PyObject *self, PyObject *args)
160 PyUnicodeObject *v;
161 int index;
163 if (!PyArg_ParseTuple(args, "O!:bidirectional",
164 &PyUnicode_Type, &v))
165 return NULL;
166 if (PyUnicode_GET_SIZE(v) != 1) {
167 PyErr_SetString(PyExc_TypeError,
168 "need a single Unicode character as parameter");
169 return NULL;
171 index = (int) _getrecord(v)->bidirectional;
172 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
175 static PyObject *
176 unicodedata_combining(PyObject *self, PyObject *args)
178 PyUnicodeObject *v;
180 if (!PyArg_ParseTuple(args, "O!:combining",
181 &PyUnicode_Type, &v))
182 return NULL;
183 if (PyUnicode_GET_SIZE(v) != 1) {
184 PyErr_SetString(PyExc_TypeError,
185 "need a single Unicode character as parameter");
186 return NULL;
188 return PyInt_FromLong((int) _getrecord(v)->combining);
191 static PyObject *
192 unicodedata_mirrored(PyObject *self, PyObject *args)
194 PyUnicodeObject *v;
196 if (!PyArg_ParseTuple(args, "O!:mirrored",
197 &PyUnicode_Type, &v))
198 return NULL;
199 if (PyUnicode_GET_SIZE(v) != 1) {
200 PyErr_SetString(PyExc_TypeError,
201 "need a single Unicode character as parameter");
202 return NULL;
204 return PyInt_FromLong((int) _getrecord(v)->mirrored);
207 static PyObject *
208 unicodedata_decomposition(PyObject *self, PyObject *args)
210 PyUnicodeObject *v;
211 char decomp[256];
212 int code, index, count, i;
214 if (!PyArg_ParseTuple(args, "O!:decomposition",
215 &PyUnicode_Type, &v))
216 return NULL;
217 if (PyUnicode_GET_SIZE(v) != 1) {
218 PyErr_SetString(PyExc_TypeError,
219 "need a single Unicode character as parameter");
220 return NULL;
223 code = (int) *PyUnicode_AS_UNICODE(v);
225 if (code < 0 || code >= 0x110000)
226 index = 0;
227 else {
228 index = decomp_index1[(code>>DECOMP_SHIFT)];
229 index = decomp_index2[(index<<DECOMP_SHIFT)+
230 (code&((1<<DECOMP_SHIFT)-1))];
233 /* high byte is number of hex bytes (usually one or two), low byte
234 is prefix code (from*/
235 count = decomp_data[index] >> 8;
237 /* XXX: could allocate the PyString up front instead
238 (strlen(prefix) + 5 * count + 1 bytes) */
240 /* copy prefix */
241 i = strlen(decomp_prefix[decomp_data[index] & 255]);
242 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
244 while (count-- > 0) {
245 if (i)
246 decomp[i++] = ' ';
247 assert((size_t)i < sizeof(decomp));
248 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
249 decomp_data[++index]);
250 i += strlen(decomp + i);
253 decomp[i] = '\0';
255 return PyString_FromString(decomp);
258 void
259 get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
261 if (code >= 0x110000) {
262 *index = 0;
264 else {
265 *index = decomp_index1[(code>>DECOMP_SHIFT)];
266 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
267 (code&((1<<DECOMP_SHIFT)-1))];
270 /* high byte is number of hex bytes (usually one or two), low byte
271 is prefix code (from*/
272 *count = decomp_data[*index] >> 8;
273 *prefix = decomp_data[*index] & 255;
275 (*index)++;
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one below the first trailing consonant */
#define LCount  19                  /* number of leading consonants */
#define VCount  21                  /* number of vowels */
#define TCount  28                  /* trailing consonants, incl. "none" */
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
288 static PyObject*
289 nfd_nfkd(PyObject *input, int k)
291 PyObject *result;
292 Py_UNICODE *i, *end, *o;
293 /* Longest decomposition in Unicode 3.2: U+FDFA */
294 Py_UNICODE stack[20];
295 int space, stackptr, isize;
296 int index, prefix, count;
297 unsigned char prev, cur;
299 stackptr = 0;
300 isize = PyUnicode_GET_SIZE(input);
301 /* Overallocate atmost 10 characters. */
302 space = (isize > 10 ? 10 : isize) + isize;
303 result = PyUnicode_FromUnicode(NULL, space);
304 if (!result)
305 return NULL;
306 i = PyUnicode_AS_UNICODE(input);
307 end = i + isize;
308 o = PyUnicode_AS_UNICODE(result);
310 while (i < end) {
311 stack[stackptr++] = *i++;
312 while(stackptr) {
313 Py_UNICODE code = stack[--stackptr];
314 if (!space) {
315 space = PyString_GET_SIZE(result) + 10;
316 if (PyUnicode_Resize(&result, space) == -1)
317 return NULL;
318 o = PyUnicode_AS_UNICODE(result) + space - 10;
319 space = 10;
321 /* Hangul Decomposition. */
322 if (SBase <= code && code < (SBase+SCount)) {
323 int SIndex = code - SBase;
324 int L = LBase + SIndex / NCount;
325 int V = VBase + (SIndex % NCount) / TCount;
326 int T = TBase + SIndex % TCount;
327 *o++ = L;
328 *o++ = V;
329 space -= 2;
330 if (T != TBase) {
331 *o++ = T;
332 space --;
334 continue;
336 /* Other decompoistions. */
337 get_decomp_record(code, &index, &prefix, &count);
339 /* Copy character if it is not decomposable, or has a
340 compatibility decomposition, but we do NFD. */
341 if (!count || (prefix && !k)) {
342 *o++ = code;
343 space--;
344 continue;
346 /* Copy decomposition onto the stack, in reverse
347 order. */
348 while(count) {
349 code = decomp_data[index + (--count)];
350 stack[stackptr++] = code;
355 /* Drop overallocation. Cannot fail. */
356 PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
358 /* Sort canonically. */
359 i = PyUnicode_AS_UNICODE(result);
360 prev = _getrecord_ex(*i)->combining;
361 end = i + PyUnicode_GET_SIZE(result);
362 for (i++; i < end; i++) {
363 cur = _getrecord_ex(*i)->combining;
364 if (prev == 0 || cur == 0 || prev <= cur) {
365 prev = cur;
366 continue;
368 /* Non-canonical order. Need to switch *i with previous. */
369 o = i - 1;
370 while (1) {
371 Py_UNICODE tmp = o[1];
372 o[1] = o[0];
373 o[0] = tmp;
374 o--;
375 if (o < PyUnicode_AS_UNICODE(result))
376 break;
377 prev = _getrecord_ex(*o)->combining;
378 if (prev == 0 || prev <= cur)
379 break;
381 prev = _getrecord_ex(*i)->combining;
383 return result;
386 static int
387 find_nfc_index(struct reindex* nfc, Py_UNICODE code)
389 int index;
390 for (index = 0; nfc[index].start; index++) {
391 int start = nfc[index].start;
392 if (code < start)
393 return -1;
394 if (code <= start + nfc[index].count) {
395 int delta = code - start;
396 return nfc[index].index + delta;
399 return -1;
402 static PyObject*
403 nfc_nfkc(PyObject *input, int k)
405 PyObject *result;
406 Py_UNICODE *i, *i1, *o, *end;
407 int f,l,index,index1,comb;
408 Py_UNICODE code;
409 Py_UNICODE *skipped[20];
410 int cskipped = 0;
412 result = nfd_nfkd(input, k);
413 if (!result)
414 return NULL;
416 /* We are going to modify result in-place.
417 If nfd_nfkd is changed to sometimes return the input,
418 this code needs to be reviewed. */
419 assert(result != input);
421 i = PyUnicode_AS_UNICODE(result);
422 end = i + PyUnicode_GET_SIZE(result);
423 o = PyUnicode_AS_UNICODE(result);
425 again:
426 while (i < end) {
427 for (index = 0; index < cskipped; index++) {
428 if (skipped[index] == i) {
429 /* *i character is skipped.
430 Remove from list. */
431 skipped[index] = skipped[cskipped-1];
432 cskipped--;
433 i++;
434 goto again; /* continue while */
437 /* Hangul Composition. We don't need to check for <LV,T>
438 pairs, since we always have decomposed data. */
439 if (LBase <= *i && *i < (LBase+LCount) &&
440 i + 1 < end &&
441 VBase <= i[1] && i[1] <= (VBase+VCount)) {
442 int LIndex, VIndex;
443 LIndex = i[0] - LBase;
444 VIndex = i[1] - VBase;
445 code = SBase + (LIndex*VCount+VIndex)*TCount;
446 i+=2;
447 if (i < end &&
448 TBase <= *i && *i <= (TBase+TCount)) {
449 code += *i-TBase;
450 i++;
452 *o++ = code;
453 continue;
456 f = find_nfc_index(nfc_first, *i);
457 if (f == -1) {
458 *o++ = *i++;
459 continue;
461 /* Find next unblocked character. */
462 i1 = i+1;
463 comb = 0;
464 while (i1 < end) {
465 int comb1 = _getrecord_ex(*i1)->combining;
466 if (comb1 && comb == comb1) {
467 /* Character is blocked. */
468 i1++;
469 continue;
471 l = find_nfc_index(nfc_last, *i1);
472 /* *i1 cannot be combined with *i. If *i1
473 is a starter, we don't need to look further.
474 Otherwise, record the combining class. */
475 if (l == -1) {
476 not_combinable:
477 if (comb1 == 0)
478 break;
479 comb = comb1;
480 i1++;
481 continue;
483 index = f*TOTAL_LAST + l;
484 index1 = comp_index[index >> COMP_SHIFT];
485 code = comp_data[(index1<<COMP_SHIFT)+
486 (index&((1<<COMP_SHIFT)-1))];
487 if (code == 0)
488 goto not_combinable;
490 /* Replace the original character. */
491 *i = code;
492 /* Mark the second character unused. */
493 skipped[cskipped++] = i1;
494 i1++;
495 f = find_nfc_index(nfc_first, *i);
496 if (f == -1)
497 break;
499 *o++ = *i++;
501 if (o != end)
502 PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
503 return result;
506 static PyObject*
507 unicodedata_normalize(PyObject *self, PyObject *args)
509 char *form;
510 PyObject *input;
512 if(!PyArg_ParseTuple(args, "sO!:normalized",
513 &form, &PyUnicode_Type, &input))
514 return NULL;
516 if (strcmp(form, "NFC") == 0)
517 return nfc_nfkc(input, 0);
518 if (strcmp(form, "NFKC") == 0)
519 return nfc_nfkc(input, 1);
520 if (strcmp(form, "NFD") == 0)
521 return nfd_nfkd(input, 0);
522 if (strcmp(form, "NFKD") == 0)
523 return nfd_nfkd(input, 1);
524 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
525 return NULL;
528 /* -------------------------------------------------------------------- */
529 /* unicode character name tables */
531 /* data file generated by Tools/unicode/makeunicodedata.py */
532 #include "unicodename_db.h"
534 /* -------------------------------------------------------------------- */
535 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded in as h = h * scale + byte; when
   the value spills into bits 24-31, those bits are XORed into the low
   byte and the hash is reduced back to 24 bits.

   The byte is converted to unsigned char *before* calling toupper():
   passing a negative plain char to toupper() is undefined behaviour. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Name fragments used to build and parse Hangul syllable names.
   Column 0: leading consonant, column 1: vowel, column 2: trailing
   consonant.  The columns have different lengths; unused cells are
   null pointers. */
static char *hangul_syllables[][3] = {
    /*  leading  vowel   trailing */
    {   "G",     "A",    ""    },
    {   "GG",    "AE",   "G"   },
    {   "N",     "YA",   "GG"  },
    {   "D",     "YAE",  "GS"  },
    {   "DD",    "EO",   "N"   },
    {   "R",     "E",    "NJ"  },
    {   "M",     "YEO",  "NH"  },
    {   "B",     "YE",   "D"   },
    {   "BB",    "O",    "L"   },
    {   "S",     "WA",   "LG"  },
    {   "SS",    "WAE",  "LM"  },
    {   "",      "OE",   "LB"  },
    {   "J",     "YO",   "LS"  },
    {   "JJ",    "U",    "LT"  },
    {   "C",     "WEO",  "LP"  },
    {   "K",     "WE",   "LH"  },
    {   "T",     "WI",   "M"   },
    {   "P",     "YU",   "B"   },
    {   "H",     "EU",   "BS"  },
    {   NULL,    "YI",   "S"   },
    {   NULL,    "I",    "SS"  },
    {   NULL,    NULL,   "NG"  },
    {   NULL,    NULL,   "J"   },
    {   NULL,    NULL,   "C"   },
    {   NULL,    NULL,   "K"   },
    {   NULL,    NULL,   "T"   },
    {   NULL,    NULL,   "P"   },
    {   NULL,    NULL,   "H"   }
};
583 static int
584 is_unified_ideograph(Py_UCS4 code)
586 return (
587 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
588 (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
589 (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
592 static int
593 _getucname(Py_UCS4 code, char* buffer, int buflen)
595 int offset;
596 int i;
597 int word;
598 unsigned char* w;
600 if (SBase <= code && code < SBase+SCount) {
601 /* Hangul syllable. */
602 int SIndex = code - SBase;
603 int L = SIndex / NCount;
604 int V = (SIndex % NCount) / TCount;
605 int T = SIndex % TCount;
607 if (buflen < 27)
608 /* Worst case: HANGUL SYLLABLE <10chars>. */
609 return 0;
610 strcpy(buffer, "HANGUL SYLLABLE ");
611 buffer += 16;
612 strcpy(buffer, hangul_syllables[L][0]);
613 buffer += strlen(hangul_syllables[L][0]);
614 strcpy(buffer, hangul_syllables[V][1]);
615 buffer += strlen(hangul_syllables[V][1]);
616 strcpy(buffer, hangul_syllables[T][2]);
617 buffer += strlen(hangul_syllables[T][2]);
618 *buffer = '\0';
619 return 1;
622 if (is_unified_ideograph(code)) {
623 if (buflen < 28)
624 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
625 return 0;
626 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
627 return 1;
630 if (code >= 0x110000)
631 return 0;
633 /* get offset into phrasebook */
634 offset = phrasebook_offset1[(code>>phrasebook_shift)];
635 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
636 (code&((1<<phrasebook_shift)-1))];
637 if (!offset)
638 return 0;
640 i = 0;
642 for (;;) {
643 /* get word index */
644 word = phrasebook[offset] - phrasebook_short;
645 if (word >= 0) {
646 word = (word << 8) + phrasebook[offset+1];
647 offset += 2;
648 } else
649 word = phrasebook[offset++];
650 if (i) {
651 if (i > buflen)
652 return 0; /* buffer overflow */
653 buffer[i++] = ' ';
655 /* copy word string from lexicon. the last character in the
656 word has bit 7 set. the last word in a string ends with
657 0x80 */
658 w = lexicon + lexicon_offset[word];
659 while (*w < 128) {
660 if (i >= buflen)
661 return 0; /* buffer overflow */
662 buffer[i++] = *w++;
664 if (i >= buflen)
665 return 0; /* buffer overflow */
666 buffer[i++] = *w & 127;
667 if (*w == 128)
668 break; /* end of word */
671 return 1;
674 static int
675 _cmpname(int code, const char* name, int namelen)
677 /* check if code corresponds to the given name */
678 int i;
679 char buffer[NAME_MAXLEN];
680 if (!_getucname(code, buffer, sizeof(buffer)))
681 return 0;
682 for (i = 0; i < namelen; i++) {
683 if (toupper(name[i]) != buffer[i])
684 return 0;
686 return buffer[namelen] == '\0';
689 static void
690 find_syllable(const char *str, int *len, int *pos, int count, int column)
692 int i, len1;
693 *len = -1;
694 for (i = 0; i < count; i++) {
695 char *s = hangul_syllables[i][column];
696 len1 = strlen(s);
697 if (len1 <= *len)
698 continue;
699 if (strncmp(str, s, len1) == 0) {
700 *len = len1;
701 *pos = i;
704 if (*len == -1) {
705 *len = 0;
706 *pos = -1;
710 static int
711 _getcode(const char* name, int namelen, Py_UCS4* code)
713 unsigned int h, v;
714 unsigned int mask = code_size-1;
715 unsigned int i, incr;
717 /* Check for hangul syllables. */
718 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
719 int L, V, T, len;
720 const char *pos = name + 16;
721 find_syllable(pos, &len, &L, LCount, 0);
722 pos += len;
723 find_syllable(pos, &len, &V, VCount, 1);
724 pos += len;
725 find_syllable(pos, &len, &T, TCount, 2);
726 pos += len;
727 if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
728 *code = SBase + (L*VCount+V)*TCount + T;
729 return 1;
731 /* Otherwise, it's an illegal syllable name. */
732 return 0;
735 /* Check for unified ideographs. */
736 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
737 /* Four or five hexdigits must follow. */
738 v = 0;
739 name += 22;
740 namelen -= 22;
741 if (namelen != 4 && namelen != 5)
742 return 0;
743 while (namelen--) {
744 v *= 16;
745 if (*name >= '0' && *name <= '9')
746 v += *name - '0';
747 else if (*name >= 'A' && *name <= 'F')
748 v += *name - 'A' + 10;
749 else
750 return 0;
751 name++;
753 if (!is_unified_ideograph(v))
754 return 0;
755 *code = v;
756 return 1;
759 /* the following is the same as python's dictionary lookup, with
760 only minor changes. see the makeunicodedata script for more
761 details */
763 h = (unsigned int) _gethash(name, namelen, code_magic);
764 i = (~h) & mask;
765 v = code_hash[i];
766 if (!v)
767 return 0;
768 if (_cmpname(v, name, namelen)) {
769 *code = v;
770 return 1;
772 incr = (h ^ (h >> 3)) & mask;
773 if (!incr)
774 incr = mask;
775 for (;;) {
776 i = (i + incr) & mask;
777 v = code_hash[i];
778 if (!v)
779 return 0;
780 if (_cmpname(v, name, namelen)) {
781 *code = v;
782 return 1;
784 incr = incr << 1;
785 if (incr > mask)
786 incr = incr ^ code_poly;
790 static const _PyUnicode_Name_CAPI hashAPI =
792 sizeof(_PyUnicode_Name_CAPI),
793 _getucname,
794 _getcode
797 /* -------------------------------------------------------------------- */
798 /* Python bindings */
800 static PyObject *
801 unicodedata_name(PyObject* self, PyObject* args)
803 char name[NAME_MAXLEN];
805 PyUnicodeObject* v;
806 PyObject* defobj = NULL;
807 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
808 return NULL;
810 if (PyUnicode_GET_SIZE(v) != 1) {
811 PyErr_SetString(PyExc_TypeError,
812 "need a single Unicode character as parameter");
813 return NULL;
816 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
817 name, sizeof(name))) {
818 if (defobj == NULL) {
819 PyErr_SetString(PyExc_ValueError, "no such name");
820 return NULL;
822 else {
823 Py_INCREF(defobj);
824 return defobj;
828 return Py_BuildValue("s", name);
831 static PyObject *
832 unicodedata_lookup(PyObject* self, PyObject* args)
834 Py_UCS4 code;
835 Py_UNICODE str[1];
837 char* name;
838 int namelen;
839 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
840 return NULL;
842 if (!_getcode(name, namelen, &code)) {
843 char fmt[] = "undefined character name '%s'";
844 char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
845 sprintf(buf, fmt, name);
846 PyErr_SetString(PyExc_KeyError, buf);
847 PyMem_FREE(buf);
848 return NULL;
851 str[0] = (Py_UNICODE) code;
852 return PyUnicode_FromUnicode(str, 1);
855 /* XXX Add doc strings. */
857 static PyMethodDef unicodedata_functions[] = {
858 {"decimal", unicodedata_decimal, METH_VARARGS},
859 {"digit", unicodedata_digit, METH_VARARGS},
860 {"numeric", unicodedata_numeric, METH_VARARGS},
861 {"category", unicodedata_category, METH_VARARGS},
862 {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
863 {"combining", unicodedata_combining, METH_VARARGS},
864 {"mirrored", unicodedata_mirrored, METH_VARARGS},
865 {"decomposition",unicodedata_decomposition, METH_VARARGS},
866 {"name", unicodedata_name, METH_VARARGS},
867 {"lookup", unicodedata_lookup, METH_VARARGS},
868 {"normalize", unicodedata_normalize, METH_VARARGS},
869 {NULL, NULL} /* sentinel */
872 PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
874 PyMODINIT_FUNC
875 initunicodedata(void)
877 PyObject *m, *v;
879 m = Py_InitModule3(
880 "unicodedata", unicodedata_functions, unicodedata_docstring);
881 if (!m)
882 return;
884 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
886 /* Export C API */
887 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
888 if (v != NULL)
889 PyModule_AddObject(m, "ucnhash_CAPI", v);
893 Local variables:
894 c-basic-offset: 4
895 indent-tabs-mode: nil
896 End: