1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 3.2 data base.
5 Data was extracted from the Unicode 3.2 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
11 Copyright (c) Corporation for National Research Initiatives.
13 ------------------------------------------------------------------------ */
/* Record type holding four one-byte properties of a character; the two
   "index" fields select an entry in the name tables named in their
   comments.  (The opening line of the struct is not present in this
   listing.) */
18 /* character properties */
21 const unsigned char category
; /* index into
22 _PyUnicode_CategoryNames */
23 const unsigned char combining
; /* combining class value 0 - 255 */
24 const unsigned char bidirectional
; /* index into
25 _PyUnicode_BidirectionalNames */
26 const unsigned char mirrored
; /* true if mirrored in bidir mode */
27 } _PyUnicode_DatabaseRecord
;
29 /* data file generated by Tools/unicode/makeunicodedata.py */
30 #include "unicodedata_db.h"
/* Return a pointer to the database record for `code`.  The visible lines
   do a two-level table lookup: index1 is indexed by the high bits of the
   code (code >> SHIFT), and index2 by that result combined with the low
   SHIFT bits.  NOTE(review): original lines 34-38 are missing from this
   listing; presumably they declare `index` and handle out-of-range
   codes -- confirm against the full file. */
32 static const _PyUnicode_DatabaseRecord
*
33 _getrecord_ex(Py_UCS4 code
)
39 index
= index1
[(code
>>SHIFT
)];
40 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
43 return &_PyUnicode_Database_Records
[index
];
/* Convenience wrapper: look up the record for the first character of the
   unicode object `v` via _getrecord_ex().  Callers in this file check
   that `v` has length 1 before calling. */
46 static const _PyUnicode_DatabaseRecord
*
47 _getrecord(PyUnicodeObject
* v
)
49 return _getrecord_ex(*PyUnicode_AS_UNICODE(v
));
52 /* --- Module API --------------------------------------------------------- */
/* decimal(unichr[, default]): parses one unicode object and an optional
   default object, raises TypeError unless the string has length 1, then
   converts the character with Py_UNICODE_TODECIMAL and returns the result
   as a Python int.  A ValueError is raised on a failure path; the
   condition guarding it and the use of `defobj` are on lines missing
   from this listing. */
55 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
58 PyObject
*defobj
= NULL
;
61 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
63 if (PyUnicode_GET_SIZE(v
) != 1) {
64 PyErr_SetString(PyExc_TypeError
,
65 "need a single Unicode character as parameter");
68 rc
= Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v
));
71 PyErr_SetString(PyExc_ValueError
,
80 return PyInt_FromLong(rc
);
/* digit(unichr[, default]): same shape as unicodedata_decimal, but the
   character is converted with Py_UNICODE_TODIGIT.  Raises TypeError for
   strings whose length is not 1 and ValueError("not a digit") on a
   failure path whose guarding condition is not present in this listing;
   otherwise returns the value as a Python int. */
84 unicodedata_digit(PyObject
*self
, PyObject
*args
)
87 PyObject
*defobj
= NULL
;
90 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
92 if (PyUnicode_GET_SIZE(v
) != 1) {
93 PyErr_SetString(PyExc_TypeError
,
94 "need a single Unicode character as parameter");
97 rc
= Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v
));
100 PyErr_SetString(PyExc_ValueError
, "not a digit");
108 return PyInt_FromLong(rc
);
/* numeric(unichr[, default]): converts a single character with
   Py_UNICODE_TONUMERIC and returns the result as a Python float.
   Raises TypeError for strings whose length is not 1.  When the
   conversion fails and no default object was supplied (defobj == NULL),
   ValueError("not a numeric character") is raised; the test that detects
   the failure is on a line missing from this listing. */
112 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
115 PyObject
*defobj
= NULL
;
118 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
120 if (PyUnicode_GET_SIZE(v
) != 1) {
121 PyErr_SetString(PyExc_TypeError
,
122 "need a single Unicode character as parameter");
125 rc
= Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v
));
127 if (defobj
== NULL
) {
128 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
136 return PyFloat_FromDouble(rc
);
/* category(unichr): requires a unicode object of length 1 (TypeError
   otherwise), reads the `category` field of the character's database
   record and returns the matching entry of _PyUnicode_CategoryNames as a
   Python string. */
140 unicodedata_category(PyObject
*self
, PyObject
*args
)
145 if (!PyArg_ParseTuple(args
, "O!:category",
146 &PyUnicode_Type
, &v
))
148 if (PyUnicode_GET_SIZE(v
) != 1) {
149 PyErr_SetString(PyExc_TypeError
,
150 "need a single Unicode character as parameter");
153 index
= (int) _getrecord(v
)->category
;
154 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
/* bidirectional(unichr): requires a unicode object of length 1
   (TypeError otherwise), reads the `bidirectional` field of the
   character's database record and returns the matching entry of
   _PyUnicode_BidirectionalNames as a Python string. */
158 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
163 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
164 &PyUnicode_Type
, &v
))
166 if (PyUnicode_GET_SIZE(v
) != 1) {
167 PyErr_SetString(PyExc_TypeError
,
168 "need a single Unicode character as parameter");
171 index
= (int) _getrecord(v
)->bidirectional
;
172 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
/* combining(unichr): requires a unicode object of length 1 (TypeError
   otherwise) and returns the `combining` field of the character's
   database record as a Python int. */
176 unicodedata_combining(PyObject
*self
, PyObject
*args
)
180 if (!PyArg_ParseTuple(args
, "O!:combining",
181 &PyUnicode_Type
, &v
))
183 if (PyUnicode_GET_SIZE(v
) != 1) {
184 PyErr_SetString(PyExc_TypeError
,
185 "need a single Unicode character as parameter");
188 return PyInt_FromLong((int) _getrecord(v
)->combining
);
/* mirrored(unichr): requires a unicode object of length 1 (TypeError
   otherwise) and returns the `mirrored` field of the character's
   database record as a Python int. */
192 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
196 if (!PyArg_ParseTuple(args
, "O!:mirrored",
197 &PyUnicode_Type
, &v
))
199 if (PyUnicode_GET_SIZE(v
) != 1) {
200 PyErr_SetString(PyExc_TypeError
,
201 "need a single Unicode character as parameter");
204 return PyInt_FromLong((int) _getrecord(v
)->mirrored
);
/* decomposition(unichr): requires a unicode object of length 1
   (TypeError otherwise) and returns the character's decomposition
   mapping as a Python string.  The text is assembled in the local buffer
   `decomp`: first the prefix string selected by the low byte of the
   decomp_data entry, then `count` values formatted as "%04X".
   NOTE(review): the declaration of `decomp`, the out-of-range branch
   after the 0x110000 test and any separator written between the hex
   groups are on lines missing from this listing. */
208 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
212 int code
, index
, count
, i
;
214 if (!PyArg_ParseTuple(args
, "O!:decomposition",
215 &PyUnicode_Type
, &v
))
217 if (PyUnicode_GET_SIZE(v
) != 1) {
218 PyErr_SetString(PyExc_TypeError
,
219 "need a single Unicode character as parameter");
223 code
= (int) *PyUnicode_AS_UNICODE(v
);
225 if (code
< 0 || code
>= 0x110000)
228 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
229 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
230 (code
&((1<<DECOMP_SHIFT
)-1))];
233 /* high byte is number of hex bytes (usually one or two), low byte
234 is prefix code (index into decomp_prefix) */
235 count
= decomp_data
[index
] >> 8;
237 /* XXX: could allocate the PyString up front instead
238 (strlen(prefix) + 5 * count + 1 bytes) */
241 i
= strlen(decomp_prefix
[decomp_data
[index
] & 255]);
242 memcpy(decomp
, decomp_prefix
[decomp_data
[index
] & 255], i
);
244 while (count
-- > 0) {
247 assert((size_t)i
< sizeof(decomp
));
248 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
249 decomp_data
[++index
]);
250 i
+= strlen(decomp
+ i
);
255 return PyString_FromString(decomp
);
/* Look up the decomposition record for `code` and report it through the
   three out-parameters: *index is the position of the record header in
   decomp_data, *count is the header's high byte and *prefix its low
   byte.  NOTE(review): the body of the `code >= 0x110000` branch is on
   lines missing from this listing. */
259 get_decomp_record(Py_UCS4 code
, int *index
, int *prefix
, int *count
)
261 if (code
>= 0x110000) {
265 *index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
266 *index
= decomp_index2
[(*index
<<DECOMP_SHIFT
)+
267 (code
&((1<<DECOMP_SHIFT
)-1))];
270 /* high byte is number of hex bytes (usually one or two), low byte
271 is prefix code (index into decomp_prefix) */
272 *count
= decomp_data
[*index
] >> 8;
273 *prefix
= decomp_data
[*index
] & 255;
285 #define NCount (VCount*TCount)
286 #define SCount (LCount*NCount)
289 nfd_nfkd(PyObject
*input
, int k
)
292 Py_UNICODE
*i
, *end
, *o
;
293 /* Longest decomposition in Unicode 3.2: U+FDFA */
294 Py_UNICODE stack
[20];
295 int space
, stackptr
, isize
;
296 int index
, prefix
, count
;
297 unsigned char prev
, cur
;
300 isize
= PyUnicode_GET_SIZE(input
);
301 /* Overallocate atmost 10 characters. */
302 space
= (isize
> 10 ? 10 : isize
) + isize
;
303 result
= PyUnicode_FromUnicode(NULL
, space
);
306 i
= PyUnicode_AS_UNICODE(input
);
308 o
= PyUnicode_AS_UNICODE(result
);
311 stack
[stackptr
++] = *i
++;
313 Py_UNICODE code
= stack
[--stackptr
];
315 space
= PyString_GET_SIZE(result
) + 10;
316 if (PyUnicode_Resize(&result
, space
) == -1)
318 o
= PyUnicode_AS_UNICODE(result
) + space
- 10;
321 /* Hangul Decomposition. */
322 if (SBase
<= code
&& code
< (SBase
+SCount
)) {
323 int SIndex
= code
- SBase
;
324 int L
= LBase
+ SIndex
/ NCount
;
325 int V
= VBase
+ (SIndex
% NCount
) / TCount
;
326 int T
= TBase
+ SIndex
% TCount
;
336 /* Other decompoistions. */
337 get_decomp_record(code
, &index
, &prefix
, &count
);
339 /* Copy character if it is not decomposable, or has a
340 compatibility decomposition, but we do NFD. */
341 if (!count
|| (prefix
&& !k
)) {
346 /* Copy decomposition onto the stack, in reverse
349 code
= decomp_data
[index
+ (--count
)];
350 stack
[stackptr
++] = code
;
355 /* Drop overallocation. Cannot fail. */
356 PyUnicode_Resize(&result
, PyUnicode_GET_SIZE(result
) - space
);
358 /* Sort canonically. */
359 i
= PyUnicode_AS_UNICODE(result
);
360 prev
= _getrecord_ex(*i
)->combining
;
361 end
= i
+ PyUnicode_GET_SIZE(result
);
362 for (i
++; i
< end
; i
++) {
363 cur
= _getrecord_ex(*i
)->combining
;
364 if (prev
== 0 || cur
== 0 || prev
<= cur
) {
368 /* Non-canonical order. Need to switch *i with previous. */
371 Py_UNICODE tmp
= o
[1];
375 if (o
< PyUnicode_AS_UNICODE(result
))
377 prev
= _getrecord_ex(*o
)->combining
;
378 if (prev
== 0 || prev
<= cur
)
381 prev
= _getrecord_ex(*i
)->combining
;
/* Search the table `nfc` (terminated by an entry whose `start` is 0) for
   the range containing `code`.  When code <= start + count for an entry,
   the result is that entry's `index` plus the offset of `code` from
   `start`.  NOTE(review): the return statements for "not found" are on
   lines missing from this listing; callers in this file compare the
   result against -1. */
387 find_nfc_index(struct reindex
* nfc
, Py_UNICODE code
)
390 for (index
= 0; nfc
[index
].start
; index
++) {
391 int start
= nfc
[index
].start
;
394 if (code
<= start
+ nfc
[index
].count
) {
395 int delta
= code
- start
;
396 return nfc
[index
].index
+ delta
;
403 nfc_nfkc(PyObject
*input
, int k
)
406 Py_UNICODE
*i
, *i1
, *o
, *end
;
407 int f
,l
,index
,index1
,comb
;
409 Py_UNICODE
*skipped
[20];
412 result
= nfd_nfkd(input
, k
);
416 /* We are going to modify result in-place.
417 If nfd_nfkd is changed to sometimes return the input,
418 this code needs to be reviewed. */
419 assert(result
!= input
);
421 i
= PyUnicode_AS_UNICODE(result
);
422 end
= i
+ PyUnicode_GET_SIZE(result
);
423 o
= PyUnicode_AS_UNICODE(result
);
427 for (index
= 0; index
< cskipped
; index
++) {
428 if (skipped
[index
] == i
) {
429 /* *i character is skipped.
431 skipped
[index
] = skipped
[cskipped
-1];
434 goto again
; /* continue while */
437 /* Hangul Composition. We don't need to check for <LV,T>
438 pairs, since we always have decomposed data. */
439 if (LBase
<= *i
&& *i
< (LBase
+LCount
) &&
441 VBase
<= i
[1] && i
[1] <= (VBase
+VCount
)) {
443 LIndex
= i
[0] - LBase
;
444 VIndex
= i
[1] - VBase
;
445 code
= SBase
+ (LIndex
*VCount
+VIndex
)*TCount
;
448 TBase
<= *i
&& *i
<= (TBase
+TCount
)) {
456 f
= find_nfc_index(nfc_first
, *i
);
461 /* Find next unblocked character. */
465 int comb1
= _getrecord_ex(*i1
)->combining
;
466 if (comb1
&& comb
== comb1
) {
467 /* Character is blocked. */
471 l
= find_nfc_index(nfc_last
, *i1
);
472 /* *i1 cannot be combined with *i. If *i1
473 is a starter, we don't need to look further.
474 Otherwise, record the combining class. */
483 index
= f
*TOTAL_LAST
+ l
;
484 index1
= comp_index
[index
>> COMP_SHIFT
];
485 code
= comp_data
[(index1
<<COMP_SHIFT
)+
486 (index
&((1<<COMP_SHIFT
)-1))];
490 /* Replace the original character. */
492 /* Mark the second character unused. */
493 skipped
[cskipped
++] = i1
;
495 f
= find_nfc_index(nfc_first
, *i
);
502 PyUnicode_Resize(&result
, o
- PyUnicode_AS_UNICODE(result
));
507 unicodedata_normalize(PyObject
*self
, PyObject
*args
)
512 if(!PyArg_ParseTuple(args
, "sO!:normalized",
513 &form
, &PyUnicode_Type
, &input
))
516 if (strcmp(form
, "NFC") == 0)
517 return nfc_nfkc(input
, 0);
518 if (strcmp(form
, "NFKC") == 0)
519 return nfc_nfkc(input
, 1);
520 if (strcmp(form
, "NFD") == 0)
521 return nfd_nfkd(input
, 0);
522 if (strcmp(form
, "NFKD") == 0)
523 return nfd_nfkd(input
, 1);
524 PyErr_SetString(PyExc_ValueError
, "invalid normalization form");
528 /* -------------------------------------------------------------------- */
529 /* unicode character name tables */
531 /* data file generated by Tools/unicode/makeunicodedata.py */
532 #include "unicodename_db.h"
534 /* -------------------------------------------------------------------- */
535 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed in as h = h * scale + byte; whenever
   bits 24-31 of h become non-zero they are folded back into the low byte
   and h is reduced to 24 bits.  Must stay in sync with the hash used by
   the table generator. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        /* <ctype.h> functions require a value representable as unsigned
           char; plain char may be negative for bytes >= 0x80. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Table of Hangul syllable name parts, three strings per row.  Other
   code in this file indexes column 0 with L, column 1 with V and
   column 2 with T.  Only some rows are present in this listing. */
552 static char *hangul_syllables
[][3] = {
556 { "D", "YAE", "GS" },
557 { "DD", "EO", "N", },
559 { "M", "YEO", "NH" },
563 { "SS", "WAE", "LM" },
567 { "C", "WEO", "LP" },
/* Test whether `code` lies in one of the three CJK unified ideograph
   ranges listed below (Extension A, the main block, Extension B). */
584 is_unified_ideograph(Py_UCS4 code
)
587 (0x3400 <= code
&& code
<= 0x4DB5) || /* CJK Ideograph Extension A */
588 (0x4E00 <= code
&& code
<= 0x9FA5) || /* CJK Ideograph */
589 (0x20000 <= code
&& code
<= 0x2A6D6));/* CJK Ideograph Extension B */
/* Write the character name of `code` into `buffer` (capacity `buflen`).
   Three cases are visible: Hangul syllables are built from
   "HANGUL SYLLABLE " plus three parts taken from hangul_syllables; CJK
   unified ideographs are formatted as "CJK UNIFIED IDEOGRAPH-%X"; all
   other codes are decoded from the phrasebook/lexicon tables.  The
   function returns 0 when the buffer would overflow; _cmpname and
   unicodedata_name treat a zero result as "no name".  NOTE(review): many
   lines of the body, including the success return and the buffer-length
   checks for the first two cases, are missing from this listing. */
593 _getucname(Py_UCS4 code
, char* buffer
, int buflen
)
600 if (SBase
<= code
&& code
< SBase
+SCount
) {
601 /* Hangul syllable. */
602 int SIndex
= code
- SBase
;
603 int L
= SIndex
/ NCount
;
604 int V
= (SIndex
% NCount
) / TCount
;
605 int T
= SIndex
% TCount
;
608 /* Worst case: HANGUL SYLLABLE <10chars>. */
610 strcpy(buffer
, "HANGUL SYLLABLE ");
612 strcpy(buffer
, hangul_syllables
[L
][0]);
613 buffer
+= strlen(hangul_syllables
[L
][0]);
614 strcpy(buffer
, hangul_syllables
[V
][1]);
615 buffer
+= strlen(hangul_syllables
[V
][1]);
616 strcpy(buffer
, hangul_syllables
[T
][2]);
617 buffer
+= strlen(hangul_syllables
[T
][2]);
622 if (is_unified_ideograph(code
)) {
624 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
626 sprintf(buffer
, "CJK UNIFIED IDEOGRAPH-%X", code
);
630 if (code
>= 0x110000)
633 /* get offset into phrasebook */
634 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
635 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
636 (code
&((1<<phrasebook_shift
)-1))];
644 word
= phrasebook
[offset
] - phrasebook_short
;
646 word
= (word
<< 8) + phrasebook
[offset
+1];
649 word
= phrasebook
[offset
++];
652 return 0; /* buffer overflow */
655 /* copy word string from lexicon. the last character in the
656 word has bit 7 set. the last word in a string ends with
658 w
= lexicon
+ lexicon_offset
[word
];
661 return 0; /* buffer overflow */
665 return 0; /* buffer overflow */
666 buffer
[i
++] = *w
& 127;
668 break; /* end of word */
675 _cmpname(int code
, const char* name
, int namelen
)
677 /* check if code corresponds to the given name */
679 char buffer
[NAME_MAXLEN
];
680 if (!_getucname(code
, buffer
, sizeof(buffer
)))
682 for (i
= 0; i
< namelen
; i
++) {
683 if (toupper(name
[i
]) != buffer
[i
])
686 return buffer
[namelen
] == '\0';
/* Scan the first `count` rows of hangul_syllables, column `column`, for
   an entry that is a prefix of `str` (strncmp over the entry's length).
   Results are reported through *len and *pos.  NOTE(review): the
   statements that assign *len / *pos, and the rule for choosing between
   several matching entries, are on lines missing from this listing;
   _getcode compares the reported position against -1. */
690 find_syllable(const char *str
, int *len
, int *pos
, int count
, int column
)
694 for (i
= 0; i
< count
; i
++) {
695 char *s
= hangul_syllables
[i
][column
];
699 if (strncmp(str
, s
, len1
) == 0) {
711 _getcode(const char* name
, int namelen
, Py_UCS4
* code
)
714 unsigned int mask
= code_size
-1;
715 unsigned int i
, incr
;
717 /* Check for hangul syllables. */
718 if (strncmp(name
, "HANGUL SYLLABLE ", 16) == 0) {
720 const char *pos
= name
+ 16;
721 find_syllable(pos
, &len
, &L
, LCount
, 0);
723 find_syllable(pos
, &len
, &V
, VCount
, 1);
725 find_syllable(pos
, &len
, &T
, TCount
, 2);
727 if (V
!= -1 && V
!= -1 && T
!= -1 && pos
-name
== namelen
) {
728 *code
= SBase
+ (L
*VCount
+V
)*TCount
+ T
;
731 /* Otherwise, it's an illegal syllable name. */
735 /* Check for unified ideographs. */
736 if (strncmp(name
, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
737 /* Four or five hexdigits must follow. */
741 if (namelen
!= 4 && namelen
!= 5)
745 if (*name
>= '0' && *name
<= '9')
747 else if (*name
>= 'A' && *name
<= 'F')
748 v
+= *name
- 'A' + 10;
753 if (!is_unified_ideograph(v
))
759 /* the following is the same as python's dictionary lookup, with
760 only minor changes. see the makeunicodedata script for more
763 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
768 if (_cmpname(v
, name
, namelen
)) {
772 incr
= (h
^ (h
>> 3)) & mask
;
776 i
= (i
+ incr
) & mask
;
780 if (_cmpname(v
, name
, namelen
)) {
786 incr
= incr
^ code_poly
;
/* Function table exported by initunicodedata() as the module attribute
   "ucnhash_CAPI".  The first member is the size of the struct; the
   remaining initializers are on lines missing from this listing. */
790 static const _PyUnicode_Name_CAPI hashAPI
=
792 sizeof(_PyUnicode_Name_CAPI
),
797 /* -------------------------------------------------------------------- */
798 /* Python bindings */
/* name(unichr[, default]): requires a unicode object of length 1
   (TypeError otherwise) and returns the character's name as a Python
   string, generated by _getucname() into a local NAME_MAXLEN buffer.
   If the character has no name and no default object was given,
   ValueError("no such name") is raised.  The branch that handles a
   supplied default is on lines missing from this listing. */
801 unicodedata_name(PyObject
* self
, PyObject
* args
)
803 char name
[NAME_MAXLEN
];
806 PyObject
* defobj
= NULL
;
807 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
810 if (PyUnicode_GET_SIZE(v
) != 1) {
811 PyErr_SetString(PyExc_TypeError
,
812 "need a single Unicode character as parameter");
816 if (!_getucname((Py_UCS4
) *PyUnicode_AS_UNICODE(v
),
817 name
, sizeof(name
))) {
818 if (defobj
== NULL
) {
819 PyErr_SetString(PyExc_ValueError
, "no such name");
828 return Py_BuildValue("s", name
);
832 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
839 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
842 if (!_getcode(name
, namelen
, &code
)) {
843 char fmt
[] = "undefined character name '%s'";
844 char *buf
= PyMem_MALLOC(sizeof(fmt
) + namelen
);
845 sprintf(buf
, fmt
, name
);
846 PyErr_SetString(PyExc_KeyError
, buf
);
851 str
[0] = (Py_UNICODE
) code
;
852 return PyUnicode_FromUnicode(str
, 1);
855 /* XXX Add doc strings. */
/* Method table passed to the module initialiser: maps each
   Python-visible function name to its C implementation.  Every entry
   uses the METH_VARARGS calling convention; the list ends with a
   {NULL, NULL} sentinel. */
857 static PyMethodDef unicodedata_functions
[] = {
858 {"decimal", unicodedata_decimal
, METH_VARARGS
},
859 {"digit", unicodedata_digit
, METH_VARARGS
},
860 {"numeric", unicodedata_numeric
, METH_VARARGS
},
861 {"category", unicodedata_category
, METH_VARARGS
},
862 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
},
863 {"combining", unicodedata_combining
, METH_VARARGS
},
864 {"mirrored", unicodedata_mirrored
, METH_VARARGS
},
865 {"decomposition",unicodedata_decomposition
, METH_VARARGS
},
866 {"name", unicodedata_name
, METH_VARARGS
},
867 {"lookup", unicodedata_lookup
, METH_VARARGS
},
868 {"normalize", unicodedata_normalize
, METH_VARARGS
},
869 {NULL
, NULL
} /* sentinel */
872 PyDoc_STRVAR(unicodedata_docstring
, "unicode character database");
/* Module initialiser.  The visible lines create the "unicodedata" module
   from unicodedata_functions and unicodedata_docstring, add the string
   constant "unidata_version", and export the address of hashAPI wrapped
   in a CObject as the attribute "ucnhash_CAPI".  The call that creates
   the module and any NULL checks are on lines missing from this
   listing. */
875 initunicodedata(void)
880 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
884 PyModule_AddStringConstant(m
, "unidata_version", UNIDATA_VERSION
);
887 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
889 PyModule_AddObject(m
, "ucnhash_CAPI", v
);
895 indent-tabs-mode: nil