1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 3.0 data base.
5 Data was extracted from the Unicode 3.0 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Copyright (c) Corporation for National Research Initiatives.
12 ------------------------------------------------------------------------ */
17 /* character properties */
20 const unsigned char category
; /* index into
21 _PyUnicode_CategoryNames */
22 const unsigned char combining
; /* combining class value 0 - 255 */
23 const unsigned char bidirectional
; /* index into
24 _PyUnicode_BidirectionalNames */
25 const unsigned char mirrored
; /* true if mirrored in bidir mode */
26 } _PyUnicode_DatabaseRecord
;
28 /* data file generated by Tools/unicode/makeunicodedata.py */
29 #include "unicodedata_db.h"
31 static const _PyUnicode_DatabaseRecord
*
32 _getrecord(PyUnicodeObject
* v
)
37 code
= (int) *PyUnicode_AS_UNICODE(v
);
39 if (code
< 0 || code
>= 65536)
42 index
= index1
[(code
>>SHIFT
)];
43 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
46 return &_PyUnicode_Database_Records
[index
];
49 /* --- Module API --------------------------------------------------------- */
52 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
55 PyObject
*defobj
= NULL
;
58 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
60 if (PyUnicode_GET_SIZE(v
) != 1) {
61 PyErr_SetString(PyExc_TypeError
,
62 "need a single Unicode character as parameter");
65 rc
= Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v
));
68 PyErr_SetString(PyExc_ValueError
,
77 return PyInt_FromLong(rc
);
81 unicodedata_digit(PyObject
*self
, PyObject
*args
)
84 PyObject
*defobj
= NULL
;
87 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
89 if (PyUnicode_GET_SIZE(v
) != 1) {
90 PyErr_SetString(PyExc_TypeError
,
91 "need a single Unicode character as parameter");
94 rc
= Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v
));
97 PyErr_SetString(PyExc_ValueError
, "not a digit");
105 return PyInt_FromLong(rc
);
109 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
112 PyObject
*defobj
= NULL
;
115 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
117 if (PyUnicode_GET_SIZE(v
) != 1) {
118 PyErr_SetString(PyExc_TypeError
,
119 "need a single Unicode character as parameter");
122 rc
= Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v
));
124 if (defobj
== NULL
) {
125 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
133 return PyFloat_FromDouble(rc
);
137 unicodedata_category(PyObject
*self
, PyObject
*args
)
142 if (!PyArg_ParseTuple(args
, "O!:category",
143 &PyUnicode_Type
, &v
))
145 if (PyUnicode_GET_SIZE(v
) != 1) {
146 PyErr_SetString(PyExc_TypeError
,
147 "need a single Unicode character as parameter");
150 index
= (int) _getrecord(v
)->category
;
151 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
155 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
160 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
161 &PyUnicode_Type
, &v
))
163 if (PyUnicode_GET_SIZE(v
) != 1) {
164 PyErr_SetString(PyExc_TypeError
,
165 "need a single Unicode character as parameter");
168 index
= (int) _getrecord(v
)->bidirectional
;
169 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
173 unicodedata_combining(PyObject
*self
, PyObject
*args
)
177 if (!PyArg_ParseTuple(args
, "O!:combining",
178 &PyUnicode_Type
, &v
))
180 if (PyUnicode_GET_SIZE(v
) != 1) {
181 PyErr_SetString(PyExc_TypeError
,
182 "need a single Unicode character as parameter");
185 return PyInt_FromLong((int) _getrecord(v
)->combining
);
189 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
193 if (!PyArg_ParseTuple(args
, "O!:mirrored",
194 &PyUnicode_Type
, &v
))
196 if (PyUnicode_GET_SIZE(v
) != 1) {
197 PyErr_SetString(PyExc_TypeError
,
198 "need a single Unicode character as parameter");
201 return PyInt_FromLong((int) _getrecord(v
)->mirrored
);
205 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
209 int code
, index
, count
, i
;
211 if (!PyArg_ParseTuple(args
, "O!:decomposition",
212 &PyUnicode_Type
, &v
))
214 if (PyUnicode_GET_SIZE(v
) != 1) {
215 PyErr_SetString(PyExc_TypeError
,
216 "need a single Unicode character as parameter");
220 code
= (int) *PyUnicode_AS_UNICODE(v
);
222 if (code
< 0 || code
>= 65536)
225 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
226 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
227 (code
&((1<<DECOMP_SHIFT
)-1))];
230 /* high byte is the number of hex values that follow (usually one or
231 two), low byte is the prefix code (an index into decomp_prefix) */
232 count
= decomp_data
[index
] >> 8;
234 /* XXX: could allocate the PyString up front instead
235 (strlen(prefix) + 5 * count + 1 bytes) */
238 i
= strlen(decomp_prefix
[decomp_data
[index
] & 255]);
239 memcpy(decomp
, decomp_prefix
[decomp_data
[index
] & 255], i
);
241 while (count
-- > 0) {
244 sprintf(decomp
+ i
, "%04X", decomp_data
[++index
]);
245 i
+= strlen(decomp
+ i
);
250 return PyString_FromString(decomp
);
253 /* -------------------------------------------------------------------- */
254 /* unicode character name tables */
256 /* data file generated by Tools/unicode/makeunicodedata.py */
257 #include "unicodename_db.h"
259 /* -------------------------------------------------------------------- */
260 /* database code (cut and pasted from the unidb package) */
263 _gethash(const char *s
, int len
, int scale
)
268 for (i
= 0; i
< len
; i
++) {
269 h
= (h
* scale
) + (unsigned char) toupper(s
[i
]);
272 h
= (h
^ ((ix
>>24) & 0xff)) & 0x00ffffff;
278 _getname(Py_UCS4 code
, char* buffer
, int buflen
)
285 if (code
< 0 || code
>= 65536)
288 /* get offset into phrasebook */
289 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
290 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
291 (code
&((1<<phrasebook_shift
)-1))];
299 word
= phrasebook
[offset
] - phrasebook_short
;
301 word
= (word
<< 8) + phrasebook
[offset
+1];
304 word
= phrasebook
[offset
++];
307 return 0; /* buffer overflow */
310 /* copy word string from lexicon. the last character in the
311 word has bit 7 set. the last word in a string ends with 0x80 */
313 w
= lexicon
+ lexicon_offset
[word
];
316 return 0; /* buffer overflow */
320 return 0; /* buffer overflow */
321 buffer
[i
++] = *w
& 127;
323 break; /* end of word */
330 _cmpname(int code
, const char* name
, int namelen
)
332 /* check if code corresponds to the given name */
334 char buffer
[NAME_MAXLEN
];
335 if (!_getname(code
, buffer
, sizeof(buffer
)))
337 for (i
= 0; i
< namelen
; i
++) {
338 if (toupper(name
[i
]) != buffer
[i
])
341 return buffer
[namelen
] == '\0';
345 _getcode(const char* name
, int namelen
, Py_UCS4
* code
)
348 unsigned int mask
= code_size
-1;
349 unsigned int i
, incr
;
351 /* the following is the same as Python's dictionary lookup, with
352 only minor changes. see the makeunicodedata script for more details */
355 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
360 if (_cmpname(v
, name
, namelen
)) {
364 incr
= (h
^ (h
>> 3)) & mask
;
368 i
= (i
+ incr
) & mask
;
372 if (_cmpname(v
, name
, namelen
)) {
378 incr
= incr
^ code_poly
;
382 static const _PyUnicode_Name_CAPI hashAPI
=
384 sizeof(_PyUnicode_Name_CAPI
),
389 /* -------------------------------------------------------------------- */
390 /* Python bindings */
393 unicodedata_name(PyObject
* self
, PyObject
* args
)
395 char name
[NAME_MAXLEN
];
398 PyObject
* defobj
= NULL
;
399 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
402 if (PyUnicode_GET_SIZE(v
) != 1) {
403 PyErr_SetString(PyExc_TypeError
,
404 "need a single Unicode character as parameter");
408 if (!_getname((Py_UCS4
) *PyUnicode_AS_UNICODE(v
),
409 name
, sizeof(name
))) {
410 if (defobj
== NULL
) {
411 PyErr_SetString(PyExc_ValueError
, "no such name");
420 return Py_BuildValue("s", name
);
424 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
431 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
434 if (!_getcode(name
, namelen
, &code
)) {
435 PyErr_SetString(PyExc_KeyError
, "undefined character name");
439 str
[0] = (Py_UNICODE
) code
;
440 return PyUnicode_FromUnicode(str
, 1);
443 /* XXX Add doc strings. */
445 static PyMethodDef unicodedata_functions
[] = {
446 {"decimal", unicodedata_decimal
, METH_VARARGS
},
447 {"digit", unicodedata_digit
, METH_VARARGS
},
448 {"numeric", unicodedata_numeric
, METH_VARARGS
},
449 {"category", unicodedata_category
, METH_VARARGS
},
450 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
},
451 {"combining", unicodedata_combining
, METH_VARARGS
},
452 {"mirrored", unicodedata_mirrored
, METH_VARARGS
},
453 {"decomposition",unicodedata_decomposition
, METH_VARARGS
},
454 {"name", unicodedata_name
, METH_VARARGS
},
455 {"lookup", unicodedata_lookup
, METH_VARARGS
},
456 {NULL
, NULL
} /* sentinel */
459 static char *unicodedata_docstring
= "unicode character database";
462 initunicodedata(void)
467 "unicodedata", unicodedata_functions
,
468 unicodedata_docstring
, NULL
, PYTHON_API_VERSION
);
472 d
= PyModule_GetDict(m
);
477 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
478 PyDict_SetItemString(d
, "ucnhash_CAPI", v
);