1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 3.0 data base.
5 Data was extracted from the Unicode 3.0 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Copyright (c) Corporation for National Research Initiatives.
12 ------------------------------------------------------------------------ */
17 /* character properties */
/* One property record per distinct combination of character properties.
   Records are looked up through _getrecord(); the category and
   bidirectional members are indexes into name tables, not the names
   themselves. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;
28 /* data file generated by Tools/unicode/makeunicodedata.py */
29 #include "unicodedata_db.h"
/* Return a pointer to the property record for the first code unit of v.
   The code is split at SHIFT bits: the high part selects a block through
   index1, the low part selects the entry inside that block through
   index2, and the result indexes _PyUnicode_Database_Records.
   The returned pointer refers to the static table, so the caller does
   not own it. */
31 static const _PyUnicode_DatabaseRecord
*
32 _getrecord(PyUnicodeObject
* v
)
37 code
= (int) *PyUnicode_AS_UNICODE(v
);
/* NOTE(review): the statement executed when this range test succeeds is
   not visible here -- confirm which record out-of-range codes map to. */
39 if (code
< 0 || code
>= 65536)
42 index
= index1
[(code
>>SHIFT
)];
43 index
= index2
[(index
<<SHIFT
)+(code
&((1<<SHIFT
)-1))];
46 return &_PyUnicode_Database_Records
[index
];
49 /* --- Module API --------------------------------------------------------- */
/* decimal(unichr[, default])
   Parses a Unicode object plus an optional default object, requires the
   Unicode object to have length 1 (TypeError otherwise), converts its
   only character with Py_UNICODE_TODECIMAL and returns the result as a
   Python int.
   NOTE(review): the test on rc, the ValueError message text and the
   handling of the optional default are not visible here -- presumably
   they mirror unicodedata_numeric; confirm against the full file. */
52 unicodedata_decimal(PyObject
*self
, PyObject
*args
)
55 PyObject
*defobj
= NULL
;
58 if (!PyArg_ParseTuple(args
, "O!|O:decimal", &PyUnicode_Type
, &v
, &defobj
))
60 if (PyUnicode_GET_SIZE(v
) != 1) {
61 PyErr_SetString(PyExc_TypeError
,
62 "need a single Unicode character as parameter");
65 rc
= Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v
));
68 PyErr_SetString(PyExc_ValueError
,
77 return PyInt_FromLong(rc
);
81 unicodedata_digit(PyObject
*self
, PyObject
*args
)
84 PyObject
*defobj
= NULL
;
87 if (!PyArg_ParseTuple(args
, "O!|O:digit", &PyUnicode_Type
, &v
, &defobj
))
89 if (PyUnicode_GET_SIZE(v
) != 1) {
90 PyErr_SetString(PyExc_TypeError
,
91 "need a single Unicode character as parameter");
94 rc
= Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v
));
97 PyErr_SetString(PyExc_ValueError
, "not a digit");
105 return PyInt_FromLong(rc
);
109 unicodedata_numeric(PyObject
*self
, PyObject
*args
)
112 PyObject
*defobj
= NULL
;
115 if (!PyArg_ParseTuple(args
, "O!|O:numeric", &PyUnicode_Type
, &v
, &defobj
))
117 if (PyUnicode_GET_SIZE(v
) != 1) {
118 PyErr_SetString(PyExc_TypeError
,
119 "need a single Unicode character as parameter");
122 rc
= Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v
));
124 if (defobj
== NULL
) {
125 PyErr_SetString(PyExc_ValueError
, "not a numeric character");
133 return PyFloat_FromDouble(rc
);
137 unicodedata_category(PyObject
*self
, PyObject
*args
)
142 if (!PyArg_ParseTuple(args
, "O!:category",
143 &PyUnicode_Type
, &v
))
145 if (PyUnicode_GET_SIZE(v
) != 1) {
146 PyErr_SetString(PyExc_TypeError
,
147 "need a single Unicode character as parameter");
150 index
= (int) _getrecord(v
)->category
;
151 return PyString_FromString(_PyUnicode_CategoryNames
[index
]);
155 unicodedata_bidirectional(PyObject
*self
, PyObject
*args
)
160 if (!PyArg_ParseTuple(args
, "O!:bidirectional",
161 &PyUnicode_Type
, &v
))
163 if (PyUnicode_GET_SIZE(v
) != 1) {
164 PyErr_SetString(PyExc_TypeError
,
165 "need a single Unicode character as parameter");
168 index
= (int) _getrecord(v
)->bidirectional
;
169 return PyString_FromString(_PyUnicode_BidirectionalNames
[index
]);
173 unicodedata_combining(PyObject
*self
, PyObject
*args
)
177 if (!PyArg_ParseTuple(args
, "O!:combining",
178 &PyUnicode_Type
, &v
))
180 if (PyUnicode_GET_SIZE(v
) != 1) {
181 PyErr_SetString(PyExc_TypeError
,
182 "need a single Unicode character as parameter");
185 return PyInt_FromLong((int) _getrecord(v
)->combining
);
189 unicodedata_mirrored(PyObject
*self
, PyObject
*args
)
193 if (!PyArg_ParseTuple(args
, "O!:mirrored",
194 &PyUnicode_Type
, &v
))
196 if (PyUnicode_GET_SIZE(v
) != 1) {
197 PyErr_SetString(PyExc_TypeError
,
198 "need a single Unicode character as parameter");
201 return PyInt_FromLong((int) _getrecord(v
)->mirrored
);
/* decomposition(unichr)
   Parses a Unicode object, requires it to have length 1 (TypeError
   otherwise), locates the character's entry in decomp_data through the
   two-level decomp_index1/decomp_index2 tables, and formats that entry
   into the local buffer decomp: first the prefix string selected by the
   entry's low byte, then each following decomp_data value as four
   upper-case hex digits.  The buffer is returned as a Python string.
   NOTE(review): the declaration of decomp, any separator written between
   values and the final terminator are not visible here. */
205 unicodedata_decomposition(PyObject
*self
, PyObject
*args
)
209 int code
, index
, count
, i
;
211 if (!PyArg_ParseTuple(args
, "O!:decomposition",
212 &PyUnicode_Type
, &v
))
214 if (PyUnicode_GET_SIZE(v
) != 1) {
215 PyErr_SetString(PyExc_TypeError
,
216 "need a single Unicode character as parameter");
220 code
= (int) *PyUnicode_AS_UNICODE(v
);
/* NOTE(review): the statement executed when this range test succeeds is
   not visible here -- confirm which entry out-of-range codes use. */
222 if (code
< 0 || code
>= 65536)
225 index
= decomp_index1
[(code
>>DECOMP_SHIFT
)];
226 index
= decomp_index2
[(index
<<DECOMP_SHIFT
)+
227 (code
&((1<<DECOMP_SHIFT
)-1))];
230 /* high byte is the number of decomp_data values that follow this
231 entry (usually one or two), low byte is the prefix code (an index
into decomp_prefix) */
232 count
= decomp_data
[index
] >> 8;
234 /* XXX: could allocate the PyString up front instead
235 (strlen(prefix) + 5 * count + 1 bytes) */
/* start the output with the prefix string; i tracks the write offset */
238 i
= strlen(decomp_prefix
[decomp_data
[index
] & 255]);
239 memcpy(decomp
, decomp_prefix
[decomp_data
[index
] & 255], i
);
/* append each of the count following values, advancing i by the number
   of characters PyOS_snprintf actually wrote */
241 while (count
-- > 0) {
244 assert((size_t)i
< sizeof(decomp
));
245 PyOS_snprintf(decomp
+ i
, sizeof(decomp
) - i
, "%04X",
246 decomp_data
[++index
]);
247 i
+= strlen(decomp
+ i
);
252 return PyString_FromString(decomp
);
255 /* -------------------------------------------------------------------- */
256 /* unicode character name tables */
258 /* data file generated by Tools/unicode/makeunicodedata.py */
259 #include "unicodename_db.h"
261 /* -------------------------------------------------------------------- */
262 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive multiplicative hash of the first len bytes of s.

   s:     bytes to hash (need not be NUL-terminated)
   len:   number of bytes to read from s; nothing is read when len <= 0
   scale: multiplier applied to the running hash before each byte is
          added

   Whenever the running value spills into bits 24-31, that byte is
   folded back into the low byte and the value is cut to 24 bits.
   Returns the final running value (0 for an empty input). */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long h = 0;
    unsigned long ix;
    int i;

    for (i = 0; i < len; i++) {
        /* Convert to unsigned char BEFORE calling toupper(): passing a
           negative plain char to a <ctype.h> function is undefined. */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Write the name of character `code` into buffer, which holds buflen
   bytes.  The visible steps are: find the character's offset in
   phrasebook through the two-level phrasebook_offset1/phrasebook_offset2
   tables, decode word numbers from phrasebook (values are rebased by
   phrasebook_short and may be extended with a second byte), and copy
   each word from lexicon with the top bit of every byte masked off.
   Every visible `return 0` is a buffer-overflow exit.
   NOTE(review): the loop structure, the success return value and the
   termination of buffer are not visible here; callers in this file
   treat a zero return as "no name". */
280 _getucname(Py_UCS4 code
, char* buffer
, int buflen
)
290 /* get offset into phrasebook */
291 offset
= phrasebook_offset1
[(code
>>phrasebook_shift
)];
292 offset
= phrasebook_offset2
[(offset
<<phrasebook_shift
) +
293 (code
&((1<<phrasebook_shift
)-1))];
301 word
= phrasebook
[offset
] - phrasebook_short
;
303 word
= (word
<< 8) + phrasebook
[offset
+1];
306 word
= phrasebook
[offset
++];
309 return 0; /* buffer overflow */
312 /* copy word string from lexicon. the last character in the
313 word has bit 7 set. the last word in a string ends with
315 w
= lexicon
+ lexicon_offset
[word
];
318 return 0; /* buffer overflow */
322 return 0; /* buffer overflow */
323 buffer
[i
++] = *w
& 127;
325 break; /* end of word */
332 _cmpname(int code
, const char* name
, int namelen
)
334 /* check if code corresponds to the given name */
336 char buffer
[NAME_MAXLEN
];
337 if (!_getucname(code
, buffer
, sizeof(buffer
)))
339 for (i
= 0; i
< namelen
; i
++) {
340 if (toupper(name
[i
]) != buffer
[i
])
343 return buffer
[namelen
] == '\0';
/* Find the character whose name is `name` (namelen bytes) and report it
   through *code.  The name is hashed with _gethash() using code_magic;
   the table mask is code_size-1.  Candidates are confirmed with
   _cmpname().  On a miss the probe position advances by incr (masked to
   the table size); incr is first derived from the hash and is later
   combined with code_poly.
   NOTE(review): the statements that initialise i, fetch the candidate
   v, store *code and return are not visible here; unicodedata_lookup
   treats a zero return as "undefined character name". */
347 _getcode(const char* name
, int namelen
, Py_UCS4
* code
)
350 unsigned int mask
= code_size
-1;
351 unsigned int i
, incr
;
353 /* the following is the same as python's dictionary lookup, with
354 only minor changes. see the makeunicodedata script for more
357 h
= (unsigned int) _gethash(name
, namelen
, code_magic
);
362 if (_cmpname(v
, name
, namelen
)) {
366 incr
= (h
^ (h
>> 3)) & mask
;
370 i
= (i
+ incr
) & mask
;
374 if (_cmpname(v
, name
, namelen
)) {
380 incr
= incr
^ code_poly
;
/* Name-lookup API table; its address is wrapped in a CObject and
   published by initunicodedata() as the module attribute "ucnhash_CAPI".
   The first member is the size of the structure.
   NOTE(review): the remaining initializers are not visible here. */
384 static const _PyUnicode_Name_CAPI hashAPI
=
386 sizeof(_PyUnicode_Name_CAPI
),
391 /* -------------------------------------------------------------------- */
392 /* Python bindings */
395 unicodedata_name(PyObject
* self
, PyObject
* args
)
397 char name
[NAME_MAXLEN
];
400 PyObject
* defobj
= NULL
;
401 if (!PyArg_ParseTuple(args
, "O!|O:name", &PyUnicode_Type
, &v
, &defobj
))
404 if (PyUnicode_GET_SIZE(v
) != 1) {
405 PyErr_SetString(PyExc_TypeError
,
406 "need a single Unicode character as parameter");
410 if (!_getucname((Py_UCS4
) *PyUnicode_AS_UNICODE(v
),
411 name
, sizeof(name
))) {
412 if (defobj
== NULL
) {
413 PyErr_SetString(PyExc_ValueError
, "no such name");
422 return Py_BuildValue("s", name
);
426 unicodedata_lookup(PyObject
* self
, PyObject
* args
)
433 if (!PyArg_ParseTuple(args
, "s#:lookup", &name
, &namelen
))
436 if (!_getcode(name
, namelen
, &code
)) {
437 PyErr_SetString(PyExc_KeyError
, "undefined character name");
441 str
[0] = (Py_UNICODE
) code
;
442 return PyUnicode_FromUnicode(str
, 1);
445 /* XXX Add doc strings. */
447 static PyMethodDef unicodedata_functions
[] = {
448 {"decimal", unicodedata_decimal
, METH_VARARGS
},
449 {"digit", unicodedata_digit
, METH_VARARGS
},
450 {"numeric", unicodedata_numeric
, METH_VARARGS
},
451 {"category", unicodedata_category
, METH_VARARGS
},
452 {"bidirectional", unicodedata_bidirectional
, METH_VARARGS
},
453 {"combining", unicodedata_combining
, METH_VARARGS
},
454 {"mirrored", unicodedata_mirrored
, METH_VARARGS
},
455 {"decomposition",unicodedata_decomposition
, METH_VARARGS
},
456 {"name", unicodedata_name
, METH_VARARGS
},
457 {"lookup", unicodedata_lookup
, METH_VARARGS
},
458 {NULL
, NULL
} /* sentinel */
461 PyDoc_STRVAR(unicodedata_docstring
, "unicode character database");
/* Module initialisation.  The visible steps pass the module name
   "unicodedata", the method table and the doc string to the
   module-creation call, then wrap the address of hashAPI in a CObject
   (no destructor) and add it to the module as "ucnhash_CAPI".
   NOTE(review): the export/return-type line, the call that produces m
   and any NULL checks on m and v are not visible here -- confirm that v
   is checked before PyModule_AddObject() is called. */
464 initunicodedata(void)
469 "unicodedata", unicodedata_functions
, unicodedata_docstring
);
474 v
= PyCObject_FromVoidPtr((void *) &hashAPI
, NULL
);
476 PyModule_AddObject(m
, "ucnhash_CAPI", v
);