- Got rid of newmodule.c
[python/dscho.git] / Modules / unicodedata.c
blob966c8ba4f1219688d23e6b359322ed66954af513
1 /* ------------------------------------------------------------------------
3 unicodedata -- Provides access to the Unicode 3.0 data base.
5 Data was extracted from the Unicode 3.0 UnicodeData.txt file.
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Copyright (c) Corporation for National Research Initiatives.
12 ------------------------------------------------------------------------ */
14 #include "Python.h"
15 #include "ucnhash.h"
17 /* character properties */
/* Property record for a single character.  _getrecord() hands out
   pointers to entries of this type. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
28 /* data file generated by Tools/unicode/makeunicodedata.py */
29 #include "unicodedata_db.h"
31 static const _PyUnicode_DatabaseRecord*
32 _getrecord(PyUnicodeObject* v)
34 int code;
35 int index;
37 code = (int) *PyUnicode_AS_UNICODE(v);
39 if (code < 0 || code >= 65536)
40 index = 0;
41 else {
42 index = index1[(code>>SHIFT)];
43 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
46 return &_PyUnicode_Database_Records[index];
49 /* --- Module API --------------------------------------------------------- */
51 static PyObject *
52 unicodedata_decimal(PyObject *self, PyObject *args)
54 PyUnicodeObject *v;
55 PyObject *defobj = NULL;
56 long rc;
58 if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
59 return NULL;
60 if (PyUnicode_GET_SIZE(v) != 1) {
61 PyErr_SetString(PyExc_TypeError,
62 "need a single Unicode character as parameter");
63 return NULL;
65 rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
66 if (rc < 0) {
67 if (defobj == NULL) {
68 PyErr_SetString(PyExc_ValueError,
69 "not a decimal");
70 return NULL;
72 else {
73 Py_INCREF(defobj);
74 return defobj;
77 return PyInt_FromLong(rc);
80 static PyObject *
81 unicodedata_digit(PyObject *self, PyObject *args)
83 PyUnicodeObject *v;
84 PyObject *defobj = NULL;
85 long rc;
87 if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
88 return NULL;
89 if (PyUnicode_GET_SIZE(v) != 1) {
90 PyErr_SetString(PyExc_TypeError,
91 "need a single Unicode character as parameter");
92 return NULL;
94 rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
95 if (rc < 0) {
96 if (defobj == NULL) {
97 PyErr_SetString(PyExc_ValueError, "not a digit");
98 return NULL;
100 else {
101 Py_INCREF(defobj);
102 return defobj;
105 return PyInt_FromLong(rc);
108 static PyObject *
109 unicodedata_numeric(PyObject *self, PyObject *args)
111 PyUnicodeObject *v;
112 PyObject *defobj = NULL;
113 double rc;
115 if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
116 return NULL;
117 if (PyUnicode_GET_SIZE(v) != 1) {
118 PyErr_SetString(PyExc_TypeError,
119 "need a single Unicode character as parameter");
120 return NULL;
122 rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
123 if (rc < 0) {
124 if (defobj == NULL) {
125 PyErr_SetString(PyExc_ValueError, "not a numeric character");
126 return NULL;
128 else {
129 Py_INCREF(defobj);
130 return defobj;
133 return PyFloat_FromDouble(rc);
136 static PyObject *
137 unicodedata_category(PyObject *self, PyObject *args)
139 PyUnicodeObject *v;
140 int index;
142 if (!PyArg_ParseTuple(args, "O!:category",
143 &PyUnicode_Type, &v))
144 return NULL;
145 if (PyUnicode_GET_SIZE(v) != 1) {
146 PyErr_SetString(PyExc_TypeError,
147 "need a single Unicode character as parameter");
148 return NULL;
150 index = (int) _getrecord(v)->category;
151 return PyString_FromString(_PyUnicode_CategoryNames[index]);
154 static PyObject *
155 unicodedata_bidirectional(PyObject *self, PyObject *args)
157 PyUnicodeObject *v;
158 int index;
160 if (!PyArg_ParseTuple(args, "O!:bidirectional",
161 &PyUnicode_Type, &v))
162 return NULL;
163 if (PyUnicode_GET_SIZE(v) != 1) {
164 PyErr_SetString(PyExc_TypeError,
165 "need a single Unicode character as parameter");
166 return NULL;
168 index = (int) _getrecord(v)->bidirectional;
169 return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
172 static PyObject *
173 unicodedata_combining(PyObject *self, PyObject *args)
175 PyUnicodeObject *v;
177 if (!PyArg_ParseTuple(args, "O!:combining",
178 &PyUnicode_Type, &v))
179 return NULL;
180 if (PyUnicode_GET_SIZE(v) != 1) {
181 PyErr_SetString(PyExc_TypeError,
182 "need a single Unicode character as parameter");
183 return NULL;
185 return PyInt_FromLong((int) _getrecord(v)->combining);
188 static PyObject *
189 unicodedata_mirrored(PyObject *self, PyObject *args)
191 PyUnicodeObject *v;
193 if (!PyArg_ParseTuple(args, "O!:mirrored",
194 &PyUnicode_Type, &v))
195 return NULL;
196 if (PyUnicode_GET_SIZE(v) != 1) {
197 PyErr_SetString(PyExc_TypeError,
198 "need a single Unicode character as parameter");
199 return NULL;
201 return PyInt_FromLong((int) _getrecord(v)->mirrored);
204 static PyObject *
205 unicodedata_decomposition(PyObject *self, PyObject *args)
207 PyUnicodeObject *v;
208 char decomp[256];
209 int code, index, count, i;
211 if (!PyArg_ParseTuple(args, "O!:decomposition",
212 &PyUnicode_Type, &v))
213 return NULL;
214 if (PyUnicode_GET_SIZE(v) != 1) {
215 PyErr_SetString(PyExc_TypeError,
216 "need a single Unicode character as parameter");
217 return NULL;
220 code = (int) *PyUnicode_AS_UNICODE(v);
222 if (code < 0 || code >= 65536)
223 index = 0;
224 else {
225 index = decomp_index1[(code>>DECOMP_SHIFT)];
226 index = decomp_index2[(index<<DECOMP_SHIFT)+
227 (code&((1<<DECOMP_SHIFT)-1))];
230 /* high byte is number of hex bytes (usually one or two), low byte
231 is prefix code (from*/
232 count = decomp_data[index] >> 8;
234 /* XXX: could allocate the PyString up front instead
235 (strlen(prefix) + 5 * count + 1 bytes) */
237 /* copy prefix */
238 i = strlen(decomp_prefix[decomp_data[index] & 255]);
239 memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
241 while (count-- > 0) {
242 if (i)
243 decomp[i++] = ' ';
244 assert((size_t)i < sizeof(decomp));
245 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
246 decomp_data[++index]);
247 i += strlen(decomp + i);
250 decomp[i] = '\0';
252 return PyString_FromString(decomp);
255 /* -------------------------------------------------------------------- */
256 /* unicode character name tables */
258 /* data file generated by Tools/unicode/makeunicodedata.py */
259 #include "unicodename_db.h"
261 /* -------------------------------------------------------------------- */
262 /* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and folded in as h = h * scale + byte.
   Whenever bits 24..31 of the running value are non-zero they are
   XOR-ed into the low byte and the value is reduced to 24 bits, so the
   result always fits in 24 bits.  Returns 0 for an empty string. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* toupper() is only defined for unsigned char values and EOF;
           cast first so bytes >= 0x80 are safe where char is signed */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
279 static int
280 _getucname(Py_UCS4 code, char* buffer, int buflen)
282 int offset;
283 int i;
284 int word;
285 unsigned char* w;
287 if (code >= 65536)
288 return 0;
290 /* get offset into phrasebook */
291 offset = phrasebook_offset1[(code>>phrasebook_shift)];
292 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
293 (code&((1<<phrasebook_shift)-1))];
294 if (!offset)
295 return 0;
297 i = 0;
299 for (;;) {
300 /* get word index */
301 word = phrasebook[offset] - phrasebook_short;
302 if (word >= 0) {
303 word = (word << 8) + phrasebook[offset+1];
304 offset += 2;
305 } else
306 word = phrasebook[offset++];
307 if (i) {
308 if (i > buflen)
309 return 0; /* buffer overflow */
310 buffer[i++] = ' ';
312 /* copy word string from lexicon. the last character in the
313 word has bit 7 set. the last word in a string ends with
314 0x80 */
315 w = lexicon + lexicon_offset[word];
316 while (*w < 128) {
317 if (i >= buflen)
318 return 0; /* buffer overflow */
319 buffer[i++] = *w++;
321 if (i >= buflen)
322 return 0; /* buffer overflow */
323 buffer[i++] = *w & 127;
324 if (*w == 128)
325 break; /* end of word */
328 return 1;
331 static int
332 _cmpname(int code, const char* name, int namelen)
334 /* check if code corresponds to the given name */
335 int i;
336 char buffer[NAME_MAXLEN];
337 if (!_getucname(code, buffer, sizeof(buffer)))
338 return 0;
339 for (i = 0; i < namelen; i++) {
340 if (toupper(name[i]) != buffer[i])
341 return 0;
343 return buffer[namelen] == '\0';
346 static int
347 _getcode(const char* name, int namelen, Py_UCS4* code)
349 unsigned int h, v;
350 unsigned int mask = code_size-1;
351 unsigned int i, incr;
353 /* the following is the same as python's dictionary lookup, with
354 only minor changes. see the makeunicodedata script for more
355 details */
357 h = (unsigned int) _gethash(name, namelen, code_magic);
358 i = (~h) & mask;
359 v = code_hash[i];
360 if (!v)
361 return 0;
362 if (_cmpname(v, name, namelen)) {
363 *code = v;
364 return 1;
366 incr = (h ^ (h >> 3)) & mask;
367 if (!incr)
368 incr = mask;
369 for (;;) {
370 i = (i + incr) & mask;
371 v = code_hash[i];
372 if (!v)
373 return 0;
374 if (_cmpname(v, name, namelen)) {
375 *code = v;
376 return 1;
378 incr = incr << 1;
379 if (incr > mask)
380 incr = incr ^ code_poly;
384 static const _PyUnicode_Name_CAPI hashAPI =
386 sizeof(_PyUnicode_Name_CAPI),
387 _getucname,
388 _getcode
391 /* -------------------------------------------------------------------- */
392 /* Python bindings */
394 static PyObject *
395 unicodedata_name(PyObject* self, PyObject* args)
397 char name[NAME_MAXLEN];
399 PyUnicodeObject* v;
400 PyObject* defobj = NULL;
401 if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
402 return NULL;
404 if (PyUnicode_GET_SIZE(v) != 1) {
405 PyErr_SetString(PyExc_TypeError,
406 "need a single Unicode character as parameter");
407 return NULL;
410 if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
411 name, sizeof(name))) {
412 if (defobj == NULL) {
413 PyErr_SetString(PyExc_ValueError, "no such name");
414 return NULL;
416 else {
417 Py_INCREF(defobj);
418 return defobj;
422 return Py_BuildValue("s", name);
425 static PyObject *
426 unicodedata_lookup(PyObject* self, PyObject* args)
428 Py_UCS4 code;
429 Py_UNICODE str[1];
431 char* name;
432 int namelen;
433 if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
434 return NULL;
436 if (!_getcode(name, namelen, &code)) {
437 PyErr_SetString(PyExc_KeyError, "undefined character name");
438 return NULL;
441 str[0] = (Py_UNICODE) code;
442 return PyUnicode_FromUnicode(str, 1);
445 /* XXX Add doc strings. */
447 static PyMethodDef unicodedata_functions[] = {
448 {"decimal", unicodedata_decimal, METH_VARARGS},
449 {"digit", unicodedata_digit, METH_VARARGS},
450 {"numeric", unicodedata_numeric, METH_VARARGS},
451 {"category", unicodedata_category, METH_VARARGS},
452 {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
453 {"combining", unicodedata_combining, METH_VARARGS},
454 {"mirrored", unicodedata_mirrored, METH_VARARGS},
455 {"decomposition",unicodedata_decomposition, METH_VARARGS},
456 {"name", unicodedata_name, METH_VARARGS},
457 {"lookup", unicodedata_lookup, METH_VARARGS},
458 {NULL, NULL} /* sentinel */
461 PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
463 DL_EXPORT(void)
464 initunicodedata(void)
466 PyObject *m, *v;
468 m = Py_InitModule3(
469 "unicodedata", unicodedata_functions, unicodedata_docstring);
470 if (!m)
471 return;
473 /* Export C API */
474 v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
475 if (v != NULL)
476 PyModule_AddObject(m, "ucnhash_CAPI", v);