Modules/unicodedata.c

   1 /* ------------------------------------------------------------------------
   2
   3    unicodedata -- Provides access to the Unicode 3.0 data base.
   4
   5    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
   6
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9
  10    Copyright (c) Corporation for National Research Initiatives.
  11
  12    ------------------------------------------------------------------------ */
  13
  14 #include "Python.h"
  15 #include "ucnhash.h"
  16
  17 /* character properties */
  18
/* One entry of the character property table.  Entries live in
   _PyUnicode_Database_Records and are located through _getrecord();
   every field is a single byte. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;
  27
  28 /* data file generated by Tools/unicode/makeunicodedata.py */
  29 #include "unicodedata_db.h"
  30
  31 static const _PyUnicode_DatabaseRecord*
  32 _getrecord(PyUnicodeObject* v)
  33 {
  34     int code;
  35     int index;
  36
  37     code = (int) *PyUnicode_AS_UNICODE(v);
  38
  39     if (code < 0 || code >= 65536)
  40         index = 0;
  41     else {
  42         index = index1[(code>>SHIFT)];
  43         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  44     }
  45
  46     return &_PyUnicode_Database_Records[index];
  47 }
  48
  49 /* --- Module API --------------------------------------------------------- */
  50
  51 static PyObject *
  52 unicodedata_decimal(PyObject *self, PyObject *args)
  53 {
  54     PyUnicodeObject *v;
  55     PyObject *defobj = NULL;
  56     long rc;
  57
  58     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
  59         return NULL;
  60     if (PyUnicode_GET_SIZE(v) != 1) {
  61         PyErr_SetString(PyExc_TypeError,
  62                         "need a single Unicode character as parameter");
  63         return NULL;
  64     }
  65     rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
  66     if (rc < 0) {
  67         if (defobj == NULL) {
  68             PyErr_SetString(PyExc_ValueError,
  69                             "not a decimal");
  70             return NULL;
  71         }
  72         else {
  73             Py_INCREF(defobj);
  74             return defobj;
  75         }
  76     }
  77     return PyInt_FromLong(rc);
  78 }
  79
  80 static PyObject *
  81 unicodedata_digit(PyObject *self, PyObject *args)
  82 {
  83     PyUnicodeObject *v;
  84     PyObject *defobj = NULL;
  85     long rc;
  86
  87     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
  88         return NULL;
  89     if (PyUnicode_GET_SIZE(v) != 1) {
  90         PyErr_SetString(PyExc_TypeError,
  91                         "need a single Unicode character as parameter");
  92         return NULL;
  93     }
  94     rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
  95     if (rc < 0) {
  96         if (defobj == NULL) {
  97             PyErr_SetString(PyExc_ValueError, "not a digit");
  98             return NULL;
  99         }
 100         else {
 101             Py_INCREF(defobj);
 102             return defobj;
 103         }
 104     }
 105     return PyInt_FromLong(rc);
 106 }
 107
 108 static PyObject *
 109 unicodedata_numeric(PyObject *self, PyObject *args)
 110 {
 111     PyUnicodeObject *v;
 112     PyObject *defobj = NULL;
 113     double rc;
 114
 115     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 116         return NULL;
 117     if (PyUnicode_GET_SIZE(v) != 1) {
 118         PyErr_SetString(PyExc_TypeError,
 119                         "need a single Unicode character as parameter");
 120         return NULL;
 121     }
 122     rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
 123     if (rc < 0) {
 124         if (defobj == NULL) {
 125             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 126             return NULL;
 127         }
 128         else {
 129             Py_INCREF(defobj);
 130             return defobj;
 131         }
 132     }
 133     return PyFloat_FromDouble(rc);
 134 }
 135
 136 static PyObject *
 137 unicodedata_category(PyObject *self, PyObject *args)
 138 {
 139     PyUnicodeObject *v;
 140     int index;
 141
 142     if (!PyArg_ParseTuple(args, "O!:category",
 143                           &PyUnicode_Type, &v))
 144         return NULL;
 145     if (PyUnicode_GET_SIZE(v) != 1) {
 146         PyErr_SetString(PyExc_TypeError,
 147                         "need a single Unicode character as parameter");
 148         return NULL;
 149     }
 150     index = (int) _getrecord(v)->category;
 151     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 152 }
 153
 154 static PyObject *
 155 unicodedata_bidirectional(PyObject *self, PyObject *args)
 156 {
 157     PyUnicodeObject *v;
 158     int index;
 159
 160     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 161                           &PyUnicode_Type, &v))
 162         return NULL;
 163     if (PyUnicode_GET_SIZE(v) != 1) {
 164         PyErr_SetString(PyExc_TypeError,
 165                         "need a single Unicode character as parameter");
 166         return NULL;
 167     }
 168     index = (int) _getrecord(v)->bidirectional;
 169     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 170 }
 171
 172 static PyObject *
 173 unicodedata_combining(PyObject *self, PyObject *args)
 174 {
 175     PyUnicodeObject *v;
 176
 177     if (!PyArg_ParseTuple(args, "O!:combining",
 178                           &PyUnicode_Type, &v))
 179         return NULL;
 180     if (PyUnicode_GET_SIZE(v) != 1) {
 181         PyErr_SetString(PyExc_TypeError,
 182                         "need a single Unicode character as parameter");
 183         return NULL;
 184     }
 185     return PyInt_FromLong((int) _getrecord(v)->combining);
 186 }
 187
 188 static PyObject *
 189 unicodedata_mirrored(PyObject *self, PyObject *args)
 190 {
 191     PyUnicodeObject *v;
 192
 193     if (!PyArg_ParseTuple(args, "O!:mirrored",
 194                           &PyUnicode_Type, &v))
 195         return NULL;
 196     if (PyUnicode_GET_SIZE(v) != 1) {
 197         PyErr_SetString(PyExc_TypeError,
 198                         "need a single Unicode character as parameter");
 199         return NULL;
 200     }
 201     return PyInt_FromLong((int) _getrecord(v)->mirrored);
 202 }
 203
 204 static PyObject *
 205 unicodedata_decomposition(PyObject *self, PyObject *args)
 206 {
 207     PyUnicodeObject *v;
 208     char decomp[256];
 209     int code, index, count, i;
 210
 211     if (!PyArg_ParseTuple(args, "O!:decomposition",
 212                           &PyUnicode_Type, &v))
 213         return NULL;
 214     if (PyUnicode_GET_SIZE(v) != 1) {
 215         PyErr_SetString(PyExc_TypeError,
 216                         "need a single Unicode character as parameter");
 217         return NULL;
 218     }
 219
 220     code = (int) *PyUnicode_AS_UNICODE(v);
 221
 222     if (code < 0 || code >= 65536)
 223         index = 0;
 224     else {
 225         index = decomp_index1[(code>>DECOMP_SHIFT)];
 226         index = decomp_index2[(index<<DECOMP_SHIFT)+
 227                              (code&((1<<DECOMP_SHIFT)-1))];
 228     }
 229
 230     /* high byte is of hex bytes (usually one or two), low byte
 231        is prefix code (from*/
 232     count = decomp_data[index] >> 8;
 233
 234     /* XXX: could allocate the PyString up front instead
 235        (strlen(prefix) + 5 * count + 1 bytes) */
 236
 237     /* copy prefix */
 238     i = strlen(decomp_prefix[decomp_data[index] & 255]);
 239     memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
 240
 241     while (count-- > 0) {
 242         if (i)
 243             decomp[i++] = ' ';
 244         sprintf(decomp + i, "%04X", decomp_data[++index]);
 245         i += strlen(decomp + i);
 246     }
 247
 248     decomp[i] = '\0';
 249
 250     return PyString_FromString(decomp);
 251 }
 252
 253 /* -------------------------------------------------------------------- */
 254 /* unicode character name tables */
 255
 256 /* data file generated by Tools/unicode/makeunicodedata.py */
 257 #include "unicodename_db.h"
 258
 259 /* -------------------------------------------------------------------- */
 260 /* database code (cut and pasted from the unidb package) */
 261
 262 static unsigned long
 263 _gethash(const char *s, int len, int scale)
 264 {
 265     int i;
 266     unsigned long h = 0;
 267     unsigned long ix;
 268     for (i = 0; i < len; i++) {
 269         h = (h * scale) + (unsigned char) toupper(s[i]);
 270         ix = h & 0xff000000;
 271         if (ix)
 272             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 273     }
 274     return h;
 275 }
 276
 277 static int
 278 _getname(Py_UCS4 code, char* buffer, int buflen)
 279 {
 280     int offset;
 281     int i;
 282     int word;
 283     unsigned char* w;
 284
 285     if (code >= 65536)
 286         return 0;
 287
 288     /* get offset into phrasebook */
 289     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 290     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 291                                (code&((1<<phrasebook_shift)-1))];
 292     if (!offset)
 293         return 0;
 294
 295     i = 0;
 296
 297     for (;;) {
 298         /* get word index */
 299         word = phrasebook[offset] - phrasebook_short;
 300         if (word >= 0) {
 301             word = (word << 8) + phrasebook[offset+1];
 302             offset += 2;
 303         } else
 304             word = phrasebook[offset++];
 305         if (i) {
 306             if (i > buflen)
 307                 return 0; /* buffer overflow */
 308             buffer[i++] = ' ';
 309         }
 310         /* copy word string from lexicon.  the last character in the
 311            word has bit 7 set.  the last word in a string ends with
 312            0x80 */
 313         w = lexicon + lexicon_offset[word];
 314         while (*w < 128) {
 315             if (i >= buflen)
 316                 return 0; /* buffer overflow */
 317             buffer[i++] = *w++;
 318         }
 319         if (i >= buflen)
 320             return 0; /* buffer overflow */
 321         buffer[i++] = *w & 127;
 322         if (*w == 128)
 323             break; /* end of word */
 324     }
 325
 326     return 1;
 327 }
 328
 329 static int
 330 _cmpname(int code, const char* name, int namelen)
 331 {
 332     /* check if code corresponds to the given name */
 333     int i;
 334     char buffer[NAME_MAXLEN];
 335     if (!_getname(code, buffer, sizeof(buffer)))
 336         return 0;
 337     for (i = 0; i < namelen; i++) {
 338         if (toupper(name[i]) != buffer[i])
 339             return 0;
 340     }
 341     return buffer[namelen] == '\0';
 342 }
 343
 344 static int
 345 _getcode(const char* name, int namelen, Py_UCS4* code)
 346 {
 347     unsigned int h, v;
 348     unsigned int mask = code_size-1;
 349     unsigned int i, incr;
 350
 351     /* the following is the same as python's dictionary lookup, with
 352        only minor changes.  see the makeunicodedata script for more
 353        details */
 354
 355     h = (unsigned int) _gethash(name, namelen, code_magic);
 356     i = (~h) & mask;
 357     v = code_hash[i];
 358     if (!v)
 359         return 0;
 360     if (_cmpname(v, name, namelen)) {
 361         *code = v;
 362         return 1;
 363     }
 364     incr = (h ^ (h >> 3)) & mask;
 365     if (!incr)
 366         incr = mask;
 367     for (;;) {
 368         i = (i + incr) & mask;
 369         v = code_hash[i];
 370         if (!v)
 371             return 0;
 372         if (_cmpname(v, name, namelen)) {
 373             *code = v;
 374             return 1;
 375         }
 376         incr = incr << 1;
 377         if (incr > mask)
 378             incr = incr ^ code_poly;
 379     }
 380 }
 381
/* C-level name API: the structure's own size followed by the _getname
   and _getcode entry points.  The module init function publishes a
   pointer to it as "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getname,
    _getcode
};
 388
 389 /* -------------------------------------------------------------------- */
 390 /* Python bindings */
 391
 392 static PyObject *
 393 unicodedata_name(PyObject* self, PyObject* args)
 394 {
 395     char name[NAME_MAXLEN];
 396
 397     PyUnicodeObject* v;
 398     PyObject* defobj = NULL;
 399     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
 400         return NULL;
 401
 402     if (PyUnicode_GET_SIZE(v) != 1) {
 403         PyErr_SetString(PyExc_TypeError,
 404                         "need a single Unicode character as parameter");
 405         return NULL;
 406     }
 407
 408     if (!_getname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
 409                              name, sizeof(name))) {
 410         if (defobj == NULL) {
 411             PyErr_SetString(PyExc_ValueError, "no such name");
 412             return NULL;
 413         }
 414         else {
 415             Py_INCREF(defobj);
 416             return defobj;
 417         }
 418     }
 419
 420     return Py_BuildValue("s", name);
 421 }
 422
 423 static PyObject *
 424 unicodedata_lookup(PyObject* self, PyObject* args)
 425 {
 426     Py_UCS4 code;
 427     Py_UNICODE str[1];
 428
 429     char* name;
 430     int namelen;
 431     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
 432         return NULL;
 433
 434     if (!_getcode(name, namelen, &code)) {
 435         PyErr_SetString(PyExc_KeyError, "undefined character name");
 436         return NULL;
 437     }
 438
 439     str[0] = (Py_UNICODE) code;
 440     return PyUnicode_FromUnicode(str, 1);
 441 }
 442
 443 /* XXX Add doc strings. */
 444
/* Method table handed to Py_InitModule3(): maps each Python-level
   function name to its C implementation.  All entries use the
   METH_VARARGS calling convention and none carries a doc string. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS},
    {"digit", unicodedata_digit, METH_VARARGS},
    {"numeric", unicodedata_numeric, METH_VARARGS},
    {"category", unicodedata_category, METH_VARARGS},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining", unicodedata_combining, METH_VARARGS},
    {"mirrored", unicodedata_mirrored, METH_VARARGS},
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},
    {NULL, NULL}                /* sentinel */
};

/* module doc string, passed to Py_InitModule3() */
static char *unicodedata_docstring = "unicode character database";
 460
 461 DL_EXPORT(void)
 462 initunicodedata(void)
 463 {
 464     PyObject *m, *d, *v;
 465
 466     m = Py_InitModule3(
 467         "unicodedata", unicodedata_functions, unicodedata_docstring);
 468     if (!m)
 469         return;
 470
 471     d = PyModule_GetDict(m);
 472     if (!d)
 473         return;
 474
 475     /* Export C API */
 476     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
 477     if (v != NULL) {
 478         PyDict_SetItemString(d, "ucnhash_CAPI", v);
 479         Py_DECREF(v);
 480     }
 481 }