Modules/unicodedata.c

   1 /* ------------------------------------------------------------------------
   2
   3    unicodedata -- Provides access to the Unicode 3.0 data base.
   4
   5    Data was extracted from the Unicode 3.0 UnicodeData.txt file.
   6
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9
  10    Copyright (c) Corporation for National Research Initiatives.
  11
  12    ------------------------------------------------------------------------ */
  13
  14 #include "Python.h"
  15 #include "ucnhash.h"
  16
  17 /* character properties */
  18
/* One entry of the generated _PyUnicode_Database_Records table; _getrecord()
   returns a pointer to the entry that belongs to a given character. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;
  27
  28 /* data file generated by Tools/unicode/makeunicodedata.py */
  29 #include "unicodedata_db.h"
  30
  31 static const _PyUnicode_DatabaseRecord*
  32 _getrecord(PyUnicodeObject* v)
  33 {
  34     int code;
  35     int index;
  36
  37     code = (int) *PyUnicode_AS_UNICODE(v);
  38
  39     if (code < 0 || code >= 65536)
  40         index = 0;
  41     else {
  42         index = index1[(code>>SHIFT)];
  43         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  44     }
  45
  46     return &_PyUnicode_Database_Records[index];
  47 }
  48
  49 /* --- Module API --------------------------------------------------------- */
  50
  51 static PyObject *
  52 unicodedata_decimal(PyObject *self, PyObject *args)
  53 {
  54     PyUnicodeObject *v;
  55     PyObject *defobj = NULL;
  56     long rc;
  57
  58     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
  59         return NULL;
  60     if (PyUnicode_GET_SIZE(v) != 1) {
  61         PyErr_SetString(PyExc_TypeError,
  62                         "need a single Unicode character as parameter");
  63         return NULL;
  64     }
  65     rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
  66     if (rc < 0) {
  67         if (defobj == NULL) {
  68             PyErr_SetString(PyExc_ValueError,
  69                             "not a decimal");
  70             return NULL;
  71         }
  72         else {
  73             Py_INCREF(defobj);
  74             return defobj;
  75         }
  76     }
  77     return PyInt_FromLong(rc);
  78 }
  79
  80 static PyObject *
  81 unicodedata_digit(PyObject *self, PyObject *args)
  82 {
  83     PyUnicodeObject *v;
  84     PyObject *defobj = NULL;
  85     long rc;
  86
  87     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
  88         return NULL;
  89     if (PyUnicode_GET_SIZE(v) != 1) {
  90         PyErr_SetString(PyExc_TypeError,
  91                         "need a single Unicode character as parameter");
  92         return NULL;
  93     }
  94     rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
  95     if (rc < 0) {
  96         if (defobj == NULL) {
  97             PyErr_SetString(PyExc_ValueError, "not a digit");
  98             return NULL;
  99         }
 100         else {
 101             Py_INCREF(defobj);
 102             return defobj;
 103         }
 104     }
 105     return PyInt_FromLong(rc);
 106 }
 107
 108 static PyObject *
 109 unicodedata_numeric(PyObject *self, PyObject *args)
 110 {
 111     PyUnicodeObject *v;
 112     PyObject *defobj = NULL;
 113     double rc;
 114
 115     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 116         return NULL;
 117     if (PyUnicode_GET_SIZE(v) != 1) {
 118         PyErr_SetString(PyExc_TypeError,
 119                         "need a single Unicode character as parameter");
 120         return NULL;
 121     }
 122     rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
 123     if (rc < 0) {
 124         if (defobj == NULL) {
 125             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 126             return NULL;
 127         }
 128         else {
 129             Py_INCREF(defobj);
 130             return defobj;
 131         }
 132     }
 133     return PyFloat_FromDouble(rc);
 134 }
 135
 136 static PyObject *
 137 unicodedata_category(PyObject *self, PyObject *args)
 138 {
 139     PyUnicodeObject *v;
 140     int index;
 141
 142     if (!PyArg_ParseTuple(args, "O!:category",
 143                           &PyUnicode_Type, &v))
 144         return NULL;
 145     if (PyUnicode_GET_SIZE(v) != 1) {
 146         PyErr_SetString(PyExc_TypeError,
 147                         "need a single Unicode character as parameter");
 148         return NULL;
 149     }
 150     index = (int) _getrecord(v)->category;
 151     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 152 }
 153
 154 static PyObject *
 155 unicodedata_bidirectional(PyObject *self, PyObject *args)
 156 {
 157     PyUnicodeObject *v;
 158     int index;
 159
 160     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 161                           &PyUnicode_Type, &v))
 162         return NULL;
 163     if (PyUnicode_GET_SIZE(v) != 1) {
 164         PyErr_SetString(PyExc_TypeError,
 165                         "need a single Unicode character as parameter");
 166         return NULL;
 167     }
 168     index = (int) _getrecord(v)->bidirectional;
 169     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 170 }
 171
 172 static PyObject *
 173 unicodedata_combining(PyObject *self, PyObject *args)
 174 {
 175     PyUnicodeObject *v;
 176
 177     if (!PyArg_ParseTuple(args, "O!:combining",
 178                           &PyUnicode_Type, &v))
 179         return NULL;
 180     if (PyUnicode_GET_SIZE(v) != 1) {
 181         PyErr_SetString(PyExc_TypeError,
 182                         "need a single Unicode character as parameter");
 183         return NULL;
 184     }
 185     return PyInt_FromLong((int) _getrecord(v)->combining);
 186 }
 187
 188 static PyObject *
 189 unicodedata_mirrored(PyObject *self, PyObject *args)
 190 {
 191     PyUnicodeObject *v;
 192
 193     if (!PyArg_ParseTuple(args, "O!:mirrored",
 194                           &PyUnicode_Type, &v))
 195         return NULL;
 196     if (PyUnicode_GET_SIZE(v) != 1) {
 197         PyErr_SetString(PyExc_TypeError,
 198                         "need a single Unicode character as parameter");
 199         return NULL;
 200     }
 201     return PyInt_FromLong((int) _getrecord(v)->mirrored);
 202 }
 203
 204 static PyObject *
 205 unicodedata_decomposition(PyObject *self, PyObject *args)
 206 {
 207     PyUnicodeObject *v;
 208     char decomp[256];
 209     int code, index, count, i;
 210
 211     if (!PyArg_ParseTuple(args, "O!:decomposition",
 212                           &PyUnicode_Type, &v))
 213         return NULL;
 214     if (PyUnicode_GET_SIZE(v) != 1) {
 215         PyErr_SetString(PyExc_TypeError,
 216                         "need a single Unicode character as parameter");
 217         return NULL;
 218     }
 219
 220     code = (int) *PyUnicode_AS_UNICODE(v);
 221
 222     if (code < 0 || code >= 65536)
 223         index = 0;
 224     else {
 225         index = decomp_index1[(code>>DECOMP_SHIFT)];
 226         index = decomp_index2[(index<<DECOMP_SHIFT)+
 227                              (code&((1<<DECOMP_SHIFT)-1))];
 228     }
 229
 230     /* high byte is number of hex bytes (usually one or two), low byte
 231        is prefix code (from*/
 232     count = decomp_data[index] >> 8;
 233
 234     /* XXX: could allocate the PyString up front instead
 235        (strlen(prefix) + 5 * count + 1 bytes) */
 236
 237     /* copy prefix */
 238     i = strlen(decomp_prefix[decomp_data[index] & 255]);
 239     memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
 240
 241     while (count-- > 0) {
 242         if (i)
 243             decomp[i++] = ' ';
 244         assert((size_t)i < sizeof(decomp));
 245         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 246                       decomp_data[++index]);
 247         i += strlen(decomp + i);
 248     }
 249
 250     decomp[i] = '\0';
 251
 252     return PyString_FromString(decomp);
 253 }
 254
 255 /* -------------------------------------------------------------------- */
 256 /* unicode character name tables */
 257
 258 /* data file generated by Tools/unicode/makeunicodedata.py */
 259 #include "unicodename_db.h"
 260
 261 /* -------------------------------------------------------------------- */
 262 /* database code (cut and pasted from the unidb package) */
 263
 264 static unsigned long
 265 _gethash(const char *s, int len, int scale)
 266 {
 267     int i;
 268     unsigned long h = 0;
 269     unsigned long ix;
 270     for (i = 0; i < len; i++) {
 271         h = (h * scale) + (unsigned char) toupper(s[i]);
 272         ix = h & 0xff000000;
 273         if (ix)
 274             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 275     }
 276     return h;
 277 }
 278
 279 static int
 280 _getucname(Py_UCS4 code, char* buffer, int buflen)
 281 {
 282     int offset;
 283     int i;
 284     int word;
 285     unsigned char* w;
 286
 287     if (code >= 65536)
 288         return 0;
 289
 290     /* get offset into phrasebook */
 291     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 292     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 293                                (code&((1<<phrasebook_shift)-1))];
 294     if (!offset)
 295         return 0;
 296
 297     i = 0;
 298
 299     for (;;) {
 300         /* get word index */
 301         word = phrasebook[offset] - phrasebook_short;
 302         if (word >= 0) {
 303             word = (word << 8) + phrasebook[offset+1];
 304             offset += 2;
 305         } else
 306             word = phrasebook[offset++];
 307         if (i) {
 308             if (i > buflen)
 309                 return 0; /* buffer overflow */
 310             buffer[i++] = ' ';
 311         }
 312         /* copy word string from lexicon.  the last character in the
 313            word has bit 7 set.  the last word in a string ends with
 314            0x80 */
 315         w = lexicon + lexicon_offset[word];
 316         while (*w < 128) {
 317             if (i >= buflen)
 318                 return 0; /* buffer overflow */
 319             buffer[i++] = *w++;
 320         }
 321         if (i >= buflen)
 322             return 0; /* buffer overflow */
 323         buffer[i++] = *w & 127;
 324         if (*w == 128)
 325             break; /* end of word */
 326     }
 327
 328     return 1;
 329 }
 330
 331 static int
 332 _cmpname(int code, const char* name, int namelen)
 333 {
 334     /* check if code corresponds to the given name */
 335     int i;
 336     char buffer[NAME_MAXLEN];
 337     if (!_getucname(code, buffer, sizeof(buffer)))
 338         return 0;
 339     for (i = 0; i < namelen; i++) {
 340         if (toupper(name[i]) != buffer[i])
 341             return 0;
 342     }
 343     return buffer[namelen] == '\0';
 344 }
 345
 346 static int
 347 _getcode(const char* name, int namelen, Py_UCS4* code)
 348 {
 349     unsigned int h, v;
 350     unsigned int mask = code_size-1;
 351     unsigned int i, incr;
 352
 353     /* the following is the same as python's dictionary lookup, with
 354        only minor changes.  see the makeunicodedata script for more
 355        details */
 356
 357     h = (unsigned int) _gethash(name, namelen, code_magic);
 358     i = (~h) & mask;
 359     v = code_hash[i];
 360     if (!v)
 361         return 0;
 362     if (_cmpname(v, name, namelen)) {
 363         *code = v;
 364         return 1;
 365     }
 366     incr = (h ^ (h >> 3)) & mask;
 367     if (!incr)
 368         incr = mask;
 369     for (;;) {
 370         i = (i + incr) & mask;
 371         v = code_hash[i];
 372         if (!v)
 373             return 0;
 374         if (_cmpname(v, name, namelen)) {
 375             *code = v;
 376             return 1;
 377         }
 378         incr = incr << 1;
 379         if (incr > mask)
 380             incr = incr ^ code_poly;
 381     }
 382 }
 383
/* Function table handed to other code: initunicodedata() wraps its address
   in a CObject and stores that in the module as "ucnhash_CAPI".  The first
   member records the size of the structure; the remaining two are the
   code -> name and name -> code lookup functions defined above. */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
 390
 391 /* -------------------------------------------------------------------- */
 392 /* Python bindings */
 393
 394 static PyObject *
 395 unicodedata_name(PyObject* self, PyObject* args)
 396 {
 397     char name[NAME_MAXLEN];
 398
 399     PyUnicodeObject* v;
 400     PyObject* defobj = NULL;
 401     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
 402         return NULL;
 403
 404     if (PyUnicode_GET_SIZE(v) != 1) {
 405         PyErr_SetString(PyExc_TypeError,
 406                         "need a single Unicode character as parameter");
 407         return NULL;
 408     }
 409
 410     if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
 411                              name, sizeof(name))) {
 412         if (defobj == NULL) {
 413             PyErr_SetString(PyExc_ValueError, "no such name");
 414             return NULL;
 415         }
 416         else {
 417             Py_INCREF(defobj);
 418             return defobj;
 419         }
 420     }
 421
 422     return Py_BuildValue("s", name);
 423 }
 424
 425 static PyObject *
 426 unicodedata_lookup(PyObject* self, PyObject* args)
 427 {
 428     Py_UCS4 code;
 429     Py_UNICODE str[1];
 430
 431     char* name;
 432     int namelen;
 433     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
 434         return NULL;
 435
 436     if (!_getcode(name, namelen, &code)) {
 437         PyErr_SetString(PyExc_KeyError, "undefined character name");
 438         return NULL;
 439     }
 440
 441     str[0] = (Py_UNICODE) code;
 442     return PyUnicode_FromUnicode(str, 1);
 443 }
 444
 445 /* XXX Add doc strings. */
 446
 447 static PyMethodDef unicodedata_functions[] = {
 448     {"decimal", unicodedata_decimal, METH_VARARGS},
 449     {"digit", unicodedata_digit, METH_VARARGS},
 450     {"numeric", unicodedata_numeric, METH_VARARGS},
 451     {"category", unicodedata_category, METH_VARARGS},
 452     {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
 453     {"combining", unicodedata_combining, METH_VARARGS},
 454     {"mirrored", unicodedata_mirrored, METH_VARARGS},
 455     {"decomposition",unicodedata_decomposition, METH_VARARGS},
 456     {"name", unicodedata_name, METH_VARARGS},
 457     {"lookup", unicodedata_lookup, METH_VARARGS},
 458     {NULL, NULL}                /* sentinel */
 459 };
 460
/* Module docstring; passed to Py_InitModule3() in initunicodedata(). */
PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
 462
 463 DL_EXPORT(void)
 464 initunicodedata(void)
 465 {
 466     PyObject *m, *v;
 467
 468     m = Py_InitModule3(
 469         "unicodedata", unicodedata_functions, unicodedata_docstring);
 470     if (!m)
 471         return;
 472
 473     /* Export C API */
 474     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
 475     if (v != NULL)
 476         PyModule_AddObject(m, "ucnhash_CAPI", v);
 477 }