Modules/unicodedata.c

   1 /* ------------------------------------------------------------------------
   2
   3    unicodedata -- Provides access to the Unicode 3.2 data base.
   4
   5    Data was extracted from the Unicode 3.2 UnicodeData.txt file.
   6
   7    Written by Marc-Andre Lemburg (mal@lemburg.com).
   8    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9    Modified by Martin v. Löwis (martin@v.loewis.de)
  10
  11    Copyright (c) Corporation for National Research Initiatives.
  12
  13    ------------------------------------------------------------------------ */
  14
  15 #include "Python.h"
  16 #include "ucnhash.h"
  17
  18 /* character properties */
  19
/* One record of per-character properties.  Records are stored in
   _PyUnicode_Database_Records and located through _getrecord_ex().
   Every field is a single byte; two of them are indices into name
   tables rather than the property value itself. */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;
  28
  29 /* data file generated by Tools/unicode/makeunicodedata.py */
  30 #include "unicodedata_db.h"
  31
  32 static const _PyUnicode_DatabaseRecord*
  33 _getrecord_ex(Py_UCS4 code)
  34 {
  35     int index;
  36     if (code >= 0x110000)
  37         index = 0;
  38     else {
  39         index = index1[(code>>SHIFT)];
  40         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  41     }
  42
  43     return &_PyUnicode_Database_Records[index];
  44 }
  45
  46 static const _PyUnicode_DatabaseRecord*
  47 _getrecord(PyUnicodeObject* v)
  48 {
  49     return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
  50 }
  51
  52 /* --- Module API --------------------------------------------------------- */
  53
  54 static PyObject *
  55 unicodedata_decimal(PyObject *self, PyObject *args)
  56 {
  57     PyUnicodeObject *v;
  58     PyObject *defobj = NULL;
  59     long rc;
  60
  61     if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
  62         return NULL;
  63     if (PyUnicode_GET_SIZE(v) != 1) {
  64         PyErr_SetString(PyExc_TypeError,
  65                         "need a single Unicode character as parameter");
  66         return NULL;
  67     }
  68     rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
  69     if (rc < 0) {
  70         if (defobj == NULL) {
  71             PyErr_SetString(PyExc_ValueError,
  72                             "not a decimal");
  73             return NULL;
  74         }
  75         else {
  76             Py_INCREF(defobj);
  77             return defobj;
  78         }
  79     }
  80     return PyInt_FromLong(rc);
  81 }
  82
  83 static PyObject *
  84 unicodedata_digit(PyObject *self, PyObject *args)
  85 {
  86     PyUnicodeObject *v;
  87     PyObject *defobj = NULL;
  88     long rc;
  89
  90     if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
  91         return NULL;
  92     if (PyUnicode_GET_SIZE(v) != 1) {
  93         PyErr_SetString(PyExc_TypeError,
  94                         "need a single Unicode character as parameter");
  95         return NULL;
  96     }
  97     rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
  98     if (rc < 0) {
  99         if (defobj == NULL) {
 100             PyErr_SetString(PyExc_ValueError, "not a digit");
 101             return NULL;
 102         }
 103         else {
 104             Py_INCREF(defobj);
 105             return defobj;
 106         }
 107     }
 108     return PyInt_FromLong(rc);
 109 }
 110
 111 static PyObject *
 112 unicodedata_numeric(PyObject *self, PyObject *args)
 113 {
 114     PyUnicodeObject *v;
 115     PyObject *defobj = NULL;
 116     double rc;
 117
 118     if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 119         return NULL;
 120     if (PyUnicode_GET_SIZE(v) != 1) {
 121         PyErr_SetString(PyExc_TypeError,
 122                         "need a single Unicode character as parameter");
 123         return NULL;
 124     }
 125     rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
 126     if (rc < 0) {
 127         if (defobj == NULL) {
 128             PyErr_SetString(PyExc_ValueError, "not a numeric character");
 129             return NULL;
 130         }
 131         else {
 132             Py_INCREF(defobj);
 133             return defobj;
 134         }
 135     }
 136     return PyFloat_FromDouble(rc);
 137 }
 138
 139 static PyObject *
 140 unicodedata_category(PyObject *self, PyObject *args)
 141 {
 142     PyUnicodeObject *v;
 143     int index;
 144
 145     if (!PyArg_ParseTuple(args, "O!:category",
 146                           &PyUnicode_Type, &v))
 147         return NULL;
 148     if (PyUnicode_GET_SIZE(v) != 1) {
 149         PyErr_SetString(PyExc_TypeError,
 150                         "need a single Unicode character as parameter");
 151         return NULL;
 152     }
 153     index = (int) _getrecord(v)->category;
 154     return PyString_FromString(_PyUnicode_CategoryNames[index]);
 155 }
 156
 157 static PyObject *
 158 unicodedata_bidirectional(PyObject *self, PyObject *args)
 159 {
 160     PyUnicodeObject *v;
 161     int index;
 162
 163     if (!PyArg_ParseTuple(args, "O!:bidirectional",
 164                           &PyUnicode_Type, &v))
 165         return NULL;
 166     if (PyUnicode_GET_SIZE(v) != 1) {
 167         PyErr_SetString(PyExc_TypeError,
 168                         "need a single Unicode character as parameter");
 169         return NULL;
 170     }
 171     index = (int) _getrecord(v)->bidirectional;
 172     return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 173 }
 174
 175 static PyObject *
 176 unicodedata_combining(PyObject *self, PyObject *args)
 177 {
 178     PyUnicodeObject *v;
 179
 180     if (!PyArg_ParseTuple(args, "O!:combining",
 181                           &PyUnicode_Type, &v))
 182         return NULL;
 183     if (PyUnicode_GET_SIZE(v) != 1) {
 184         PyErr_SetString(PyExc_TypeError,
 185                         "need a single Unicode character as parameter");
 186         return NULL;
 187     }
 188     return PyInt_FromLong((int) _getrecord(v)->combining);
 189 }
 190
 191 static PyObject *
 192 unicodedata_mirrored(PyObject *self, PyObject *args)
 193 {
 194     PyUnicodeObject *v;
 195
 196     if (!PyArg_ParseTuple(args, "O!:mirrored",
 197                           &PyUnicode_Type, &v))
 198         return NULL;
 199     if (PyUnicode_GET_SIZE(v) != 1) {
 200         PyErr_SetString(PyExc_TypeError,
 201                         "need a single Unicode character as parameter");
 202         return NULL;
 203     }
 204     return PyInt_FromLong((int) _getrecord(v)->mirrored);
 205 }
 206
 207 static PyObject *
 208 unicodedata_decomposition(PyObject *self, PyObject *args)
 209 {
 210     PyUnicodeObject *v;
 211     char decomp[256];
 212     int code, index, count, i;
 213
 214     if (!PyArg_ParseTuple(args, "O!:decomposition",
 215                           &PyUnicode_Type, &v))
 216         return NULL;
 217     if (PyUnicode_GET_SIZE(v) != 1) {
 218         PyErr_SetString(PyExc_TypeError,
 219                         "need a single Unicode character as parameter");
 220         return NULL;
 221     }
 222
 223     code = (int) *PyUnicode_AS_UNICODE(v);
 224
 225     if (code < 0 || code >= 0x110000)
 226         index = 0;
 227     else {
 228         index = decomp_index1[(code>>DECOMP_SHIFT)];
 229         index = decomp_index2[(index<<DECOMP_SHIFT)+
 230                              (code&((1<<DECOMP_SHIFT)-1))];
 231     }
 232
 233     /* high byte is number of hex bytes (usually one or two), low byte
 234        is prefix code (from*/
 235     count = decomp_data[index] >> 8;
 236
 237     /* XXX: could allocate the PyString up front instead
 238        (strlen(prefix) + 5 * count + 1 bytes) */
 239
 240     /* copy prefix */
 241     i = strlen(decomp_prefix[decomp_data[index] & 255]);
 242     memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);
 243
 244     while (count-- > 0) {
 245         if (i)
 246             decomp[i++] = ' ';
 247         assert((size_t)i < sizeof(decomp));
 248         PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 249                       decomp_data[++index]);
 250         i += strlen(decomp + i);
 251     }
 252
 253     decomp[i] = '\0';
 254
 255     return PyString_FromString(decomp);
 256 }
 257
 258 void
 259 get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
 260 {
 261     if (code >= 0x110000) {
 262         *index = 0;
 263     }
 264     else {
 265         *index = decomp_index1[(code>>DECOMP_SHIFT)];
 266         *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 267                                (code&((1<<DECOMP_SHIFT)-1))];
 268     }
 269
 270     /* high byte is number of hex bytes (usually one or two), low byte
 271        is prefix code (from*/
 272     *count = decomp_data[*index] >> 8;
 273     *prefix = decomp_data[*index] & 255;
 274
 275     (*index)++;
 276 }
 277
/* Constants for the arithmetic handling of Hangul syllables, used by
   the normalization functions and by the character name functions in
   this file.  A syllable index SIndex = code - SBase is split into a
   leading (L), vowel (V) and trailing (T) part with NCount and
   TCount. */
#define SBase   0xAC00  /* first code point of the syllable range */
#define LBase   0x1100  /* base code point for the L part */
#define VBase   0x1161  /* base code point for the V part */
#define TBase   0x11A7  /* base for the T part; T == TBase means
                           "no trailing part" */
#define LCount  19      /* number of L values */
#define VCount  21      /* number of V values */
#define TCount  28      /* number of T values, including "none" */
#define NCount  (VCount*TCount)  /* syllables per L value */
#define SCount  (LCount*NCount)  /* total number of syllables */
 287
 288 static PyObject*
 289 nfd_nfkd(PyObject *input, int k)
 290 {
 291     PyObject *result;
 292     Py_UNICODE *i, *end, *o;
 293     /* Longest decomposition in Unicode 3.2: U+FDFA */
 294     Py_UNICODE stack[20];
 295     int space, stackptr, isize;
 296     int index, prefix, count;
 297     unsigned char prev, cur;
 298
 299     stackptr = 0;
 300     isize = PyUnicode_GET_SIZE(input);
 301     /* Overallocate atmost 10 characters. */
 302     space = (isize > 10 ? 10 : isize) + isize;
 303     result = PyUnicode_FromUnicode(NULL, space);
 304     if (!result)
 305         return NULL;
 306     i = PyUnicode_AS_UNICODE(input);
 307     end = i + isize;
 308     o = PyUnicode_AS_UNICODE(result);
 309
 310     while (i < end) {
 311         stack[stackptr++] = *i++;
 312         while(stackptr) {
 313             Py_UNICODE code = stack[--stackptr];
 314             if (!space) {
 315                 space = PyString_GET_SIZE(result) + 10;
 316                 if (PyUnicode_Resize(&result, space) == -1)
 317                     return NULL;
 318                 o = PyUnicode_AS_UNICODE(result) + space - 10;
 319                 space = 10;
 320             }
 321             /* Hangul Decomposition. */
 322             if (SBase <= code && code < (SBase+SCount)) {
 323                 int SIndex = code - SBase;
 324                 int L = LBase + SIndex / NCount;
 325                 int V = VBase + (SIndex % NCount) / TCount;
 326                 int T = TBase + SIndex % TCount;
 327                 *o++ = L;
 328                 *o++ = V;
 329                 space -= 2;
 330                 if (T != TBase) {
 331                     *o++ = T;
 332                     space --;
 333                 }
 334                 continue;
 335             }
 336             /* Other decompoistions. */
 337             get_decomp_record(code, &index, &prefix, &count);
 338
 339             /* Copy character if it is not decomposable, or has a
 340                compatibility decomposition, but we do NFD. */
 341             if (!count || (prefix && !k)) {
 342                 *o++ = code;
 343                 space--;
 344                 continue;
 345             }
 346             /* Copy decomposition onto the stack, in reverse
 347                order.  */
 348             while(count) {
 349                 code = decomp_data[index + (--count)];
 350                 stack[stackptr++] = code;
 351             }
 352         }
 353     }
 354
 355     /* Drop overallocation. Cannot fail. */
 356     PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 357
 358     /* Sort canonically. */
 359     i = PyUnicode_AS_UNICODE(result);
 360     prev = _getrecord_ex(*i)->combining;
 361     end = i + PyUnicode_GET_SIZE(result);
 362     for (i++; i < end; i++) {
 363         cur = _getrecord_ex(*i)->combining;
 364         if (prev == 0 || cur == 0 || prev <= cur) {
 365             prev = cur;
 366             continue;
 367         }
 368         /* Non-canonical order. Need to switch *i with previous. */
 369         o = i - 1;
 370         while (1) {
 371             Py_UNICODE tmp = o[1];
 372             o[1] = o[0];
 373             o[0] = tmp;
 374             o--;
 375             if (o < PyUnicode_AS_UNICODE(result))
 376                 break;
 377             prev = _getrecord_ex(*o)->combining;
 378             if (prev == 0 || prev <= cur)
 379                 break;
 380         }
 381         prev = _getrecord_ex(*i)->combining;
 382     }
 383     return result;
 384 }
 385
 386 static int
 387 find_nfc_index(struct reindex* nfc, Py_UNICODE code)
 388 {
 389     int index;
 390     for (index = 0; nfc[index].start; index++) {
 391         int start = nfc[index].start;
 392         if (code < start)
 393             return -1;
 394         if (code <= start + nfc[index].count) {
 395             int delta = code - start;
 396             return nfc[index].index + delta;
 397         }
 398     }
 399     return -1;
 400 }
 401
 402 static PyObject*
 403 nfc_nfkc(PyObject *input, int k)
 404 {
 405     PyObject *result;
 406     Py_UNICODE *i, *i1, *o, *end;
 407     int f,l,index,index1,comb;
 408     Py_UNICODE code;
 409     Py_UNICODE *skipped[20];
 410     int cskipped = 0;
 411
 412     result = nfd_nfkd(input, k);
 413     if (!result)
 414         return NULL;
 415
 416     /* We are going to modify result in-place.
 417        If nfd_nfkd is changed to sometimes return the input,
 418        this code needs to be reviewed. */
 419     assert(result != input);
 420
 421     i = PyUnicode_AS_UNICODE(result);
 422     end = i + PyUnicode_GET_SIZE(result);
 423     o = PyUnicode_AS_UNICODE(result);
 424
 425   again:
 426     while (i < end) {
 427       for (index = 0; index < cskipped; index++) {
 428           if (skipped[index] == i) {
 429               /* *i character is skipped.
 430                  Remove from list. */
 431               skipped[index] = skipped[cskipped-1];
 432               cskipped--;
 433               i++;
 434               goto again; /* continue while */
 435           }
 436       }
 437       /* Hangul Composition. We don't need to check for <LV,T>
 438          pairs, since we always have decomposed data. */
 439       if (LBase <= *i && *i < (LBase+LCount) &&
 440           i + 1 < end &&
 441           VBase <= i[1] && i[1] <= (VBase+VCount)) {
 442           int LIndex, VIndex;
 443           LIndex = i[0] - LBase;
 444           VIndex = i[1] - VBase;
 445           code = SBase + (LIndex*VCount+VIndex)*TCount;
 446           i+=2;
 447           if (i < end &&
 448               TBase <= *i && *i <= (TBase+TCount)) {
 449               code += *i-TBase;
 450               i++;
 451           }
 452           *o++ = code;
 453           continue;
 454       }
 455
 456       f = find_nfc_index(nfc_first, *i);
 457       if (f == -1) {
 458           *o++ = *i++;
 459           continue;
 460       }
 461       /* Find next unblocked character. */
 462       i1 = i+1;
 463       comb = 0;
 464       while (i1 < end) {
 465           int comb1 = _getrecord_ex(*i1)->combining;
 466           if (comb1 && comb == comb1) {
 467               /* Character is blocked. */
 468               i1++;
 469               continue;
 470           }
 471           l = find_nfc_index(nfc_last, *i1);
 472           /* *i1 cannot be combined with *i. If *i1
 473              is a starter, we don't need to look further.
 474              Otherwise, record the combining class. */
 475           if (l == -1) {
 476             not_combinable:
 477               if (comb1 == 0)
 478                   break;
 479               comb = comb1;
 480               i1++;
 481               continue;
 482           }
 483           index = f*TOTAL_LAST + l;
 484           index1 = comp_index[index >> COMP_SHIFT];
 485           code = comp_data[(index1<<COMP_SHIFT)+
 486                            (index&((1<<COMP_SHIFT)-1))];
 487           if (code == 0)
 488               goto not_combinable;
 489
 490           /* Replace the original character. */
 491           *i = code;
 492           /* Mark the second character unused. */
 493           skipped[cskipped++] = i1;
 494           i1++;
 495           f = find_nfc_index(nfc_first, *i);
 496           if (f == -1)
 497               break;
 498       }
 499       *o++ = *i++;
 500     }
 501     if (o != end)
 502         PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 503     return result;
 504 }
 505
 506 static PyObject*
 507 unicodedata_normalize(PyObject *self, PyObject *args)
 508 {
 509     char *form;
 510     PyObject *input;
 511
 512     if(!PyArg_ParseTuple(args, "sO!:normalized",
 513                          &form, &PyUnicode_Type, &input))
 514         return NULL;
 515
 516     if (strcmp(form, "NFC") == 0)
 517         return nfc_nfkc(input, 0);
 518     if (strcmp(form, "NFKC") == 0)
 519         return nfc_nfkc(input, 1);
 520     if (strcmp(form, "NFD") == 0)
 521         return nfd_nfkd(input, 0);
 522     if (strcmp(form, "NFKD") == 0)
 523         return nfd_nfkd(input, 1);
 524     PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 525     return NULL;
 526 }
 527
 528 /* -------------------------------------------------------------------- */
 529 /* unicode character name tables */
 530
 531 /* data file generated by Tools/unicode/makeunicodedata.py */
 532 #include "unicodename_db.h"
 533
 534 /* -------------------------------------------------------------------- */
 535 /* database code (cut and pasted from the unidb package) */
 536
 537 static unsigned long
 538 _gethash(const char *s, int len, int scale)
 539 {
 540     int i;
 541     unsigned long h = 0;
 542     unsigned long ix;
 543     for (i = 0; i < len; i++) {
 544         h = (h * scale) + (unsigned char) toupper(s[i]);
 545         ix = h & 0xff000000;
 546         if (ix)
 547             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 548     }
 549     return h;
 550 }
 551
 552 static char *hangul_syllables[][3] = {
 553     { "G",  "A",   ""   },
 554     { "GG", "AE",  "G"  },
 555     { "N",  "YA",  "GG" },
 556     { "D",  "YAE", "GS" },
 557     { "DD", "EO",  "N", },
 558     { "R",  "E",   "NJ" },
 559     { "M",  "YEO", "NH" },
 560     { "B",  "YE",  "D"  },
 561     { "BB", "O",   "L"  },
 562     { "S",  "WA",  "LG" },
 563     { "SS", "WAE", "LM" },
 564     { "",   "OE",  "LB" },
 565     { "J",  "YO",  "LS" },
 566     { "JJ", "U",   "LT" },
 567     { "C",  "WEO", "LP" },
 568     { "K",  "WE",  "LH" },
 569     { "T",  "WI",  "M"  },
 570     { "P",  "YU",  "B"  },
 571     { "H",  "EU",  "BS" },
 572     { 0,    "YI",  "S"  },
 573     { 0,    "I",   "SS" },
 574     { 0,    0,     "NG" },
 575     { 0,    0,     "J"  },
 576     { 0,    0,     "C"  },
 577     { 0,    0,     "K"  },
 578     { 0,    0,     "T"  },
 579     { 0,    0,     "P"  },
 580     { 0,    0,     "H"  }
 581 };
 582
 583 static int
 584 is_unified_ideograph(Py_UCS4 code)
 585 {
 586     return (
 587         (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 588         (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
 589         (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
 590 }
 591
 592 static int
 593 _getucname(Py_UCS4 code, char* buffer, int buflen)
 594 {
 595     int offset;
 596     int i;
 597     int word;
 598     unsigned char* w;
 599
 600     if (SBase <= code && code < SBase+SCount) {
 601         /* Hangul syllable. */
 602         int SIndex = code - SBase;
 603         int L = SIndex / NCount;
 604         int V = (SIndex % NCount) / TCount;
 605         int T = SIndex % TCount;
 606
 607         if (buflen < 27)
 608             /* Worst case: HANGUL SYLLABLE <10chars>. */
 609             return 0;
 610         strcpy(buffer, "HANGUL SYLLABLE ");
 611         buffer += 16;
 612         strcpy(buffer, hangul_syllables[L][0]);
 613         buffer += strlen(hangul_syllables[L][0]);
 614         strcpy(buffer, hangul_syllables[V][1]);
 615         buffer += strlen(hangul_syllables[V][1]);
 616         strcpy(buffer, hangul_syllables[T][2]);
 617         buffer += strlen(hangul_syllables[T][2]);
 618         *buffer = '\0';
 619         return 1;
 620     }
 621
 622     if (is_unified_ideograph(code)) {
 623         if (buflen < 28)
 624             /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 625             return 0;
 626         sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 627         return 1;
 628     }
 629
 630     if (code >= 0x110000)
 631         return 0;
 632
 633     /* get offset into phrasebook */
 634     offset = phrasebook_offset1[(code>>phrasebook_shift)];
 635     offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 636                                (code&((1<<phrasebook_shift)-1))];
 637     if (!offset)
 638         return 0;
 639
 640     i = 0;
 641
 642     for (;;) {
 643         /* get word index */
 644         word = phrasebook[offset] - phrasebook_short;
 645         if (word >= 0) {
 646             word = (word << 8) + phrasebook[offset+1];
 647             offset += 2;
 648         } else
 649             word = phrasebook[offset++];
 650         if (i) {
 651             if (i > buflen)
 652                 return 0; /* buffer overflow */
 653             buffer[i++] = ' ';
 654         }
 655         /* copy word string from lexicon.  the last character in the
 656            word has bit 7 set.  the last word in a string ends with
 657            0x80 */
 658         w = lexicon + lexicon_offset[word];
 659         while (*w < 128) {
 660             if (i >= buflen)
 661                 return 0; /* buffer overflow */
 662             buffer[i++] = *w++;
 663         }
 664         if (i >= buflen)
 665             return 0; /* buffer overflow */
 666         buffer[i++] = *w & 127;
 667         if (*w == 128)
 668             break; /* end of word */
 669     }
 670
 671     return 1;
 672 }
 673
 674 static int
 675 _cmpname(int code, const char* name, int namelen)
 676 {
 677     /* check if code corresponds to the given name */
 678     int i;
 679     char buffer[NAME_MAXLEN];
 680     if (!_getucname(code, buffer, sizeof(buffer)))
 681         return 0;
 682     for (i = 0; i < namelen; i++) {
 683         if (toupper(name[i]) != buffer[i])
 684             return 0;
 685     }
 686     return buffer[namelen] == '\0';
 687 }
 688
 689 static void
 690 find_syllable(const char *str, int *len, int *pos, int count, int column)
 691 {
 692     int i, len1;
 693     *len = -1;
 694     for (i = 0; i < count; i++) {
 695         char *s = hangul_syllables[i][column];
 696         len1 = strlen(s);
 697         if (len1 <= *len)
 698             continue;
 699         if (strncmp(str, s, len1) == 0) {
 700             *len = len1;
 701             *pos = i;
 702         }
 703     }
 704     if (*len == -1) {
 705         *len = 0;
 706         *pos = -1;
 707     }
 708 }
 709
 710 static int
 711 _getcode(const char* name, int namelen, Py_UCS4* code)
 712 {
 713     unsigned int h, v;
 714     unsigned int mask = code_size-1;
 715     unsigned int i, incr;
 716
 717     /* Check for hangul syllables. */
 718     if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
 719         int L, V, T, len;
 720         const char *pos = name + 16;
 721         find_syllable(pos, &len, &L, LCount, 0);
 722         pos += len;
 723         find_syllable(pos, &len, &V, VCount, 1);
 724         pos += len;
 725         find_syllable(pos, &len, &T, TCount, 2);
 726         pos += len;
 727         if (V != -1 && V != -1 && T != -1 && pos-name == namelen) {
 728             *code = SBase + (L*VCount+V)*TCount + T;
 729             return 1;
 730         }
 731         /* Otherwise, it's an illegal syllable name. */
 732         return 0;
 733     }
 734
 735     /* Check for unified ideographs. */
 736     if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
 737         /* Four or five hexdigits must follow. */
 738         v = 0;
 739         name += 22;
 740         namelen -= 22;
 741         if (namelen != 4 && namelen != 5)
 742             return 0;
 743         while (namelen--) {
 744             v *= 16;
 745             if (*name >= '0' && *name <= '9')
 746                 v += *name - '0';
 747             else if (*name >= 'A' && *name <= 'F')
 748                 v += *name - 'A' + 10;
 749             else
 750                 return 0;
 751             name++;
 752         }
 753         if (!is_unified_ideograph(v))
 754             return 0;
 755         *code = v;
 756         return 1;
 757     }
 758
 759     /* the following is the same as python's dictionary lookup, with
 760        only minor changes.  see the makeunicodedata script for more
 761        details */
 762
 763     h = (unsigned int) _gethash(name, namelen, code_magic);
 764     i = (~h) & mask;
 765     v = code_hash[i];
 766     if (!v)
 767         return 0;
 768     if (_cmpname(v, name, namelen)) {
 769         *code = v;
 770         return 1;
 771     }
 772     incr = (h ^ (h >> 3)) & mask;
 773     if (!incr)
 774         incr = mask;
 775     for (;;) {
 776         i = (i + incr) & mask;
 777         v = code_hash[i];
 778         if (!v)
 779             return 0;
 780         if (_cmpname(v, name, namelen)) {
 781             *code = v;
 782             return 1;
 783         }
 784         incr = incr << 1;
 785         if (incr > mask)
 786             incr = incr ^ code_poly;
 787     }
 788 }
 789
/* Table of the name lookup functions of this module.  A pointer to it
   is published as the "ucnhash_CAPI" module attribute by
   initunicodedata(). */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),   /* size of this structure */
    _getucname,                     /* code point -> name */
    _getcode                        /* name -> code point */
};
 796
 797 /* -------------------------------------------------------------------- */
 798 /* Python bindings */
 799
 800 static PyObject *
 801 unicodedata_name(PyObject* self, PyObject* args)
 802 {
 803     char name[NAME_MAXLEN];
 804
 805     PyUnicodeObject* v;
 806     PyObject* defobj = NULL;
 807     if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
 808         return NULL;
 809
 810     if (PyUnicode_GET_SIZE(v) != 1) {
 811         PyErr_SetString(PyExc_TypeError,
 812                         "need a single Unicode character as parameter");
 813         return NULL;
 814     }
 815
 816     if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
 817                              name, sizeof(name))) {
 818         if (defobj == NULL) {
 819             PyErr_SetString(PyExc_ValueError, "no such name");
 820             return NULL;
 821         }
 822         else {
 823             Py_INCREF(defobj);
 824             return defobj;
 825         }
 826     }
 827
 828     return Py_BuildValue("s", name);
 829 }
 830
 831 static PyObject *
 832 unicodedata_lookup(PyObject* self, PyObject* args)
 833 {
 834     Py_UCS4 code;
 835     Py_UNICODE str[1];
 836
 837     char* name;
 838     int namelen;
 839     if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
 840         return NULL;
 841
 842     if (!_getcode(name, namelen, &code)) {
 843         char fmt[] = "undefined character name '%s'";
 844         char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
 845         sprintf(buf, fmt, name);
 846         PyErr_SetString(PyExc_KeyError, buf);
 847         PyMem_FREE(buf);
 848         return NULL;
 849     }
 850
 851     str[0] = (Py_UNICODE) code;
 852     return PyUnicode_FromUnicode(str, 1);
 853 }
 854
 855 /* XXX Add doc strings. */
 856
/* Method table of the module, passed to Py_InitModule3() in
   initunicodedata().  Each entry maps the Python-level function name
   to its C implementation; all functions take their arguments as a
   tuple (METH_VARARGS).  No per-function doc strings are set. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS},
    {"digit", unicodedata_digit, METH_VARARGS},
    {"numeric", unicodedata_numeric, METH_VARARGS},
    {"category", unicodedata_category, METH_VARARGS},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining", unicodedata_combining, METH_VARARGS},
    {"mirrored", unicodedata_mirrored, METH_VARARGS},
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},
    {"normalize", unicodedata_normalize, METH_VARARGS},
    {NULL, NULL}                /* sentinel */
};
 871
 872 PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
 873
 874 PyMODINIT_FUNC
 875 initunicodedata(void)
 876 {
 877     PyObject *m, *v;
 878
 879     m = Py_InitModule3(
 880         "unicodedata", unicodedata_functions, unicodedata_docstring);
 881     if (!m)
 882         return;
 883
 884     PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
 885
 886     /* Export C API */
 887     v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
 888     if (v != NULL)
 889         PyModule_AddObject(m, "ucnhash_CAPI", v);
 890 }
 891
 892 /*
 893 Local variables:
 894 c-basic-offset: 4
 895 indent-tabs-mode: nil
 896 End:
 897 */