Objects/unicodectype.c

   1 /*
   2    Unicode character type helpers.
   3
   4    Written by Marc-Andre Lemburg (mal@lemburg.com).
   5    Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   6
   7    Copyright (c) Corporation for National Research Initiatives.
   8
   9 */
  10
  11 #include "Python.h"
  12 #include "unicodeobject.h"
  13
  14 #define ALPHA_MASK 0x01
  15 #define DECIMAL_MASK 0x02
  16 #define DIGIT_MASK 0x04
  17 #define LOWER_MASK 0x08
  18 #define LINEBREAK_MASK 0x10
  19 #define SPACE_MASK 0x20
  20 #define TITLE_MASK 0x40
  21 #define UPPER_MASK 0x80
  22
  23 typedef struct {
  24     const unsigned short flags;
  25     const Py_UNICODE upper;
  26     const Py_UNICODE lower;
  27     const Py_UNICODE title;
  28     const unsigned char decimal;
  29     const unsigned char digit;
  30 } _PyUnicode_TypeRecord;
  31
  32 #include "unicodetype_db.h"
  33
  34 static const _PyUnicode_TypeRecord *
  35 gettyperecord(Py_UNICODE code)
  36 {
  37     int index;
  38
  39     if (code >= 0x110000)
  40         index = 0;
  41     else {
  42         index = index1[(code>>SHIFT)];
  43         index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  44     }
  45
  46     return &_PyUnicode_TypeRecords[index];
  47 }
  48
  49 /* Returns 1 for Unicode characters having the category 'Zl' or type
  50    'B', 0 otherwise. */
  51
  52 int _PyUnicode_IsLinebreak(Py_UNICODE ch)
  53 {
  54     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  55
  56     return (ctype->flags & LINEBREAK_MASK) != 0;
  57 }
  58
  59 /* Returns the titlecase Unicode characters corresponding to ch or just
  60    ch if no titlecase mapping is known. */
  61
  62 Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
  63 {
  64     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  65     int delta;
  66
  67     if (ctype->title)
  68         delta = ctype->title;
  69     else
  70         delta = ctype->upper;
  71
  72     if (delta >= 32768)
  73             delta -= 65536;
  74
  75     return ch + delta;
  76 }
  77
  78 /* Returns 1 for Unicode characters having the category 'Lt', 0
  79    otherwise. */
  80
  81 int _PyUnicode_IsTitlecase(Py_UNICODE ch)
  82 {
  83     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  84
  85     return (ctype->flags & TITLE_MASK) != 0;
  86 }
  87
  88 /* Returns the integer decimal (0-9) for Unicode characters having
  89    this property, -1 otherwise. */
  90
  91 int _PyUnicode_ToDecimalDigit(Py_UNICODE ch)
  92 {
  93     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
  94
  95     return (ctype->flags & DECIMAL_MASK) ? ctype->decimal : -1;
  96 }
  97
  98 int _PyUnicode_IsDecimalDigit(Py_UNICODE ch)
  99 {
 100     if (_PyUnicode_ToDecimalDigit(ch) < 0)
 101         return 0;
 102     return 1;
 103 }
 104
 105 /* Returns the integer digit (0-9) for Unicode characters having
 106    this property, -1 otherwise. */
 107
 108 int _PyUnicode_ToDigit(Py_UNICODE ch)
 109 {
 110     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 111
 112     return (ctype->flags & DIGIT_MASK) ? ctype->digit : -1;
 113 }
 114
 115 int _PyUnicode_IsDigit(Py_UNICODE ch)
 116 {
 117     if (_PyUnicode_ToDigit(ch) < 0)
 118         return 0;
 119     return 1;
 120 }
 121
 122 /* Returns the numeric value as double for Unicode characters having
 123    this property, -1.0 otherwise. */
 124
 125 /* TODO: replace with unicodetype_db.h table */
 126
 127 double _PyUnicode_ToNumeric(Py_UNICODE ch)
 128 {
 129     switch (ch) {
 130     case 0x3007:
 131         return (double) 0;
 132     case 0x09F4:
 133     case 0x215F:
 134     case 0x2160:
 135     case 0x2170:
 136     case 0x3021:
 137     case 0x3280:
 138         return (double) 1;
 139     case 0x00BD:
 140         return (double) 1 / 2;
 141     case 0x2153:
 142         return (double) 1 / 3;
 143     case 0x00BC:
 144         return (double) 1 / 4;
 145     case 0x2155:
 146         return (double) 1 / 5;
 147     case 0x2159:
 148         return (double) 1 / 6;
 149     case 0x215B:
 150         return (double) 1 / 8;
 151     case 0x0BF0:
 152     case 0x1372:
 153     case 0x2169:
 154     case 0x2179:
 155     case 0x2469:
 156     case 0x247D:
 157     case 0x2491:
 158     case 0x277F:
 159     case 0x2789:
 160     case 0x2793:
 161     case 0x3038:
 162     case 0x3289:
 163         return (double) 10;
 164     case 0x0BF1:
 165     case 0x137B:
 166     case 0x216D:
 167     case 0x217D:
 168         return (double) 100;
 169     case 0x0BF2:
 170     case 0x216F:
 171     case 0x217F:
 172     case 0x2180:
 173         return (double) 1000;
 174     case 0x137C:
 175     case 0x2182:
 176         return (double) 10000;
 177     case 0x216A:
 178     case 0x217A:
 179     case 0x246A:
 180     case 0x247E:
 181     case 0x2492:
 182         return (double) 11;
 183     case 0x216B:
 184     case 0x217B:
 185     case 0x246B:
 186     case 0x247F:
 187     case 0x2493:
 188         return (double) 12;
 189     case 0x246C:
 190     case 0x2480:
 191     case 0x2494:
 192         return (double) 13;
 193     case 0x246D:
 194     case 0x2481:
 195     case 0x2495:
 196         return (double) 14;
 197     case 0x246E:
 198     case 0x2482:
 199     case 0x2496:
 200         return (double) 15;
 201     case 0x09F9:
 202     case 0x246F:
 203     case 0x2483:
 204     case 0x2497:
 205         return (double) 16;
 206     case 0x16EE:
 207     case 0x2470:
 208     case 0x2484:
 209     case 0x2498:
 210         return (double) 17;
 211     case 0x16EF:
 212     case 0x2471:
 213     case 0x2485:
 214     case 0x2499:
 215         return (double) 18;
 216     case 0x16F0:
 217     case 0x2472:
 218     case 0x2486:
 219     case 0x249A:
 220         return (double) 19;
 221     case 0x09F5:
 222     case 0x2161:
 223     case 0x2171:
 224     case 0x3022:
 225     case 0x3281:
 226         return (double) 2;
 227     case 0x2154:
 228         return (double) 2 / 3;
 229     case 0x2156:
 230         return (double) 2 / 5;
 231     case 0x1373:
 232     case 0x2473:
 233     case 0x2487:
 234     case 0x249B:
 235     case 0x3039:
 236         return (double) 20;
 237     case 0x09F6:
 238     case 0x2162:
 239     case 0x2172:
 240     case 0x3023:
 241     case 0x3282:
 242         return (double) 3;
 243     case 0x00BE:
 244         return (double) 3 / 4;
 245     case 0x2157:
 246         return (double) 3 / 5;
 247     case 0x215C:
 248         return (double) 3 / 8;
 249     case 0x1374:
 250     case 0x303A:
 251         return (double) 30;
 252     case 0x09F7:
 253     case 0x2163:
 254     case 0x2173:
 255     case 0x3024:
 256     case 0x3283:
 257         return (double) 4;
 258     case 0x2158:
 259         return (double) 4 / 5;
 260     case 0x1375:
 261         return (double) 40;
 262     case 0x2164:
 263     case 0x2174:
 264     case 0x3025:
 265     case 0x3284:
 266         return (double) 5;
 267     case 0x215A:
 268         return (double) 5 / 6;
 269     case 0x215D:
 270         return (double) 5 / 8;
 271     case 0x1376:
 272     case 0x216C:
 273     case 0x217C:
 274         return (double) 50;
 275     case 0x216E:
 276     case 0x217E:
 277         return (double) 500;
 278     case 0x2181:
 279         return (double) 5000;
 280     case 0x2165:
 281     case 0x2175:
 282     case 0x3026:
 283     case 0x3285:
 284         return (double) 6;
 285     case 0x1377:
 286         return (double) 60;
 287     case 0x2166:
 288     case 0x2176:
 289     case 0x3027:
 290     case 0x3286:
 291         return (double) 7;
 292     case 0x215E:
 293         return (double) 7 / 8;
 294     case 0x1378:
 295         return (double) 70;
 296     case 0x2167:
 297     case 0x2177:
 298     case 0x3028:
 299     case 0x3287:
 300         return (double) 8;
 301     case 0x1379:
 302         return (double) 80;
 303     case 0x2168:
 304     case 0x2178:
 305     case 0x3029:
 306     case 0x3288:
 307         return (double) 9;
 308     case 0x137A:
 309         return (double) 90;
 310     default:
 311         return (double) _PyUnicode_ToDigit(ch);
 312     }
 313 }
 314
 315 int _PyUnicode_IsNumeric(Py_UNICODE ch)
 316 {
 317     if (_PyUnicode_ToNumeric(ch) < 0.0)
 318         return 0;
 319     return 1;
 320 }
 321
 322 #ifndef WANT_WCTYPE_FUNCTIONS
 323
 324 /* Returns 1 for Unicode characters having the bidirectional type
 325    'WS', 'B' or 'S' or the category 'Zs', 0 otherwise. */
 326
 327 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
 328 {
 329     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 330
 331     return (ctype->flags & SPACE_MASK) != 0;
 332 }
 333
 334 /* Returns 1 for Unicode characters having the category 'Ll', 0
 335    otherwise. */
 336
 337 int _PyUnicode_IsLowercase(Py_UNICODE ch)
 338 {
 339     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 340
 341     return (ctype->flags & LOWER_MASK) != 0;
 342 }
 343
 344 /* Returns 1 for Unicode characters having the category 'Lu', 0
 345    otherwise. */
 346
 347 int _PyUnicode_IsUppercase(Py_UNICODE ch)
 348 {
 349     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 350
 351     return (ctype->flags & UPPER_MASK) != 0;
 352 }
 353
 354 /* Returns the uppercase Unicode characters corresponding to ch or just
 355    ch if no uppercase mapping is known. */
 356
 357 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
 358 {
 359     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 360     int delta = ctype->upper;
 361     if (delta >= 32768)
 362             delta -= 65536;
 363     return ch + delta;
 364 }
 365
 366 /* Returns the lowercase Unicode characters corresponding to ch or just
 367    ch if no lowercase mapping is known. */
 368
 369 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
 370 {
 371     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 372     int delta = ctype->lower;
 373     if (delta >= 32768)
 374             delta -= 65536;
 375     return ch + delta;
 376 }
 377
 378 /* Returns 1 for Unicode characters having the category 'Ll', 'Lu', 'Lt',
 379    'Lo' or 'Lm',  0 otherwise. */
 380
 381 int _PyUnicode_IsAlpha(Py_UNICODE ch)
 382 {
 383     const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
 384
 385     return (ctype->flags & ALPHA_MASK) != 0;
 386 }
 387
 388 #else
 389
 390 /* Export the interfaces using the wchar_t type for portability
 391    reasons:  */
 392
 393 int _PyUnicode_IsWhitespace(Py_UNICODE ch)
 394 {
 395     return iswspace(ch);
 396 }
 397
 398 int _PyUnicode_IsLowercase(Py_UNICODE ch)
 399 {
 400     return iswlower(ch);
 401 }
 402
 403 int _PyUnicode_IsUppercase(Py_UNICODE ch)
 404 {
 405     return iswupper(ch);
 406 }
 407
 408 Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
 409 {
 410     return towlower(ch);
 411 }
 412
 413 Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
 414 {
 415     return towupper(ch);
 416 }
 417
 418 int _PyUnicode_IsAlpha(Py_UNICODE ch)
 419 {
 420     return iswalpha(ch);
 421 }
 422
 423 #endif