src/backend/regex/regc_locale.c

   1 /*
   2  * regc_locale.c --
   3  *
   4  *      This file contains locale-specific regexp routines.
   5  *      This file is #included by regcomp.c.
   6  *
   7  * Copyright (c) 1998 by Scriptics Corporation.
   8  *
   9  * This software is copyrighted by the Regents of the University of
  10  * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
  11  * Corporation and other parties.  The following terms apply to all files
  12  * associated with the software unless explicitly disclaimed in
  13  * individual files.
  14  *
  15  * The authors hereby grant permission to use, copy, modify, distribute,
  16  * and license this software and its documentation for any purpose, provided
  17  * that existing copyright notices are retained in all copies and that this
  18  * notice is included verbatim in any distributions. No written agreement,
  19  * license, or royalty fee is required for any of the authorized uses.
  20  * Modifications to this software may be copyrighted by their authors
  21  * and need not follow the licensing terms described here, provided that
  22  * the new terms are clearly indicated on the first page of each file where
  23  * they apply.
  24  *
  25  * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
  26  * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
  27  * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
  28  * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
  29  * POSSIBILITY OF SUCH DAMAGE.
  30  *
  31  * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
  32  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
  33  * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.      THIS SOFTWARE
  34  * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
  35  * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
  36  * MODIFICATIONS.
  37  *
  38  * GOVERNMENT USE: If you are acquiring this software on behalf of the
  39  * U.S. government, the Government shall have only "Restricted Rights"
  40  * in the software and related documentation as defined in the Federal
  41  * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
  42  * are acquiring the software on behalf of the Department of Defense, the
  43  * software shall be classified as "Commercial Computer Software" and the
  44  * Government shall have only "Restricted Rights" as defined in Clause
  45  * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
  46  * authors grant the U.S. Government and others acting in its behalf
  47  * permission to use and distribute the software in accordance with the
  48  * terms specified in this license.
  49  *
  50  * $PostgreSQL$
  51  */
  52
  53 /* ASCII character-name table */
  54
  55 static const struct cname
  56 {
  57         const char *name;
  58         const char      code;
  59 }       cnames[] =
  60
  61 {
  62         {
  63                 "NUL", '\0'
  64         },
  65         {
  66                 "SOH", '\001'
  67         },
  68         {
  69                 "STX", '\002'
  70         },
  71         {
  72                 "ETX", '\003'
  73         },
  74         {
  75                 "EOT", '\004'
  76         },
  77         {
  78                 "ENQ", '\005'
  79         },
  80         {
  81                 "ACK", '\006'
  82         },
  83         {
  84                 "BEL", '\007'
  85         },
  86         {
  87                 "alert", '\007'
  88         },
  89         {
  90                 "BS", '\010'
  91         },
  92         {
  93                 "backspace", '\b'
  94         },
  95         {
  96                 "HT", '\011'
  97         },
  98         {
  99                 "tab", '\t'
 100         },
 101         {
 102                 "LF", '\012'
 103         },
 104         {
 105                 "newline", '\n'
 106         },
 107         {
 108                 "VT", '\013'
 109         },
 110         {
 111                 "vertical-tab", '\v'
 112         },
 113         {
 114                 "FF", '\014'
 115         },
 116         {
 117                 "form-feed", '\f'
 118         },
 119         {
 120                 "CR", '\015'
 121         },
 122         {
 123                 "carriage-return", '\r'
 124         },
 125         {
 126                 "SO", '\016'
 127         },
 128         {
 129                 "SI", '\017'
 130         },
 131         {
 132                 "DLE", '\020'
 133         },
 134         {
 135                 "DC1", '\021'
 136         },
 137         {
 138                 "DC2", '\022'
 139         },
 140         {
 141                 "DC3", '\023'
 142         },
 143         {
 144                 "DC4", '\024'
 145         },
 146         {
 147                 "NAK", '\025'
 148         },
 149         {
 150                 "SYN", '\026'
 151         },
 152         {
 153                 "ETB", '\027'
 154         },
 155         {
 156                 "CAN", '\030'
 157         },
 158         {
 159                 "EM", '\031'
 160         },
 161         {
 162                 "SUB", '\032'
 163         },
 164         {
 165                 "ESC", '\033'
 166         },
 167         {
 168                 "IS4", '\034'
 169         },
 170         {
 171                 "FS", '\034'
 172         },
 173         {
 174                 "IS3", '\035'
 175         },
 176         {
 177                 "GS", '\035'
 178         },
 179         {
 180                 "IS2", '\036'
 181         },
 182         {
 183                 "RS", '\036'
 184         },
 185         {
 186                 "IS1", '\037'
 187         },
 188         {
 189                 "US", '\037'
 190         },
 191         {
 192                 "space", ' '
 193         },
 194         {
 195                 "exclamation-mark", '!'
 196         },
 197         {
 198                 "quotation-mark", '"'
 199         },
 200         {
 201                 "number-sign", '#'
 202         },
 203         {
 204                 "dollar-sign", '$'
 205         },
 206         {
 207                 "percent-sign", '%'
 208         },
 209         {
 210                 "ampersand", '&'
 211         },
 212         {
 213                 "apostrophe", '\''
 214         },
 215         {
 216                 "left-parenthesis", '('
 217         },
 218         {
 219                 "right-parenthesis", ')'
 220         },
 221         {
 222                 "asterisk", '*'
 223         },
 224         {
 225                 "plus-sign", '+'
 226         },
 227         {
 228                 "comma", ','
 229         },
 230         {
 231                 "hyphen", '-'
 232         },
 233         {
 234                 "hyphen-minus", '-'
 235         },
 236         {
 237                 "period", '.'
 238         },
 239         {
 240                 "full-stop", '.'
 241         },
 242         {
 243                 "slash", '/'
 244         },
 245         {
 246                 "solidus", '/'
 247         },
 248         {
 249                 "zero", '0'
 250         },
 251         {
 252                 "one", '1'
 253         },
 254         {
 255                 "two", '2'
 256         },
 257         {
 258                 "three", '3'
 259         },
 260         {
 261                 "four", '4'
 262         },
 263         {
 264                 "five", '5'
 265         },
 266         {
 267                 "six", '6'
 268         },
 269         {
 270                 "seven", '7'
 271         },
 272         {
 273                 "eight", '8'
 274         },
 275         {
 276                 "nine", '9'
 277         },
 278         {
 279                 "colon", ':'
 280         },
 281         {
 282                 "semicolon", ';'
 283         },
 284         {
 285                 "less-than-sign", '<'
 286         },
 287         {
 288                 "equals-sign", '='
 289         },
 290         {
 291                 "greater-than-sign", '>'
 292         },
 293         {
 294                 "question-mark", '?'
 295         },
 296         {
 297                 "commercial-at", '@'
 298         },
 299         {
 300                 "left-square-bracket", '['
 301         },
 302         {
 303                 "backslash", '\\'
 304         },
 305         {
 306                 "reverse-solidus", '\\'
 307         },
 308         {
 309                 "right-square-bracket", ']'
 310         },
 311         {
 312                 "circumflex", '^'
 313         },
 314         {
 315                 "circumflex-accent", '^'
 316         },
 317         {
 318                 "underscore", '_'
 319         },
 320         {
 321                 "low-line", '_'
 322         },
 323         {
 324                 "grave-accent", '`'
 325         },
 326         {
 327                 "left-brace", '{'
 328         },
 329         {
 330                 "left-curly-bracket", '{'
 331         },
 332         {
 333                 "vertical-line", '|'
 334         },
 335         {
 336                 "right-brace", '}'
 337         },
 338         {
 339                 "right-curly-bracket", '}'
 340         },
 341         {
 342                 "tilde", '~'
 343         },
 344         {
 345                 "DEL", '\177'
 346         },
 347         {
 348                 NULL, 0
 349         }
 350 };
 351
 352 /*
 353  * some ctype functions with non-ascii-char guard
 354  */
 355 static int
 356 pg_wc_isdigit(pg_wchar c)
 357 {
 358         return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
 359 }
 360
 361 static int
 362 pg_wc_isalpha(pg_wchar c)
 363 {
 364         return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
 365 }
 366
 367 static int
 368 pg_wc_isalnum(pg_wchar c)
 369 {
 370         return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
 371 }
 372
 373 static int
 374 pg_wc_isupper(pg_wchar c)
 375 {
 376         return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
 377 }
 378
 379 static int
 380 pg_wc_islower(pg_wchar c)
 381 {
 382         return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
 383 }
 384
 385 static int
 386 pg_wc_isgraph(pg_wchar c)
 387 {
 388         return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
 389 }
 390
 391 static int
 392 pg_wc_isprint(pg_wchar c)
 393 {
 394         return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
 395 }
 396
 397 static int
 398 pg_wc_ispunct(pg_wchar c)
 399 {
 400         return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
 401 }
 402
 403 static int
 404 pg_wc_isspace(pg_wchar c)
 405 {
 406         return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
 407 }
 408
 409 static pg_wchar
 410 pg_wc_toupper(pg_wchar c)
 411 {
 412         if (c >= 0 && c <= UCHAR_MAX)
 413                 return toupper((unsigned char) c);
 414         return c;
 415 }
 416
 417 static pg_wchar
 418 pg_wc_tolower(pg_wchar c)
 419 {
 420         if (c >= 0 && c <= UCHAR_MAX)
 421                 return tolower((unsigned char) c);
 422         return c;
 423 }
 424
 425
 426 /*
 427  * element - map collating-element name to celt
 428  */
 429 static celt
 430 element(struct vars * v,                /* context */
 431                 const chr *startp,              /* points to start of name */
 432                 const chr *endp)                /* points just past end of name */
 433 {
 434         const struct cname *cn;
 435         size_t          len;
 436
 437         /* generic:  one-chr names stand for themselves */
 438         assert(startp < endp);
 439         len = endp - startp;
 440         if (len == 1)
 441                 return *startp;
 442
 443         NOTE(REG_ULOCALE);
 444
 445         /* search table */
 446         for (cn = cnames; cn->name != NULL; cn++)
 447         {
 448                 if (strlen(cn->name) == len &&
 449                         pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
 450                 {
 451                         break;                          /* NOTE BREAK OUT */
 452                 }
 453         }
 454         if (cn->name != NULL)
 455                 return CHR(cn->code);
 456
 457         /* couldn't find it */
 458         ERR(REG_ECOLLATE);
 459         return 0;
 460 }
 461
 462 /*
 463  * range - supply cvec for a range, including legality check
 464  */
 465 static struct cvec *
 466 range(struct vars * v,                  /* context */
 467           celt a,                                       /* range start */
 468           celt b,                                       /* range end, might equal a */
 469           int cases)                            /* case-independent? */
 470 {
 471         int                     nchrs;
 472         struct cvec *cv;
 473         celt            c,
 474                                 lc,
 475                                 uc;
 476
 477         if (a != b && !before(a, b))
 478         {
 479                 ERR(REG_ERANGE);
 480                 return NULL;
 481         }
 482
 483         if (!cases)
 484         {                                                       /* easy version */
 485                 cv = getcvec(v, 0, 1);
 486                 NOERRN();
 487                 addrange(cv, a, b);
 488                 return cv;
 489         }
 490
 491         /*
 492          * When case-independent, it's hard to decide when cvec ranges are usable,
 493          * so for now at least, we won't try.  We allocate enough space for two
 494          * case variants plus a little extra for the two title case variants.
 495          */
 496
 497         nchrs = (b - a + 1) * 2 + 4;
 498
 499         cv = getcvec(v, nchrs, 0);
 500         NOERRN();
 501
 502         for (c = a; c <= b; c++)
 503         {
 504                 addchr(cv, c);
 505                 lc = pg_wc_tolower((chr) c);
 506                 if (c != lc)
 507                         addchr(cv, lc);
 508                 uc = pg_wc_toupper((chr) c);
 509                 if (c != uc)
 510                         addchr(cv, uc);
 511         }
 512
 513         return cv;
 514 }
 515
 516 /*
 517  * before - is celt x before celt y, for purposes of range legality?
 518  */
 519 static int                                              /* predicate */
 520 before(celt x, celt y)
 521 {
 522         if (x < y)
 523                 return 1;
 524         return 0;
 525 }
 526
 527 /*
 528  * eclass - supply cvec for an equivalence class
 529  * Must include case counterparts on request.
 530  */
 531 static struct cvec *
 532 eclass(struct vars * v,                 /* context */
 533            celt c,                                      /* Collating element representing the
 534                                                                  * equivalence class. */
 535            int cases)                           /* all cases? */
 536 {
 537         struct cvec *cv;
 538
 539         /* crude fake equivalence class for testing */
 540         if ((v->cflags & REG_FAKE) && c == 'x')
 541         {
 542                 cv = getcvec(v, 4, 0);
 543                 addchr(cv, (chr) 'x');
 544                 addchr(cv, (chr) 'y');
 545                 if (cases)
 546                 {
 547                         addchr(cv, (chr) 'X');
 548                         addchr(cv, (chr) 'Y');
 549                 }
 550                 return cv;
 551         }
 552
 553         /* otherwise, none */
 554         if (cases)
 555                 return allcases(v, c);
 556         cv = getcvec(v, 1, 0);
 557         assert(cv != NULL);
 558         addchr(cv, (chr) c);
 559         return cv;
 560 }
 561
 562 /*
 563  * cclass - supply cvec for a character class
 564  *
 565  * Must include case counterparts on request.
 566  */
 567 static struct cvec *
 568 cclass(struct vars * v,                 /* context */
 569            const chr *startp,           /* where the name starts */
 570            const chr *endp,                     /* just past the end of the name */
 571            int cases)                           /* case-independent? */
 572 {
 573         size_t          len;
 574         struct cvec *cv = NULL;
 575         const char **namePtr;
 576         int                     i,
 577                                 index;
 578
 579         /*
 580          * The following arrays define the valid character class names.
 581          */
 582
 583         static const char *classNames[] = {
 584                 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
 585                 "lower", "print", "punct", "space", "upper", "xdigit", NULL
 586         };
 587
 588         enum classes
 589         {
 590                 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
 591                 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
 592         };
 593
 594         /*
 595          * Map the name to the corresponding enumerated value.
 596          */
 597         len = endp - startp;
 598         index = -1;
 599         for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
 600         {
 601                 if (strlen(*namePtr) == len &&
 602                         pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
 603                 {
 604                         index = i;
 605                         break;
 606                 }
 607         }
 608         if (index == -1)
 609         {
 610                 ERR(REG_ECTYPE);
 611                 return NULL;
 612         }
 613
 614         /*
 615          * Remap lower and upper to alpha if the match is case insensitive.
 616          */
 617
 618         if (cases &&
 619                 ((enum classes) index == CC_LOWER ||
 620                  (enum classes) index == CC_UPPER))
 621                 index = (int) CC_ALPHA;
 622
 623         /*
 624          * Now compute the character class contents.
 625          *
 626          * For the moment, assume that only char codes < 256 can be in these
 627          * classes.
 628          */
 629
 630         switch ((enum classes) index)
 631         {
 632                 case CC_PRINT:
 633                         cv = getcvec(v, UCHAR_MAX, 0);
 634                         if (cv)
 635                         {
 636                                 for (i = 0; i <= UCHAR_MAX; i++)
 637                                 {
 638                                         if (pg_wc_isprint((chr) i))
 639                                                 addchr(cv, (chr) i);
 640                                 }
 641                         }
 642                         break;
 643                 case CC_ALNUM:
 644                         cv = getcvec(v, UCHAR_MAX, 0);
 645                         if (cv)
 646                         {
 647                                 for (i = 0; i <= UCHAR_MAX; i++)
 648                                 {
 649                                         if (pg_wc_isalnum((chr) i))
 650                                                 addchr(cv, (chr) i);
 651                                 }
 652                         }
 653                         break;
 654                 case CC_ALPHA:
 655                         cv = getcvec(v, UCHAR_MAX, 0);
 656                         if (cv)
 657                         {
 658                                 for (i = 0; i <= UCHAR_MAX; i++)
 659                                 {
 660                                         if (pg_wc_isalpha((chr) i))
 661                                                 addchr(cv, (chr) i);
 662                                 }
 663                         }
 664                         break;
 665                 case CC_ASCII:
 666                         cv = getcvec(v, 0, 1);
 667                         if (cv)
 668                                 addrange(cv, 0, 0x7f);
 669                         break;
 670                 case CC_BLANK:
 671                         cv = getcvec(v, 2, 0);
 672                         addchr(cv, '\t');
 673                         addchr(cv, ' ');
 674                         break;
 675                 case CC_CNTRL:
 676                         cv = getcvec(v, 0, 2);
 677                         addrange(cv, 0x0, 0x1f);
 678                         addrange(cv, 0x7f, 0x9f);
 679                         break;
 680                 case CC_DIGIT:
 681                         cv = getcvec(v, 0, 1);
 682                         if (cv)
 683                                 addrange(cv, (chr) '0', (chr) '9');
 684                         break;
 685                 case CC_PUNCT:
 686                         cv = getcvec(v, UCHAR_MAX, 0);
 687                         if (cv)
 688                         {
 689                                 for (i = 0; i <= UCHAR_MAX; i++)
 690                                 {
 691                                         if (pg_wc_ispunct((chr) i))
 692                                                 addchr(cv, (chr) i);
 693                                 }
 694                         }
 695                         break;
 696                 case CC_XDIGIT:
 697                         cv = getcvec(v, 0, 3);
 698                         if (cv)
 699                         {
 700                                 addrange(cv, '0', '9');
 701                                 addrange(cv, 'a', 'f');
 702                                 addrange(cv, 'A', 'F');
 703                         }
 704                         break;
 705                 case CC_SPACE:
 706                         cv = getcvec(v, UCHAR_MAX, 0);
 707                         if (cv)
 708                         {
 709                                 for (i = 0; i <= UCHAR_MAX; i++)
 710                                 {
 711                                         if (pg_wc_isspace((chr) i))
 712                                                 addchr(cv, (chr) i);
 713                                 }
 714                         }
 715                         break;
 716                 case CC_LOWER:
 717                         cv = getcvec(v, UCHAR_MAX, 0);
 718                         if (cv)
 719                         {
 720                                 for (i = 0; i <= UCHAR_MAX; i++)
 721                                 {
 722                                         if (pg_wc_islower((chr) i))
 723                                                 addchr(cv, (chr) i);
 724                                 }
 725                         }
 726                         break;
 727                 case CC_UPPER:
 728                         cv = getcvec(v, UCHAR_MAX, 0);
 729                         if (cv)
 730                         {
 731                                 for (i = 0; i <= UCHAR_MAX; i++)
 732                                 {
 733                                         if (pg_wc_isupper((chr) i))
 734                                                 addchr(cv, (chr) i);
 735                                 }
 736                         }
 737                         break;
 738                 case CC_GRAPH:
 739                         cv = getcvec(v, UCHAR_MAX, 0);
 740                         if (cv)
 741                         {
 742                                 for (i = 0; i <= UCHAR_MAX; i++)
 743                                 {
 744                                         if (pg_wc_isgraph((chr) i))
 745                                                 addchr(cv, (chr) i);
 746                                 }
 747                         }
 748                         break;
 749         }
 750         if (cv == NULL)
 751                 ERR(REG_ESPACE);
 752         return cv;
 753 }
 754
 755 /*
 756  * allcases - supply cvec for all case counterparts of a chr (including itself)
 757  *
 758  * This is a shortcut, preferably an efficient one, for simple characters;
 759  * messy cases are done via range().
 760  */
 761 static struct cvec *
 762 allcases(struct vars * v,               /* context */
 763                  chr pc)                                /* character to get case equivs of */
 764 {
 765         struct cvec *cv;
 766         chr                     c = (chr) pc;
 767         chr                     lc,
 768                                 uc;
 769
 770         lc = pg_wc_tolower((chr) c);
 771         uc = pg_wc_toupper((chr) c);
 772
 773         cv = getcvec(v, 2, 0);
 774         addchr(cv, lc);
 775         if (lc != uc)
 776                 addchr(cv, uc);
 777         return cv;
 778 }
 779
 780 /*
 781  * cmp - chr-substring compare
 782  *
 783  * Backrefs need this.  It should preferably be efficient.
 784  * Note that it does not need to report anything except equal/unequal.
 785  * Note also that the length is exact, and the comparison should not
 786  * stop at embedded NULs!
 787  */
 788 static int                                              /* 0 for equal, nonzero for unequal */
 789 cmp(const chr *x, const chr *y, /* strings to compare */
 790         size_t len)                                     /* exact length of comparison */
 791 {
 792         return memcmp(VS(x), VS(y), len * sizeof(chr));
 793 }
 794
 795 /*
 796  * casecmp - case-independent chr-substring compare
 797  *
 798  * REG_ICASE backrefs need this.  It should preferably be efficient.
 799  * Note that it does not need to report anything except equal/unequal.
 800  * Note also that the length is exact, and the comparison should not
 801  * stop at embedded NULs!
 802  */
 803 static int                                              /* 0 for equal, nonzero for unequal */
 804 casecmp(const chr *x, const chr *y,             /* strings to compare */
 805                 size_t len)                             /* exact length of comparison */
 806 {
 807         for (; len > 0; len--, x++, y++)
 808         {
 809                 if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
 810                         return 1;
 811         }
 812         return 0;
 813 }