Force a checkpoint in CREATE DATABASE before starting to copy the files,
[PostgreSQL.git] / src / backend / regex / regc_locale.c
blobd832a537001bbeef94fcd9da9ceb2122d3eaf359
1 /*
2 * regc_locale.c --
4 * This file contains locale-specific regexp routines.
5 * This file is #included by regcomp.c.
7 * Copyright (c) 1998 by Scriptics Corporation.
9 * This software is copyrighted by the Regents of the University of
10 * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 * Corporation and other parties. The following terms apply to all files
12 * associated with the software unless explicitly disclaimed in
13 * individual files.
15 * The authors hereby grant permission to use, copy, modify, distribute,
16 * and license this software and its documentation for any purpose, provided
17 * that existing copyright notices are retained in all copies and that this
18 * notice is included verbatim in any distributions. No written agreement,
19 * license, or royalty fee is required for any of the authorized uses.
20 * Modifications to this software may be copyrighted by their authors
21 * and need not follow the licensing terms described here, provided that
22 * the new terms are clearly indicated on the first page of each file where
23 * they apply.
25 * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
31 * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 * MODIFICATIONS.
38 * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 * U.S. government, the Government shall have only "Restricted Rights"
40 * in the software and related documentation as defined in the Federal
41 * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 * are acquiring the software on behalf of the Department of Defense, the
43 * software shall be classified as "Commercial Computer Software" and the
44 * Government shall have only "Restricted Rights" as defined in Clause
45 * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 * authors grant the U.S. Government and others acting in its behalf
47 * permission to use and distribute the software in accordance with the
48 * terms specified in this license.
50 * $PostgreSQL$
/*
 * ASCII character-name table.
 *
 * Each entry pairs a symbolic name with the single character it denotes.
 * Several characters have more than one name.  The list ends with an entry
 * whose name is NULL, which is what table scans test for.
 */
static const struct cname
{
	const char *name;			/* symbolic name, NULL in the terminator */
	const char	code;			/* character the name stands for */
}	cnames[] =

{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation below the digits */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* end-of-table marker */
	{NULL, 0}
};
353 * some ctype functions with non-ascii-char guard
355 static int
356 pg_wc_isdigit(pg_wchar c)
358 return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
361 static int
362 pg_wc_isalpha(pg_wchar c)
364 return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
367 static int
368 pg_wc_isalnum(pg_wchar c)
370 return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
373 static int
374 pg_wc_isupper(pg_wchar c)
376 return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
379 static int
380 pg_wc_islower(pg_wchar c)
382 return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
385 static int
386 pg_wc_isgraph(pg_wchar c)
388 return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
391 static int
392 pg_wc_isprint(pg_wchar c)
394 return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
397 static int
398 pg_wc_ispunct(pg_wchar c)
400 return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
403 static int
404 pg_wc_isspace(pg_wchar c)
406 return (c >= 0 && c <= UCHAR_MAX && isspace((unsigned char) c));
409 static pg_wchar
410 pg_wc_toupper(pg_wchar c)
412 if (c >= 0 && c <= UCHAR_MAX)
413 return toupper((unsigned char) c);
414 return c;
417 static pg_wchar
418 pg_wc_tolower(pg_wchar c)
420 if (c >= 0 && c <= UCHAR_MAX)
421 return tolower((unsigned char) c);
422 return c;
427 * element - map collating-element name to celt
429 static celt
430 element(struct vars * v, /* context */
431 const chr *startp, /* points to start of name */
432 const chr *endp) /* points just past end of name */
434 const struct cname *cn;
435 size_t len;
437 /* generic: one-chr names stand for themselves */
438 assert(startp < endp);
439 len = endp - startp;
440 if (len == 1)
441 return *startp;
443 NOTE(REG_ULOCALE);
445 /* search table */
446 for (cn = cnames; cn->name != NULL; cn++)
448 if (strlen(cn->name) == len &&
449 pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
451 break; /* NOTE BREAK OUT */
454 if (cn->name != NULL)
455 return CHR(cn->code);
457 /* couldn't find it */
458 ERR(REG_ECOLLATE);
459 return 0;
463 * range - supply cvec for a range, including legality check
465 static struct cvec *
466 range(struct vars * v, /* context */
467 celt a, /* range start */
468 celt b, /* range end, might equal a */
469 int cases) /* case-independent? */
471 int nchrs;
472 struct cvec *cv;
473 celt c,
477 if (a != b && !before(a, b))
479 ERR(REG_ERANGE);
480 return NULL;
483 if (!cases)
484 { /* easy version */
485 cv = getcvec(v, 0, 1);
486 NOERRN();
487 addrange(cv, a, b);
488 return cv;
492 * When case-independent, it's hard to decide when cvec ranges are usable,
493 * so for now at least, we won't try. We allocate enough space for two
494 * case variants plus a little extra for the two title case variants.
497 nchrs = (b - a + 1) * 2 + 4;
499 cv = getcvec(v, nchrs, 0);
500 NOERRN();
502 for (c = a; c <= b; c++)
504 addchr(cv, c);
505 lc = pg_wc_tolower((chr) c);
506 if (c != lc)
507 addchr(cv, lc);
508 uc = pg_wc_toupper((chr) c);
509 if (c != uc)
510 addchr(cv, uc);
513 return cv;
517 * before - is celt x before celt y, for purposes of range legality?
519 static int /* predicate */
520 before(celt x, celt y)
522 if (x < y)
523 return 1;
524 return 0;
528 * eclass - supply cvec for an equivalence class
529 * Must include case counterparts on request.
531 static struct cvec *
532 eclass(struct vars * v, /* context */
533 celt c, /* Collating element representing the
534 * equivalence class. */
535 int cases) /* all cases? */
537 struct cvec *cv;
539 /* crude fake equivalence class for testing */
540 if ((v->cflags & REG_FAKE) && c == 'x')
542 cv = getcvec(v, 4, 0);
543 addchr(cv, (chr) 'x');
544 addchr(cv, (chr) 'y');
545 if (cases)
547 addchr(cv, (chr) 'X');
548 addchr(cv, (chr) 'Y');
550 return cv;
553 /* otherwise, none */
554 if (cases)
555 return allcases(v, c);
556 cv = getcvec(v, 1, 0);
557 assert(cv != NULL);
558 addchr(cv, (chr) c);
559 return cv;
563 * cclass - supply cvec for a character class
565 * Must include case counterparts on request.
567 static struct cvec *
568 cclass(struct vars * v, /* context */
569 const chr *startp, /* where the name starts */
570 const chr *endp, /* just past the end of the name */
571 int cases) /* case-independent? */
573 size_t len;
574 struct cvec *cv = NULL;
575 const char **namePtr;
576 int i,
577 index;
580 * The following arrays define the valid character class names.
583 static const char *classNames[] = {
584 "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
585 "lower", "print", "punct", "space", "upper", "xdigit", NULL
588 enum classes
590 CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
591 CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
595 * Map the name to the corresponding enumerated value.
597 len = endp - startp;
598 index = -1;
599 for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
601 if (strlen(*namePtr) == len &&
602 pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
604 index = i;
605 break;
608 if (index == -1)
610 ERR(REG_ECTYPE);
611 return NULL;
615 * Remap lower and upper to alpha if the match is case insensitive.
618 if (cases &&
619 ((enum classes) index == CC_LOWER ||
620 (enum classes) index == CC_UPPER))
621 index = (int) CC_ALPHA;
624 * Now compute the character class contents.
626 * For the moment, assume that only char codes < 256 can be in these
627 * classes.
630 switch ((enum classes) index)
632 case CC_PRINT:
633 cv = getcvec(v, UCHAR_MAX, 0);
634 if (cv)
636 for (i = 0; i <= UCHAR_MAX; i++)
638 if (pg_wc_isprint((chr) i))
639 addchr(cv, (chr) i);
642 break;
643 case CC_ALNUM:
644 cv = getcvec(v, UCHAR_MAX, 0);
645 if (cv)
647 for (i = 0; i <= UCHAR_MAX; i++)
649 if (pg_wc_isalnum((chr) i))
650 addchr(cv, (chr) i);
653 break;
654 case CC_ALPHA:
655 cv = getcvec(v, UCHAR_MAX, 0);
656 if (cv)
658 for (i = 0; i <= UCHAR_MAX; i++)
660 if (pg_wc_isalpha((chr) i))
661 addchr(cv, (chr) i);
664 break;
665 case CC_ASCII:
666 cv = getcvec(v, 0, 1);
667 if (cv)
668 addrange(cv, 0, 0x7f);
669 break;
670 case CC_BLANK:
671 cv = getcvec(v, 2, 0);
672 addchr(cv, '\t');
673 addchr(cv, ' ');
674 break;
675 case CC_CNTRL:
676 cv = getcvec(v, 0, 2);
677 addrange(cv, 0x0, 0x1f);
678 addrange(cv, 0x7f, 0x9f);
679 break;
680 case CC_DIGIT:
681 cv = getcvec(v, 0, 1);
682 if (cv)
683 addrange(cv, (chr) '0', (chr) '9');
684 break;
685 case CC_PUNCT:
686 cv = getcvec(v, UCHAR_MAX, 0);
687 if (cv)
689 for (i = 0; i <= UCHAR_MAX; i++)
691 if (pg_wc_ispunct((chr) i))
692 addchr(cv, (chr) i);
695 break;
696 case CC_XDIGIT:
697 cv = getcvec(v, 0, 3);
698 if (cv)
700 addrange(cv, '0', '9');
701 addrange(cv, 'a', 'f');
702 addrange(cv, 'A', 'F');
704 break;
705 case CC_SPACE:
706 cv = getcvec(v, UCHAR_MAX, 0);
707 if (cv)
709 for (i = 0; i <= UCHAR_MAX; i++)
711 if (pg_wc_isspace((chr) i))
712 addchr(cv, (chr) i);
715 break;
716 case CC_LOWER:
717 cv = getcvec(v, UCHAR_MAX, 0);
718 if (cv)
720 for (i = 0; i <= UCHAR_MAX; i++)
722 if (pg_wc_islower((chr) i))
723 addchr(cv, (chr) i);
726 break;
727 case CC_UPPER:
728 cv = getcvec(v, UCHAR_MAX, 0);
729 if (cv)
731 for (i = 0; i <= UCHAR_MAX; i++)
733 if (pg_wc_isupper((chr) i))
734 addchr(cv, (chr) i);
737 break;
738 case CC_GRAPH:
739 cv = getcvec(v, UCHAR_MAX, 0);
740 if (cv)
742 for (i = 0; i <= UCHAR_MAX; i++)
744 if (pg_wc_isgraph((chr) i))
745 addchr(cv, (chr) i);
748 break;
750 if (cv == NULL)
751 ERR(REG_ESPACE);
752 return cv;
756 * allcases - supply cvec for all case counterparts of a chr (including itself)
758 * This is a shortcut, preferably an efficient one, for simple characters;
759 * messy cases are done via range().
761 static struct cvec *
762 allcases(struct vars * v, /* context */
763 chr pc) /* character to get case equivs of */
765 struct cvec *cv;
766 chr c = (chr) pc;
767 chr lc,
770 lc = pg_wc_tolower((chr) c);
771 uc = pg_wc_toupper((chr) c);
773 cv = getcvec(v, 2, 0);
774 addchr(cv, lc);
775 if (lc != uc)
776 addchr(cv, uc);
777 return cv;
781 * cmp - chr-substring compare
783 * Backrefs need this. It should preferably be efficient.
784 * Note that it does not need to report anything except equal/unequal.
785 * Note also that the length is exact, and the comparison should not
786 * stop at embedded NULs!
788 static int /* 0 for equal, nonzero for unequal */
789 cmp(const chr *x, const chr *y, /* strings to compare */
790 size_t len) /* exact length of comparison */
792 return memcmp(VS(x), VS(y), len * sizeof(chr));
796 * casecmp - case-independent chr-substring compare
798 * REG_ICASE backrefs need this. It should preferably be efficient.
799 * Note that it does not need to report anything except equal/unequal.
800 * Note also that the length is exact, and the comparison should not
801 * stop at embedded NULs!
803 static int /* 0 for equal, nonzero for unequal */
804 casecmp(const chr *x, const chr *y, /* strings to compare */
805 size_t len) /* exact length of comparison */
807 for (; len > 0; len--, x++, y++)
809 if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
810 return 1;
812 return 0;