import less(1)
[unleashed/tickless.git] / usr / src / lib / libc / port / regex / regcmp.c
blob0ce1b492eff8c60a7cad9376f17bea509c38fc58
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
28 /* All Rights Reserved */
31 * IMPORTANT NOTE:
33 * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
34 * IT IS **NOT** CHARACTER SET INDEPENDENT.
38 #pragma weak _regcmp = regcmp
40 #include "lint.h"
41 #include "mtlib.h"
42 #include <limits.h>
43 #include <stdarg.h>
44 #include <stdlib.h>
45 #include <thread.h>
46 #include <wctype.h>
47 #include <widec.h>
48 #include <string.h>
49 #include "tsd.h"
52 /* CONSTANTS SHARED WITH regex() */
54 #include "regex.h"
56 /* PRIVATE CONSTANTS */
58 #define BACKSLASH '\\'
59 #define CIRCUMFLEX '^'
60 #define COMMA ','
61 #define DASH '-'
62 #define DOLLAR_SIGN '$'
63 #define DOT '.'
64 #define LEFT_CURLY_BRACE '{'
65 #define LEFT_PAREN '('
66 #define LEFT_SQUARE_BRACKET '['
67 #define PLUS '+'
68 #define RIGHT_CURLY_BRACE '}'
69 #define RIGHT_PAREN ')'
70 #define RIGHT_SQUARE_BRACKET ']'
71 #define SINGLE_BYTE_MASK 0xff
72 #define STRINGP_STACK_SIZE 50
73 #define STAR '*'
75 /* PRIVATE GLOBAL VARIABLES */
77 static char *compilep_stack[STRINGP_STACK_SIZE];
78 static char **compilep_stackp;
79 static mutex_t regcmp_lock = DEFAULTMUTEX;
81 /* DECLARATIONS OF PRIVATE FUNCTIONS */
83 static int add_char(char *compilep, wchar_t wchar);
84 static int add_single_char_expr(char *compilep, wchar_t wchar);
/*
 * Common failure path for regcmp(): finish the variable argument
 * traversal, release the regcmp() mutex, free the partially built
 * compiled expression and return a null pointer from the function
 * that invokes the macro.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and stays correct even when it is used as the
 * unbraced body of an if or else.
 */
#define	ERROR_EXIT(mutex_lockp, arg_listp, compile_startp)	\
	do {							\
		va_end(arg_listp);				\
		lmutex_unlock(mutex_lockp);			\
		free((void *)(compile_startp));			\
		return ((char *)0);				\
	} while (0)
93 static int get_count(int *countp, const char *regexp);
94 static int get_digit(const char *regexp);
95 static int get_wchar(wchar_t *wchar, const char *regexp);
96 static char *pop_compilep(void);
97 static char *push_compilep(char *compilep);
98 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
101 /* DEFINITIONS OF PUBLIC VARIABLES */
103 int __i_size;
106 * define thread-specific storage for __i_size
109 int *
110 ___i_size(void)
112 if (thr_main())
113 return (&__i_size);
114 return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
117 #define __i_size (*(___i_size()))
119 /* DEFINITION OF regcmp() */
121 extern char *
122 regcmp(const char *regexp, ...)
124 va_list arg_listp;
125 size_t arg_strlen;
126 boolean_t can_repeat;
127 int char_size;
128 unsigned int class_length;
129 char *compilep;
130 char *compile_startp = NULL;
131 int count_length;
132 wchar_t current_char;
133 int expr_length;
134 int groupn;
135 unsigned int group_length;
136 unsigned int high_bits;
137 boolean_t dash_indicates_range;
138 unsigned int low_bits;
139 int max_count;
140 int min_count;
141 const char *next_argp;
142 wchar_t first_char_in_range;
143 char *regex_typep;
144 int return_arg_number;
145 int substringn;
147 if (___i_size() == NULL)
148 return (NULL);
151 * When compiling a regular expression, regcmp() generates at most
152 * two extra single-byte characters for each character in the
153 * expression, so allocating three times the number of bytes in all
154 * the strings that comprise the regular expression will ensure that
155 * regcmp() won't overwrite the end of the allocated block when
156 * compiling the expression.
159 va_start(arg_listp, regexp);
160 next_argp = regexp;
161 arg_strlen = 0;
162 while (next_argp != NULL) {
163 arg_strlen += strlen(next_argp);
164 next_argp = va_arg(arg_listp, /* const */ char *);
166 va_end(arg_listp);
168 if (arg_strlen == 0)
169 return (NULL);
170 compile_startp = (char *)malloc(3 * arg_strlen + 1);
171 if (compile_startp == NULL)
172 return (NULL);
174 lmutex_lock(&regcmp_lock);
175 __i_size = 0;
176 compilep = compile_startp;
177 compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
179 /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
180 va_start(arg_listp, regexp);
181 next_argp = va_arg(arg_listp, /* const */ char *);
182 char_size = get_wchar(&current_char, regexp);
183 if (char_size < 0) {
184 ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
185 } else if (char_size > 0) {
186 regexp += char_size;
187 } else /* (char_size == 0 ) */ {
188 regexp = next_argp;
189 next_argp = va_arg(arg_listp, /* const */ char *);
190 char_size = get_wchar(&current_char, regexp);
191 if (char_size <= 0) {
192 ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
193 } else {
194 regexp += char_size;
198 /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
200 if (current_char == CIRCUMFLEX) {
201 char_size = get_wchar(&current_char, regexp);
202 if (char_size < 0) {
203 ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
204 } else if (char_size > 0) {
205 regexp += char_size;
206 *compilep = (unsigned char)START_OF_STRING_MARK;
207 compilep++;
208 } else if /* (char_size == 0) && */ (next_argp != NULL) {
209 regexp = next_argp;
210 next_argp = va_arg(arg_listp, /* const */ char *);
211 char_size = get_wchar(&current_char, regexp);
212 if (char_size <= 0) {
213 ERROR_EXIT(&regcmp_lock, arg_listp,
214 compile_startp);
215 } else {
216 regexp += char_size;
218 *compilep = (unsigned char)START_OF_STRING_MARK;
219 compilep++;
220 } else {
221 /* ((char_size==0) && (next_argp==(char *)0)) */
223 * the regular expression is "^"
225 *compilep = (unsigned char)START_OF_STRING_MARK;
226 compilep++;
227 *compilep = (unsigned char)END_REGEX;
228 compilep++;
229 *compilep = '\0';
230 compilep++;
231 __i_size = (int)(compilep - compile_startp);
232 va_end(arg_listp);
233 lmutex_unlock(&regcmp_lock);
234 return (compile_startp);
238 /* COMPILE THE REGULAR EXPRESSION */
240 groupn = 0;
241 substringn = 0;
242 can_repeat = B_FALSE;
243 for (;;) {
246 * At the end of each iteration get the next character
247 * from the regular expression and increment regexp to
248 * point to the following character. Exit when all
249 * the characters in all the strings in the argument
250 * list have been read.
253 switch (current_char) {
256 * No fall-through. Each case ends with either
257 * a break or an error exit. Each case starts
258 * with compilep addressing the next location to
259 * be written in the compiled regular expression,
260 * and with regexp addressing the next character
261 * to be read from the regular expression being
262 * compiled. Each case that doesn't return
263 * increments regexp to address the next character
264 * to be read from the regular expression and
265 * increments compilep to address the next
266 * location to be written in the compiled
267 * regular expression.
269 * NOTE: The comments for each case give the meaning
270 * of the regular expression compiled by the case
271 * and the character string written to the compiled
272 * regular expression by the case. Each single
273 * character
274 * written to the compiled regular expression is
275 * shown enclosed in angle brackets (<>). Each
276 * compiled regular expression begins with a marker
277 * character which is shown as a named constant
278 * (e.g. <ASCII_CHAR>). Character constants are
279 * shown enclosed in single quotes (e.g. <'$'>).
280 * All other single characters written to the
281 * compiled regular expression are shown as lower
282 * case variable names (e.g. <ascii_char> or
283 * <multibyte_char>). Multicharacter
284 * strings written to the compiled regular expression
285 * are shown as variable names followed by elipses
286 * (e.g. <regex...>).
289 case DOLLAR_SIGN:
290 /* end of string marker or simple dollar sign */
291 /* compiles to <END_OF_STRING_MARK> or */
292 /* <ASCII_CHAR><'$'> */
294 char_size = get_wchar(&current_char, regexp);
295 if ((char_size == 0) && (next_argp == NULL)) {
296 can_repeat = B_FALSE;
297 *compilep = (unsigned char)END_OF_STRING_MARK;
298 compilep++;
299 } else {
300 can_repeat = B_TRUE;
301 *compilep = (unsigned char)ASCII_CHAR;
302 regex_typep = compilep;
303 compilep++;
304 *compilep = DOLLAR_SIGN;
305 compilep++;
307 break; /* end case DOLLAR_SIGN */
309 case DOT: /* any character */
311 /* compiles to <ANY_CHAR> */
313 can_repeat = B_TRUE;
314 *compilep = (unsigned char)ANY_CHAR;
315 regex_typep = compilep;
316 compilep++;
318 break; /* end case DOT */
320 case BACKSLASH: /* escaped character */
323 * compiles to <ASCII_CHAR><ascii_char> or
324 * <MULTIBYTE_CHAR><multibyte_char>
327 char_size = get_wchar(&current_char, regexp);
328 if (char_size <= 0) {
329 ERROR_EXIT(&regcmp_lock, arg_listp,
330 compile_startp);
331 } else {
332 regexp += char_size;
333 can_repeat = B_TRUE;
334 expr_length = add_single_char_expr(
335 compilep, current_char);
336 regex_typep = compilep;
337 compilep += expr_length;
339 break; /* end case '\\' */
341 case LEFT_SQUARE_BRACKET:
342 /* start of a character class expression */
345 * [^...c...] compiles to
346 * <NOT_IN_CLASS><class_length><...c...>
347 * [^...a-z...] compiles to
348 * <NOT_IN_CLASS><class_length><...a<THRU>z...>
349 * [...c...] compiles to
350 * <IN_CLASS><class_length><...c...>
351 * [...a-z...] compiles to
352 * <IN_CLASS><class_length><...a<THRU>z...>
354 * NOTE: <class_length> includes the
355 * <class_length> byte
358 can_repeat = B_TRUE;
359 regex_typep = compilep;
361 /* DETERMINE THE CLASS TYPE */
364 * NOTE: This algorithm checks the value of the
365 * "multibyte"
366 * macro in <euc.h> (included in <widec.h> )
367 * to find out if regcmp()
368 * is compiling the regular expression in a
369 * multibyte locale.
371 char_size = get_wchar(&current_char, regexp);
372 if (char_size <= 0) {
373 ERROR_EXIT(&regcmp_lock, arg_listp,
374 compile_startp);
375 } else if (current_char == CIRCUMFLEX) {
376 regexp++;
377 char_size = get_wchar(&current_char, regexp);
378 if (char_size <= 0) {
379 ERROR_EXIT(&regcmp_lock,
380 arg_listp, compile_startp);
381 } else {
382 regexp += char_size;
383 if (!multibyte) {
384 *compilep = (unsigned char)
385 NOT_IN_ASCII_CHAR_CLASS;
386 } else {
387 *compilep = (unsigned char)
388 NOT_IN_MULTIBYTE_CHAR_CLASS;
390 /* leave space for <class_length> */
391 compilep += 2;
393 } else {
394 regexp += char_size;
395 if (!multibyte) {
396 *compilep = (unsigned char)
397 IN_ASCII_CHAR_CLASS;
398 } else {
399 *compilep = (unsigned char)
400 IN_MULTIBYTE_CHAR_CLASS;
402 /* leave space for <class_length> */
403 compilep += 2;
406 /* COMPILE THE CLASS */
408 * check for a leading right square bracket,
409 * which is allowed
412 if (current_char == RIGHT_SQUARE_BRACKET) {
414 * the leading RIGHT_SQUARE_BRACKET may
415 * be part of a character range
416 * expression like "[]-\]"
418 dash_indicates_range = B_TRUE;
419 first_char_in_range = current_char;
420 char_size = get_wchar(&current_char, regexp);
421 if (char_size <= 0) {
422 ERROR_EXIT(&regcmp_lock,
423 arg_listp, compile_startp);
424 } else {
425 regexp += char_size;
426 *compilep = RIGHT_SQUARE_BRACKET;
427 compilep++;
429 } else {
431 * decode the character in the following
432 * while loop and decide then if it can
433 * be the first character
434 * in a character range expression
436 dash_indicates_range = B_FALSE;
439 while (current_char != RIGHT_SQUARE_BRACKET) {
440 if (current_char != DASH) {
442 * if a DASH follows current_char,
443 * current_char, the DASH and the
444 * character that follows the DASH
445 * may form a character range
446 * expression
448 dash_indicates_range = B_TRUE;
449 first_char_in_range = current_char;
450 expr_length = add_char(
451 compilep, current_char);
452 compilep += expr_length;
454 } else if /* (current_char == DASH) && */
455 (dash_indicates_range == B_FALSE) {
457 * current_char is a DASH, but
458 * either begins the entire
459 * character class or follows a
460 * character that's already
461 * part of a character range
462 * expression, so it simply
463 * represents the DASH character
464 * itself
466 *compilep = DASH;
467 compilep ++;
469 * if another DASH follows this
470 * one, this DASH is part
471 * of a character range expression
472 * like "[--\]"
474 dash_indicates_range = B_TRUE;
475 first_char_in_range = current_char;
477 } else {
479 * ((current_char == DASH &&/
480 * (dash_indicates_range == B_TRUE))
484 * the DASH appears after a single
485 * character that isn't
486 * already part of a character
487 * range expression, so it
488 * and the characters preceding
489 * and following it can form a
490 * character range expression
491 * like "[a-z]"
493 char_size = get_wchar(
494 &current_char, regexp);
495 if (char_size <= 0) {
496 ERROR_EXIT(&regcmp_lock,
497 arg_listp, compile_startp);
499 } else if (current_char ==
500 RIGHT_SQUARE_BRACKET) {
502 * the preceding DASH is
503 * the last character in the
504 * class and represents the
505 * DASH character itself
507 *compilep = DASH;
508 compilep++;
510 } else if (valid_range(
511 first_char_in_range,
512 current_char) == B_FALSE) {
513 ERROR_EXIT(&regcmp_lock,
514 arg_listp, compile_startp);
515 } else {
517 * the DASH is part of a
518 * character range
519 * expression; encode the
520 * rest of the expression
522 regexp += char_size;
523 *compilep = (unsigned char)
524 THRU;
525 compilep++;
526 expr_length = add_char(
527 compilep, current_char);
528 compilep += expr_length;
530 * if a DASH follows this
531 * character range
532 * expression,
533 * it represents the DASH
534 * character itself
536 dash_indicates_range =
537 B_FALSE;
541 /* GET THE NEXT CHARACTER */
543 char_size = get_wchar(&current_char, regexp);
544 if (char_size <= 0) {
545 ERROR_EXIT(&regcmp_lock,
546 arg_listp, compile_startp);
547 } else {
548 regexp += char_size;
552 /* end while (current_char != RIGHT_SQUARE_BRACKET) */
554 /* INSERT THE LENGTH OF THE CLASS INTO THE */
555 /* COMPILED EXPRESSION */
557 class_length = (unsigned int)
558 (compilep - regex_typep - 1);
559 if ((class_length < 2) ||
560 (class_length > MAX_SINGLE_BYTE_INT)) {
561 ERROR_EXIT(&regcmp_lock, arg_listp,
562 compile_startp);
563 } else {
564 *(regex_typep + 1) = (unsigned char)
565 class_length;
567 break; /* end case LEFT_SQUARE_BRACKET */
569 case LEFT_PAREN:
572 * start of a parenthesized group of regular
573 * expressions compiles to <'\0'><'\0'>, leaving
574 * space in the compiled regular expression for
575 * <group_type|ADDED_LENGTH_BITS><group_length>
578 if (push_compilep(compilep) == NULL) {
580 * groups can contain groups, so group
581 * start pointers
582 * must be saved and restored in sequence
584 ERROR_EXIT(&regcmp_lock, arg_listp,
585 compile_startp);
586 } else {
587 can_repeat = B_FALSE;
588 *compilep = '\0'; /* for debugging */
589 compilep++;
590 *compilep = '\0'; /* for debugging */
591 compilep++;
593 break; /* end case LEFT_PAREN */
595 case RIGHT_PAREN:
596 /* end of a marked group of regular expressions */
599 * (<regex>)$0-9 compiles to
600 * <SAVED_GROUP><substringn><compiled_regex...>\
601 * <END_SAVED_GROUP><substringn><return_arg_number>
602 * (<regex>)* compiles to
603 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
604 * <group_length> <compiled_regex...>
605 * <END_GROUP|ZERO_OR_MORE><groupn>
606 * (<regex>)+ compiles to
607 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
608 * <group_length>\
609 * <compiled_regex...><END_GROUP|ONE_OR_MORE>
610 * <groupn>
611 * (<regex>){...} compiles to
612 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
613 * <compiled_regex...><END_GROUP|COUNT><groupn>\
614 * <minimum_repeat_count><maximum_repeat_count>
615 * otherwise (<regex>) compiles to
616 * <SIMPLE_GROUP><blank><compiled_regex...>
617 * <END_GROUP><groupn>
619 * NOTE:
621 * group_length + (256 * ADDED_LENGTH_BITS) ==
622 * length_of(<compiled_regex...><END_GROUP|...>
623 * <groupn>)
624 * which also ==
625 * length_of(<group_type|ADDED_LENGTH_BITS>
626 * <group_length>\ <compiled_regex...>)
627 * groupn no longer seems to be used, but the code
628 * still computes it to preserve backward
629 * compatibility
630 * with earlier versions of regex().
633 /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
635 regex_typep = pop_compilep();
636 if (regex_typep == NULL) {
637 ERROR_EXIT(&regcmp_lock, arg_listp,
638 compile_startp);
640 char_size = get_wchar(&current_char, regexp);
641 if (char_size < 0) {
642 ERROR_EXIT(&regcmp_lock, arg_listp,
643 compile_startp);
644 } else if (char_size == 0) {
645 *regex_typep = SIMPLE_GROUP;
646 can_repeat = B_TRUE;
647 *compilep = (unsigned char)END_GROUP;
648 regex_typep = compilep;
649 compilep++;
650 *compilep = (unsigned char)groupn;
651 groupn++;
652 compilep++;
653 } else if (current_char == DOLLAR_SIGN) {
654 *regex_typep = SAVED_GROUP;
655 regex_typep++;
656 *regex_typep = (char)substringn;
657 can_repeat = B_FALSE;
658 regexp ++;
659 return_arg_number = get_digit(regexp);
660 if ((return_arg_number < 0) ||
661 (substringn >= NSUBSTRINGS)) {
662 ERROR_EXIT(&regcmp_lock, arg_listp,
663 compile_startp);
665 regexp++;
666 *compilep = (unsigned char)END_SAVED_GROUP;
667 compilep++;
668 *compilep = (unsigned char)substringn;
669 substringn++;
670 compilep++;
671 *compilep = (unsigned char)return_arg_number;
672 compilep++;
673 } else {
674 switch (current_char) {
675 case STAR:
676 *regex_typep = ZERO_OR_MORE_GROUP;
677 break;
678 case PLUS:
679 *regex_typep = ONE_OR_MORE_GROUP;
680 break;
681 case LEFT_CURLY_BRACE:
682 *regex_typep = COUNTED_GROUP;
683 break;
684 default:
685 *regex_typep = SIMPLE_GROUP;
687 if (*regex_typep != SIMPLE_GROUP) {
688 group_length = (unsigned int)
689 (compilep - regex_typep);
690 if (group_length >= 1024) {
691 ERROR_EXIT(&regcmp_lock,
692 arg_listp, compile_startp);
694 high_bits = group_length >>
695 TIMES_256_SHIFT;
696 low_bits = group_length &
697 SINGLE_BYTE_MASK;
698 *regex_typep =
699 (unsigned char)
700 ((unsigned int)
701 *regex_typep | high_bits);
702 regex_typep++;
703 *regex_typep =
704 (unsigned char)low_bits;
706 can_repeat = B_TRUE;
707 *compilep = (unsigned char)END_GROUP;
708 regex_typep = compilep;
709 compilep++;
710 *compilep = (unsigned char)groupn;
711 groupn++;
712 compilep++;
715 break; /* end case RIGHT_PAREN */
717 case STAR: /* zero or more repetitions of the */
718 /* preceding expression */
721 * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
722 * <compiled_regex...>
723 * (<regex...>)* compiles to
724 * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
725 * <group_length><compiled_regex...>\
726 * <END_GROUP|ZERO_OR_MORE><groupn>
729 if (can_repeat == B_FALSE) {
730 ERROR_EXIT(&regcmp_lock, arg_listp,
731 compile_startp);
732 } else {
733 can_repeat = B_FALSE;
734 *regex_typep = (unsigned char)
735 ((unsigned int)*regex_typep | ZERO_OR_MORE);
737 break; /* end case '*' */
739 case PLUS:
740 /* one or more repetitions of the preceding */
741 /* expression */
744 * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
745 * <compiled_regex...> (<regex...>)+ compiles to
746 * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
747 * <group_length><compiled_regex...>\
748 * <END_GROUP|ONE_OR_MORE><groupn>
751 if (can_repeat == B_FALSE) {
752 ERROR_EXIT(&regcmp_lock, arg_listp,
753 compile_startp);
754 } else {
755 can_repeat = B_FALSE;
756 *regex_typep =
757 (unsigned char)((unsigned int)*
758 regex_typep | ONE_OR_MORE);
760 break; /* end case '+' */
762 case LEFT_CURLY_BRACE:
765 * repeat the preceding regular expression
766 * at least min_count times
767 * and at most max_count times
769 * <regex...>{min_count} compiles to
770 * <regex type|COUNT><compiled_regex...>
771 * <min_count><min_count>
773 * <regex...>{min_count,} compiles to
774 * <regex type|COUNT><compiled_regex...>
775 * <min_count><UNLIMITED>
777 * <regex...>{min_count,max_count} compiles to
778 * <regex type>|COUNT><compiled_regex...>
779 * <min_count><max_count>
781 * (<regex...>){min_count,max_count} compiles to
782 * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
783 * <compiled_regex...><END_GROUP|COUNT><groupn>\
784 * <minimum_match_count><maximum_match_count>
787 if (can_repeat == B_FALSE) {
788 ERROR_EXIT(&regcmp_lock, arg_listp,
789 compile_startp);
791 can_repeat = B_FALSE;
792 *regex_typep = (unsigned char)((unsigned int)*
793 regex_typep | COUNT);
794 count_length = get_count(&min_count, regexp);
795 if (count_length <= 0) {
796 ERROR_EXIT(&regcmp_lock, arg_listp,
797 compile_startp);
799 regexp += count_length;
801 if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
802 regexp++;
803 max_count = min_count;
804 } else if (*regexp == COMMA) { /* {min_count,..} */
805 regexp++;
806 /* {min_count,} */
807 if (*regexp == RIGHT_CURLY_BRACE) {
808 regexp++;
809 max_count = UNLIMITED;
810 } else { /* {min_count,max_count} */
811 count_length = get_count(
812 &max_count, regexp);
813 if (count_length <= 0) {
814 ERROR_EXIT(&regcmp_lock,
815 arg_listp, compile_startp);
817 regexp += count_length;
818 if (*regexp != RIGHT_CURLY_BRACE) {
819 ERROR_EXIT(&regcmp_lock,
820 arg_listp, compile_startp);
822 regexp++;
824 } else { /* invalid expression */
825 ERROR_EXIT(&regcmp_lock, arg_listp,
826 compile_startp);
829 if ((min_count > MAX_SINGLE_BYTE_INT) ||
830 ((max_count != UNLIMITED) &&
831 (min_count > max_count))) {
832 ERROR_EXIT(&regcmp_lock, arg_listp,
833 compile_startp);
834 } else {
835 *compilep = (unsigned char)min_count;
836 compilep++;
837 *compilep = (unsigned char)max_count;
838 compilep++;
840 break; /* end case LEFT_CURLY_BRACE */
842 default: /* a single non-special character */
845 * compiles to <ASCII_CHAR><ascii_char> or
846 * <MULTIBYTE_CHAR><multibyte_char>
849 can_repeat = B_TRUE;
850 regex_typep = compilep;
851 expr_length = add_single_char_expr(compilep,
852 current_char);
853 compilep += expr_length;
855 } /* end switch (current_char) */
857 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
859 char_size = get_wchar(&current_char, regexp);
860 if (char_size < 0) {
861 ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
862 } else if (char_size > 0) {
863 regexp += char_size;
864 } else if /* (char_size == 0) && */ (next_argp != NULL) {
865 regexp = next_argp;
866 next_argp = va_arg(arg_listp, /* const */ char *);
867 char_size = get_wchar(&current_char, regexp);
868 if (char_size <= 0) {
869 ERROR_EXIT(&regcmp_lock, arg_listp,
870 compile_startp);
871 } else {
872 regexp += char_size;
874 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
875 if (pop_compilep() != NULL) {
876 /* unmatched parentheses */
877 ERROR_EXIT(&regcmp_lock, arg_listp,
878 compile_startp);
880 *compilep = (unsigned char)END_REGEX;
881 compilep++;
882 *compilep = '\0';
883 compilep++;
884 __i_size = (int)(compilep - compile_startp);
885 va_end(arg_listp);
886 lmutex_unlock(&regcmp_lock);
887 return (compile_startp);
889 } /* end for (;;) */
891 } /* regcmp() */
894 /* DEFINITIONS OF PRIVATE FUNCTIONS */
/*
 * Store one character at compilep and return the number of bytes
 * written.  Values up to 0x7f are stored directly as a single byte;
 * anything larger is converted with wctomb(), whose return value
 * (including a negative failure value) is passed back unchanged.
 */
static int
add_char(char *compilep, wchar_t wchar)
{
	if ((unsigned int)wchar > (unsigned int)0x7f)
		return (wctomb(compilep, wchar));

	*compilep = (unsigned char)wchar;
	return (1);
}
910 static int
911 add_single_char_expr(char *compilep, wchar_t wchar)
913 int expr_length = 0;
915 if ((unsigned int)wchar <= (unsigned int)0x7f) {
916 *compilep = (unsigned char)ASCII_CHAR;
917 compilep++;
918 *compilep = (unsigned char)wchar;
919 expr_length += 2;
920 } else {
921 *compilep = (unsigned char)MULTIBYTE_CHAR;
922 compilep++;
923 expr_length++;
924 expr_length += wctomb(compilep, wchar);
926 return (expr_length);
/*
 * Parse the unsigned decimal number at the start of regexp.
 *
 * countp	receives the parsed value (0 when regexp does not start
 *		with a digit); left untouched when regexp is NULL
 * regexp	text to parse; may be NULL
 *
 * Returns the number of digit characters consumed, or 0 if regexp is
 * NULL or does not begin with a digit.
 *
 * A digit run whose value does not fit in an int saturates at INT_MAX
 * instead of overflowing (signed overflow is undefined behavior).
 * Every digit is still consumed, so the returned length is unaffected.
 */
static int
get_count(int *countp, const char *regexp)
{
	int	count = 0;
	int	count_length = 0;

	if (regexp == NULL)
		return (0);

	while ((*regexp >= '0') && (*regexp <= '9')) {
		int digit = (int)(*regexp - '0');

		/* true exactly when (10 * count) + digit exceeds INT_MAX */
		if (count > (INT_MAX - digit) / 10) {
			count = INT_MAX;
		} else {
			count = (10 * count) + digit;
		}
		count_length++;
		regexp++;
	}
	*countp = count;
	return (count_length);
}
/*
 * Return the value (0-9) of the decimal digit that regexp points to,
 * or -1 if regexp is NULL or its first character is not a digit.
 */
static int
get_digit(const char *regexp)
{
	if ((regexp != NULL) && (*regexp >= '0') && (*regexp <= '9'))
		return ((int)(*regexp - '0'));

	return ((int)-1);
}
/*
 * Decode the character at the start of regexp into *wcharp.
 *
 * Returns 0 (and stores a null wide character) when regexp is NULL or
 * points at the terminating '\0', 1 for a byte no larger than 0x7f,
 * and otherwise whatever mbtowc() returns for up to MB_LEN_MAX bytes,
 * which may be negative for an invalid sequence.
 */
static int
get_wchar(wchar_t *wcharp, const char *regexp)
{
	unsigned char lead;

	if ((regexp == NULL) || (*regexp == '\0')) {
		*wcharp = (wchar_t)0;
		return (0);
	}

	lead = (unsigned char)*regexp;
	if (lead > (unsigned char)0x7f)
		return (mbtowc(wcharp, regexp, MB_LEN_MAX));

	*wcharp = (wchar_t)lead;
	return (1);
}
988 static char *
989 pop_compilep(void)
991 char *compilep;
993 if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
994 return (NULL);
995 } else {
996 compilep = *compilep_stackp;
997 compilep_stackp++;
998 return (compilep);
1002 static char *
1003 push_compilep(char *compilep)
1005 if (compilep_stackp <= &compilep_stack[0]) {
1006 return (NULL);
1007 } else {
1008 compilep_stackp--;
1009 *compilep_stackp = compilep;
1010 return (compilep);
1014 static boolean_t
1015 valid_range(wchar_t lower_char, wchar_t upper_char)
1017 return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
1018 !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
1019 (lower_char < upper_char)) ||
1020 (((lower_char & WCHAR_CSMASK) ==
1021 (upper_char & WCHAR_CSMASK)) &&
1022 (lower_char < upper_char)));