usr/src/lib/libc/port/regex/regcmp.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29
  30 /*
  31  * IMPORTANT NOTE:
  32  *
  33  * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
  34  * IT IS **NOT** CHARACTER SET INDEPENDENT.
  35  *
  36  */
  37
  38 #pragma weak _regcmp = regcmp
  39
  40 #include "lint.h"
  41 #include "mtlib.h"
  42 #include <limits.h>
  43 #include <stdarg.h>
  44 #include <stdlib.h>
  45 #include <thread.h>
  46 #include <wctype.h>
  47 #include <widec.h>
  48 #include <string.h>
  49 #include "tsd.h"
  50
  51
  52 /* CONSTANTS SHARED WITH regex() */
  53
  54 #include "regex.h"
  55
  56 /* PRIVATE CONSTANTS */
  57
  58 #define BACKSLASH               '\\'
  59 #define CIRCUMFLEX              '^'
  60 #define COMMA                   ','
  61 #define DASH                    '-'
  62 #define DOLLAR_SIGN             '$'
  63 #define DOT                     '.'
  64 #define LEFT_CURLY_BRACE        '{'
  65 #define LEFT_PAREN              '('
  66 #define LEFT_SQUARE_BRACKET     '['
  67 #define PLUS                    '+'
  68 #define RIGHT_CURLY_BRACE       '}'
  69 #define RIGHT_PAREN             ')'
  70 #define RIGHT_SQUARE_BRACKET    ']'
  71 #define SINGLE_BYTE_MASK        0xff
  72 #define STRINGP_STACK_SIZE      50
  73 #define STAR                    '*'
  74
  75 /* PRIVATE GLOBAL VARIABLES */
  76
  77 static char     *compilep_stack[STRINGP_STACK_SIZE];
  78 static char     **compilep_stackp;
  79 static mutex_t  regcmp_lock = DEFAULTMUTEX;
  80
  81 /* DECLARATIONS OF PRIVATE FUNCTIONS */
  82
  83 static int add_char(char *compilep, wchar_t wchar);
  84 static int add_single_char_expr(char *compilep, wchar_t wchar);
  85
   86 #define ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \
   87         /* error exit: clean up and return NULL; use only inside braces */ \
   88         va_end(arg_listp); \
   89         lmutex_unlock(mutex_lockp); \
   90         free((void *)compile_startp); \
   91         return ((char *)0)
  92
  93 static int get_count(int *countp, const char *regexp);
  94 static int get_digit(const char *regexp);
  95 static int get_wchar(wchar_t *wchar, const char *regexp);
  96 static char *pop_compilep(void);
  97 static char *push_compilep(char *compilep);
  98 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
  99
 100
 101 /* DEFINITIONS OF PUBLIC VARIABLES */
 102
 103 int __i_size;
 104
  105 /*
  106  * Return the address of this thread's __i_size: the global int when
  107  * thr_main() is nonzero, else tsdalloc()'s result (regcmp() NULL-checks it).
  108  */
  109 int *
  110 ___i_size(void)
  111 {
  112         if (thr_main())
  113                 return (&__i_size);     /* the global int itself */
  114         return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
  115 }
  116
  117 #define         __i_size (*(___i_size()))       /* later uses go via ___i_size() */
 118
 119 /* DEFINITION OF regcmp() */
 120
 121 extern char *
 122 regcmp(const char *regexp, ...)
 123 {
 124         va_list         arg_listp;
 125         size_t          arg_strlen;
 126         boolean_t       can_repeat;
 127         int             char_size;
 128         unsigned int    class_length;
 129         char            *compilep;
 130         char *compile_startp = NULL;
 131         int             count_length;
 132         wchar_t         current_char;
 133         int             expr_length;
 134         int             groupn;
 135         unsigned int    group_length;
 136         unsigned int    high_bits;
 137         boolean_t       dash_indicates_range;
 138         unsigned int    low_bits;
 139         int             max_count;
 140         int             min_count;
 141         const char      *next_argp;
 142         wchar_t         first_char_in_range;
 143         char            *regex_typep;
 144         int             return_arg_number;
 145         int             substringn;
 146
 147         if (___i_size() == NULL)
 148                 return (NULL);
 149
 150         /*
 151          * When compiling a regular expression, regcmp() generates at most
 152          * two extra single-byte characters for each character in the
 153          * expression, so allocating three times the number of bytes in all
 154          * the strings that comprise the regular expression will ensure that
 155          * regcmp() won't overwrite the end of the allocated block when
 156          * compiling the expression.
 157          */
 158
 159         va_start(arg_listp, regexp);
 160         next_argp = regexp;
 161         arg_strlen = 0;
 162         while (next_argp != NULL) {
 163                 arg_strlen += strlen(next_argp);
 164                 next_argp = va_arg(arg_listp, /* const */ char *);
 165         }
 166         va_end(arg_listp);
 167
 168         if (arg_strlen == 0)
 169                 return (NULL);
 170         compile_startp = (char *)malloc(3 * arg_strlen + 1);
 171         if (compile_startp == NULL)
 172                 return (NULL);
 173
 174         lmutex_lock(&regcmp_lock);
 175         __i_size = 0;
 176         compilep = compile_startp;
 177         compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
 178
 179         /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
 180         va_start(arg_listp, regexp);
 181         next_argp = va_arg(arg_listp, /* const */ char *);
 182         char_size = get_wchar(&current_char, regexp);
 183         if (char_size < 0) {
 184                 ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 185         } else if (char_size > 0) {
 186                 regexp += char_size;
 187         } else /* (char_size == 0 ) */ {
 188                 regexp = next_argp;
 189                 next_argp = va_arg(arg_listp, /* const */ char *);
 190                 char_size = get_wchar(&current_char, regexp);
 191                 if (char_size <= 0) {
 192                         ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 193                 } else {
 194                         regexp += char_size;
 195                 }
 196         }
 197
 198         /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
 199
 200         if (current_char == CIRCUMFLEX) {
 201                 char_size = get_wchar(&current_char, regexp);
 202                 if (char_size < 0) {
 203                         ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 204                 } else if (char_size > 0) {
 205                         regexp += char_size;
 206                         *compilep = (unsigned char)START_OF_STRING_MARK;
 207                         compilep++;
 208                 } else if /* (char_size == 0) && */ (next_argp != NULL) {
 209                         regexp = next_argp;
 210                         next_argp = va_arg(arg_listp, /* const */ char *);
 211                         char_size = get_wchar(&current_char, regexp);
 212                         if (char_size <= 0) {
 213                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 214                                     compile_startp);
 215                         } else {
 216                                 regexp += char_size;
 217                         }
 218                         *compilep = (unsigned char)START_OF_STRING_MARK;
 219                         compilep++;
 220                 } else {
 221                         /* ((char_size==0) && (next_argp==(char *)0)) */
 222                         /*
 223                          * the regular expression is "^"
 224                          */
 225                         *compilep = (unsigned char)START_OF_STRING_MARK;
 226                         compilep++;
 227                         *compilep = (unsigned char)END_REGEX;
 228                         compilep++;
 229                         *compilep = '\0';
 230                         compilep++;
 231                         __i_size = (int)(compilep - compile_startp);
 232                         va_end(arg_listp);
 233                         lmutex_unlock(&regcmp_lock);
 234                         return (compile_startp);
 235                 }
 236         }
 237
 238         /* COMPILE THE REGULAR EXPRESSION */
 239
 240         groupn = 0;
 241         substringn = 0;
 242         can_repeat = B_FALSE;
 243         for (;;) {
 244
 245                 /*
 246                  * At the end of each iteration get the next character
 247                  * from the regular expression and increment regexp to
 248                  * point to the following character.  Exit when all
 249                  * the characters in all the strings in the argument
 250                  * list have been read.
 251                  */
 252
 253                 switch (current_char) {
 254
 255                         /*
 256                          * No fall-through.  Each case ends with either
 257                          * a break or an error exit.  Each case starts
 258                          * with compilep addressing the next location to
 259                          * be written in the compiled regular expression,
 260                          * and with regexp addressing the next character
 261                          * to be read from the regular expression being
 262                          * compiled.  Each case that doesn't return
 263                          * increments regexp to address the next character
 264                          * to be read from the regular expression and
 265                          * increments compilep to address the next
 266                          * location to be written in the compiled
 267                          * regular expression.
 268                          *
 269                          * NOTE: The comments for each case give the meaning
 270                          * of the regular expression compiled by the case
 271                          * and the character string written to the compiled
 272                          * regular expression by the case.  Each single
 273                          * character
 274                          * written to the compiled regular expression is
 275                          * shown enclosed in angle brackets (<>).  Each
 276                          * compiled regular expression begins with a marker
 277                          * character which is shown as a named constant
 278                          * (e.g. <ASCII_CHAR>). Character constants are
 279                          * shown enclosed in single quotes (e.g. <'$'>).
 280                          * All other single characters written to the
 281                          * compiled regular expression are shown as lower
 282                          * case variable names (e.g. <ascii_char> or
 283                          * <multibyte_char>). Multicharacter
 284                          * strings written to the compiled regular expression
  285                          * are shown as variable names followed by ellipses
 286                          * (e.g. <regex...>).
 287                          */
 288
 289                 case DOLLAR_SIGN:
 290                         /* end of string marker or simple dollar sign */
 291                         /* compiles to <END_OF_STRING_MARK> or */
 292                         /* <ASCII_CHAR><'$'> */
 293
 294                         char_size = get_wchar(&current_char, regexp);
 295                         if ((char_size == 0) && (next_argp == NULL)) {
 296                                 can_repeat = B_FALSE;
 297                                 *compilep = (unsigned char)END_OF_STRING_MARK;
 298                                 compilep++;
 299                         } else {
 300                                 can_repeat = B_TRUE;
 301                                 *compilep = (unsigned char)ASCII_CHAR;
 302                                 regex_typep = compilep;
 303                                 compilep++;
 304                                 *compilep = DOLLAR_SIGN;
 305                                 compilep++;
 306                         }
 307                         break; /* end case DOLLAR_SIGN */
 308
 309                 case DOT: /* any character */
 310
 311                         /* compiles to <ANY_CHAR> */
 312
 313                         can_repeat = B_TRUE;
 314                         *compilep = (unsigned char)ANY_CHAR;
 315                         regex_typep = compilep;
 316                         compilep++;
 317
 318                         break; /* end case DOT */
 319
 320                 case BACKSLASH: /* escaped character */
 321
 322                         /*
 323                          * compiles to <ASCII_CHAR><ascii_char> or
 324                          * <MULTIBYTE_CHAR><multibyte_char>
 325                          */
 326
 327                         char_size = get_wchar(&current_char, regexp);
 328                         if (char_size <= 0) {
 329                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 330                                     compile_startp);
 331                         } else {
 332                                 regexp += char_size;
 333                                 can_repeat = B_TRUE;
 334                                 expr_length = add_single_char_expr(
 335                                     compilep, current_char);
 336                                 regex_typep = compilep;
 337                                 compilep += expr_length;
 338                         }
 339                         break; /* end case '\\' */
 340
 341                 case LEFT_SQUARE_BRACKET:
 342                         /* start of a character class expression */
 343
 344                         /*
 345                          * [^...c...] compiles to
 346                          * <NOT_IN_CLASS><class_length><...c...>
 347                          * [^...a-z...] compiles to
 348                          * <NOT_IN_CLASS><class_length><...a<THRU>z...>
 349                          * [...c...] compiles to
 350                          * <IN_CLASS><class_length><...c...>
 351                          * [...a-z...] compiles to
 352                          * <IN_CLASS><class_length><...a<THRU>z...>
 353                          *
 354                          * NOTE: <class_length> includes the
 355                          * <class_length> byte
 356                          */
 357
 358                         can_repeat = B_TRUE;
 359                         regex_typep = compilep;
 360
 361                         /* DETERMINE THE CLASS TYPE */
 362
 363                         /*
 364                          * NOTE: This algorithm checks the value of the
 365                          * "multibyte"
 366                          * macro in <euc.h> (included in <widec.h> )
 367                          * to find out if regcmp()
 368                          * is compiling the regular expression in a
 369                          * multibyte locale.
 370                          */
 371                         char_size = get_wchar(&current_char, regexp);
 372                         if (char_size <= 0) {
 373                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 374                                     compile_startp);
 375                         } else if (current_char == CIRCUMFLEX) {
 376                                 regexp++;
 377                                 char_size = get_wchar(&current_char, regexp);
 378                                 if (char_size <= 0) {
 379                                         ERROR_EXIT(&regcmp_lock,
 380                                             arg_listp, compile_startp);
 381                                 } else {
 382                                         regexp += char_size;
 383                                         if (!multibyte) {
 384                                                 *compilep = (unsigned char)
 385                                                     NOT_IN_ASCII_CHAR_CLASS;
 386                                         } else {
 387                                                 *compilep = (unsigned char)
 388                                                     NOT_IN_MULTIBYTE_CHAR_CLASS;
 389                                         }
 390                                         /* leave space for <class_length> */
 391                                         compilep += 2;
 392                                 }
 393                         } else {
 394                                 regexp += char_size;
 395                                 if (!multibyte) {
 396                                         *compilep = (unsigned char)
 397                                             IN_ASCII_CHAR_CLASS;
 398                                 } else {
 399                                         *compilep = (unsigned char)
 400                                             IN_MULTIBYTE_CHAR_CLASS;
 401                                 }
 402                                 /* leave space for <class_length> */
 403                                 compilep += 2;
 404                         }
 405
 406                         /* COMPILE THE CLASS */
 407                         /*
 408                          * check for a leading right square bracket,
 409                          * which is allowed
 410                          */
 411
 412                         if (current_char == RIGHT_SQUARE_BRACKET) {
 413                                 /*
 414                                  * the leading RIGHT_SQUARE_BRACKET may
 415                                  * be part of a character range
 416                                  * expression like "[]-\]"
 417                                  */
 418                                 dash_indicates_range = B_TRUE;
 419                                 first_char_in_range = current_char;
 420                                 char_size = get_wchar(&current_char, regexp);
 421                                 if (char_size <= 0) {
 422                                         ERROR_EXIT(&regcmp_lock,
 423                                             arg_listp, compile_startp);
 424                                 } else {
 425                                         regexp += char_size;
 426                                         *compilep = RIGHT_SQUARE_BRACKET;
 427                                         compilep++;
 428                                 }
 429                         } else {
 430                                 /*
 431                                  * decode the character in the following
 432                                  * while loop and decide then if it can
 433                                  * be the first character
 434                                  * in a character range expression
 435                                  */
 436                                 dash_indicates_range = B_FALSE;
 437                         }
 438
 439                         while (current_char != RIGHT_SQUARE_BRACKET) {
 440                                 if (current_char != DASH) {
 441                                         /*
 442                                          * if a DASH follows current_char,
 443                                          *  current_char, the DASH and the
 444                                          * character that follows the DASH
 445                                          * may form a character range
 446                                          * expression
 447                                          */
 448                                         dash_indicates_range = B_TRUE;
 449                                         first_char_in_range = current_char;
 450                                         expr_length = add_char(
 451                                             compilep, current_char);
 452                                         compilep += expr_length;
 453
 454                                 } else if /* (current_char == DASH) && */
 455                                     (dash_indicates_range == B_FALSE) {
 456                                         /*
 457                                          * current_char is a DASH, but
 458                                          * either begins the entire
 459                                          * character class or follows a
 460                                          * character that's already
 461                                          * part of a character range
 462                                          * expression, so it simply
 463                                          * represents the DASH character
 464                                          * itself
 465                                          */
 466                                         *compilep = DASH;
 467                                         compilep ++;
 468                                         /*
 469                                          * if another DASH follows this
 470                                          * one, this DASH is part
 471                                          * of a character range expression
 472                                          * like "[--\]"
 473                                          */
 474                                         dash_indicates_range = B_TRUE;
 475                                         first_char_in_range = current_char;
 476
 477                                 } else {
 478                                         /*
  479                                          * ((current_char == DASH) &&
 480                                          * (dash_indicates_range == B_TRUE))
 481                                          */
 482
 483                                         /*
 484                                          * the DASH appears after a single
 485                                          * character that isn't
 486                                          * already part of a character
 487                                          * range expression, so it
 488                                          * and the characters preceding
 489                                          * and following it can form a
 490                                          * character range expression
 491                                          * like "[a-z]"
 492                                          */
 493                                         char_size = get_wchar(
 494                                             &current_char, regexp);
 495                                         if (char_size <= 0) {
 496                                                 ERROR_EXIT(&regcmp_lock,
 497                                                     arg_listp, compile_startp);
 498
 499                                         } else if (current_char ==
 500                                             RIGHT_SQUARE_BRACKET) {
 501                                                 /*
 502                                                  * the preceding DASH is
 503                                                  * the last character in the
 504                                                  * class and represents the
 505                                                  * DASH character itself
 506                                                  */
 507                                                 *compilep = DASH;
 508                                                 compilep++;
 509
 510                                         } else if (valid_range(
 511                                             first_char_in_range,
 512                                             current_char) == B_FALSE) {
 513                                                 ERROR_EXIT(&regcmp_lock,
 514                                                     arg_listp, compile_startp);
 515                                         } else {
 516                                                 /*
 517                                                  * the DASH is part of a
 518                                                  * character range
 519                                                  * expression; encode the
 520                                                  * rest of the expression
 521                                                  */
 522                                                 regexp += char_size;
 523                                                 *compilep = (unsigned char)
 524                                                     THRU;
 525                                                 compilep++;
 526                                                 expr_length = add_char(
 527                                                     compilep, current_char);
 528                                                 compilep += expr_length;
 529                                                 /*
 530                                                  * if a DASH follows this
 531                                                  * character range
 532                                                  * expression,
 533                                                  * it represents the DASH
 534                                                  * character itself
 535                                                  */
 536                                                 dash_indicates_range =
 537                                                     B_FALSE;
 538                                         }
 539                                 }
 540
 541                                 /* GET THE NEXT CHARACTER */
 542
 543                                 char_size = get_wchar(&current_char, regexp);
 544                                 if (char_size <= 0) {
 545                                         ERROR_EXIT(&regcmp_lock,
 546                                             arg_listp, compile_startp);
 547                                 } else {
 548                                         regexp += char_size;
 549                                 }
 550
 551                         }
 552                         /* end while (current_char != RIGHT_SQUARE_BRACKET) */
 553
 554                         /* INSERT THE LENGTH OF THE CLASS INTO THE */
 555                         /* COMPILED EXPRESSION */
 556
 557                         class_length = (unsigned int)
 558                             (compilep - regex_typep - 1);
 559                         if ((class_length < 2) ||
 560                             (class_length > MAX_SINGLE_BYTE_INT)) {
 561                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 562                                     compile_startp);
 563                         } else {
 564                                 *(regex_typep + 1) = (unsigned char)
 565                                     class_length;
 566                         }
 567                         break; /* end case LEFT_SQUARE_BRACKET */
 568
 569                 case LEFT_PAREN:
 570
 571                         /*
 572                          * start of a parenthesized group of regular
 573                          * expressions compiles to <'\0'><'\0'>, leaving
 574                          * space in the compiled regular expression for
 575                          * <group_type|ADDED_LENGTH_BITS><group_length>
 576                          */
 577
 578                         if (push_compilep(compilep) == NULL) {
 579                                 /*
 580                                  * groups can contain groups, so group
 581                                  * start pointers
 582                                  * must be saved and restored in sequence
 583                                  */
 584                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 585                                     compile_startp);
 586                         } else {
 587                                 can_repeat = B_FALSE;
 588                                 *compilep = '\0';       /* for debugging */
 589                                 compilep++;
 590                                 *compilep = '\0';       /* for debugging */
 591                                 compilep++;
 592                         }
 593                         break; /* end case LEFT_PAREN */
 594
 595                 case RIGHT_PAREN:
 596                         /* end of a marked group of regular expressions */
 597
 598                         /*
 599                          * (<regex>)$0-9 compiles to
 600                          * <SAVED_GROUP><substringn><compiled_regex...>\
 601                          * <END_SAVED_GROUP><substringn><return_arg_number>
 602                          * (<regex>)* compiles to
 603                          * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
 604                          * <group_length> <compiled_regex...>
 605                          * <END_GROUP|ZERO_OR_MORE><groupn>
 606                          * (<regex>)+ compiles to
 607                          * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
 608                          * <group_length>\
 609                          * <compiled_regex...><END_GROUP|ONE_OR_MORE>
 610                          * <groupn>
 611                          * (<regex>){...} compiles to
 612                          * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
 613                          * <compiled_regex...><END_GROUP|COUNT><groupn>\
 614                          * <minimum_repeat_count><maximum_repeat_count>
 615                          * otherwise (<regex>) compiles to
 616                          * <SIMPLE_GROUP><blank><compiled_regex...>
 617                          * <END_GROUP><groupn>
 618                          *
 619                          * NOTE:
 620                          *
 621                          * group_length + (256 * ADDED_LENGTH_BITS) ==
 622                          * length_of(<compiled_regex...><END_GROUP|...>
 623                          * <groupn>)
 624                          * which also ==
 625                          * length_of(<group_type|ADDED_LENGTH_BITS>
 626                          * <group_length>\ <compiled_regex...>)
 627                          * groupn no longer seems to be used, but the code
 628                          * still computes it to preserve backward
 629                          * compatibility
 630                          * with earlier versions of regex().
 631                          */
 632
 633                         /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
 634
 635                         regex_typep = pop_compilep();
 636                         if (regex_typep == NULL) {
 637                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 638                                     compile_startp);
 639                         }
 640                         char_size = get_wchar(&current_char, regexp);
 641                         if (char_size < 0) {
 642                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 643                                     compile_startp);
 644                         } else if (char_size == 0) {
 645                                 *regex_typep = SIMPLE_GROUP;
 646                                 can_repeat = B_TRUE;
 647                                 *compilep = (unsigned char)END_GROUP;
 648                                 regex_typep = compilep;
 649                                 compilep++;
 650                                 *compilep = (unsigned char)groupn;
 651                                 groupn++;
 652                                 compilep++;
 653                         } else if (current_char == DOLLAR_SIGN) {
 654                                 *regex_typep = SAVED_GROUP;
 655                                 regex_typep++;
 656                                 *regex_typep = (char)substringn;
 657                                 can_repeat = B_FALSE;
 658                                 regexp ++;
 659                                 return_arg_number = get_digit(regexp);
 660                                 if ((return_arg_number < 0) ||
 661                                     (substringn >= NSUBSTRINGS)) {
 662                                         ERROR_EXIT(&regcmp_lock, arg_listp,
 663                                             compile_startp);
 664                                 }
 665                                 regexp++;
 666                                 *compilep = (unsigned char)END_SAVED_GROUP;
 667                                 compilep++;
 668                                 *compilep = (unsigned char)substringn;
 669                                 substringn++;
 670                                 compilep++;
 671                                 *compilep = (unsigned char)return_arg_number;
 672                                 compilep++;
 673                         } else {
 674                                 switch (current_char) {
 675                                 case STAR:
 676                                         *regex_typep = ZERO_OR_MORE_GROUP;
 677                                         break;
 678                                 case PLUS:
 679                                         *regex_typep = ONE_OR_MORE_GROUP;
 680                                         break;
 681                                 case LEFT_CURLY_BRACE:
 682                                         *regex_typep = COUNTED_GROUP;
 683                                         break;
 684                                 default:
 685                                         *regex_typep = SIMPLE_GROUP;
 686                                 }
 687                                 if (*regex_typep != SIMPLE_GROUP) {
 688                                         group_length = (unsigned int)
 689                                             (compilep - regex_typep);
 690                                         if (group_length >= 1024) {
 691                                                 ERROR_EXIT(&regcmp_lock,
 692                                                     arg_listp, compile_startp);
 693                                         }
 694                                         high_bits = group_length >>
 695                                             TIMES_256_SHIFT;
 696                                         low_bits = group_length &
 697                                             SINGLE_BYTE_MASK;
 698                                         *regex_typep =
 699                                             (unsigned char)
 700                                             ((unsigned int)
 701                                             *regex_typep | high_bits);
 702                                         regex_typep++;
 703                                         *regex_typep =
 704                                             (unsigned char)low_bits;
 705                                 }
 706                                 can_repeat = B_TRUE;
 707                                 *compilep = (unsigned char)END_GROUP;
 708                                 regex_typep = compilep;
 709                                 compilep++;
 710                                 *compilep = (unsigned char)groupn;
 711                                 groupn++;
 712                                 compilep++;
 713                         }
 714
 715                         break; /* end case RIGHT_PAREN */
 716
 717                 case STAR: /* zero or more repetitions of the */
 718                                 /* preceding expression */
 719
 720                         /*
 721                          * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
 722                          * <compiled_regex...>
 723                          * (<regex...>)* compiles to
 724                          * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
 725                          * <group_length><compiled_regex...>\
 726                          * <END_GROUP|ZERO_OR_MORE><groupn>
 727                          */
 728
 729                         if (can_repeat == B_FALSE) {
 730                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 731                                     compile_startp);
 732                         } else {
 733                                 can_repeat = B_FALSE;
 734                                 *regex_typep = (unsigned char)
 735                                     ((unsigned int)*regex_typep | ZERO_OR_MORE);
 736                         }
 737                         break; /* end case '*' */
 738
 739                 case PLUS:
 740                         /* one or more repetitions of the preceding */
 741                                 /* expression */
 742
 743                         /*
 744                          * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
 745                          * <compiled_regex...> (<regex...>)+ compiles to
 746                          * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
 747                          * <group_length><compiled_regex...>\
 748                          * <END_GROUP|ONE_OR_MORE><groupn>
 749                          */
 750
 751                         if (can_repeat == B_FALSE) {
 752                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 753                                     compile_startp);
 754                         } else {
 755                                 can_repeat = B_FALSE;
 756                                 *regex_typep =
 757                                     (unsigned char)((unsigned int)*
 758                                     regex_typep | ONE_OR_MORE);
 759                         }
 760                         break; /* end case '+' */
 761
 762                 case LEFT_CURLY_BRACE:
 763
 764                         /*
 765                          * repeat the preceding regular expression
 766                          * at least min_count times
 767                          * and at most max_count times
 768                          *
 769                          * <regex...>{min_count} compiles to
 770                          * <regex type|COUNT><compiled_regex...>
 771                          * <min_count><min_count>
 772                          *
 773                          * <regex...>{min_count,} compiles to
 774                          * <regex type|COUNT><compiled_regex...>
 775                          * <min_count><UNLIMITED>
 776                          *
 777                          * <regex...>{min_count,max_count} compiles to
 778                          * <regex type|COUNT><compiled_regex...>
 779                          * <min_count><max_count>
 780                          *
 781                          * (<regex...>){min_count,max_count} compiles to
 782                          * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
 783                          * <compiled_regex...><END_GROUP|COUNT><groupn>\
 784                          * <minimum_match_count><maximum_match_count>
 785                          */
 786
 787                         if (can_repeat == B_FALSE) {
 788                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 789                                     compile_startp);
 790                         }
 791                         can_repeat = B_FALSE;
 792                         *regex_typep = (unsigned char)((unsigned int)*
 793                             regex_typep | COUNT);
 794                         count_length = get_count(&min_count, regexp);
 795                         if (count_length <= 0) {
 796                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 797                                     compile_startp);
 798                         }
 799                         regexp += count_length;
 800
 801                         if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
 802                                 regexp++;
 803                                 max_count = min_count;
 804                         } else if (*regexp == COMMA) { /* {min_count,..} */
 805                                 regexp++;
 806                                 /* {min_count,}   */
 807                                 if (*regexp == RIGHT_CURLY_BRACE) {
 808                                         regexp++;
 809                                         max_count = UNLIMITED;
 810                                 } else { /* {min_count,max_count} */
 811                                         count_length = get_count(
 812                                             &max_count, regexp);
 813                                         if (count_length <= 0) {
 814                                                 ERROR_EXIT(&regcmp_lock,
 815                                                     arg_listp, compile_startp);
 816                                         }
 817                                         regexp += count_length;
 818                                         if (*regexp != RIGHT_CURLY_BRACE) {
 819                                                 ERROR_EXIT(&regcmp_lock,
 820                                                     arg_listp, compile_startp);
 821                                         }
 822                                         regexp++;
 823                                 }
 824                         } else { /* invalid expression */
 825                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 826                                     compile_startp);
 827                         }
 828
 829                         if ((min_count > MAX_SINGLE_BYTE_INT) ||
 830                             ((max_count != UNLIMITED) &&
 831                             (min_count > max_count))) {
 832                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 833                                     compile_startp);
 834                         } else {
 835                                 *compilep = (unsigned char)min_count;
 836                                 compilep++;
 837                                 *compilep = (unsigned char)max_count;
 838                                 compilep++;
 839                         }
 840                         break; /* end case LEFT_CURLY_BRACE */
 841
 842                 default: /* a single non-special character */
 843
 844                         /*
 845                          * compiles to <ASCII_CHAR><ascii_char> or
 846                          * <MULTIBYTE_CHAR><multibyte_char>
 847                          */
 848
 849                         can_repeat = B_TRUE;
 850                         regex_typep = compilep;
 851                         expr_length = add_single_char_expr(compilep,
 852                             current_char);
 853                         compilep += expr_length;
 854
 855                 } /* end switch (current_char) */
 856
 857                 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
 858
 859                 char_size = get_wchar(&current_char, regexp);
 860                 if (char_size < 0) {
 861                         ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 862                 } else if (char_size > 0) {
 863                         regexp += char_size;
 864                 } else if /* (char_size == 0) && */ (next_argp != NULL) {
 865                         regexp = next_argp;
 866                         next_argp = va_arg(arg_listp, /* const */ char *);
 867                         char_size = get_wchar(&current_char, regexp);
 868                         if (char_size <= 0) {
 869                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 870                                     compile_startp);
 871                         } else {
 872                                 regexp += char_size;
 873                         }
 874                 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
 875                         if (pop_compilep() != NULL) {
 876                                 /* unmatched parentheses */
 877                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 878                                     compile_startp);
 879                         }
 880                         *compilep = (unsigned char)END_REGEX;
 881                         compilep++;
 882                         *compilep = '\0';
 883                         compilep++;
 884                         __i_size = (int)(compilep - compile_startp);
 885                         va_end(arg_listp);
 886                         lmutex_unlock(&regcmp_lock);
 887                         return (compile_startp);
 888                 }
 889         } /* end for (;;) */
 890
 891 } /* regcmp() */
 892
 893
 894 /* DEFINITIONS OF PRIVATE FUNCTIONS */
 895
 896 static int
 897 add_char(char *compilep, wchar_t wchar)
 898 {
 899         int expr_length;
 900
 901         if ((unsigned int)wchar <= (unsigned int)0x7f) {
 902                 *compilep = (unsigned char)wchar;
 903                 expr_length = 1;
 904         } else {
 905                 expr_length = wctomb(compilep, wchar);
 906         }
 907         return (expr_length);
 908 }
 909
 910 static int
 911 add_single_char_expr(char *compilep, wchar_t wchar)
 912 {
 913         int expr_length = 0;
 914
 915         if ((unsigned int)wchar <= (unsigned int)0x7f) {
 916                 *compilep = (unsigned char)ASCII_CHAR;
 917                 compilep++;
 918                 *compilep = (unsigned char)wchar;
 919                 expr_length += 2;
 920         } else {
 921                 *compilep = (unsigned char)MULTIBYTE_CHAR;
 922                 compilep++;
 923                 expr_length++;
 924                 expr_length += wctomb(compilep, wchar);
 925         }
 926         return (expr_length);
 927 }
 928
 929 static int
 930 get_count(int *countp, const char *regexp)
 931 {
 932         char count_char = '0';
 933         int count = 0;
 934         int count_length = 0;
 935
 936         if (regexp == NULL) {
 937                 return ((int)0);
 938         } else {
 939                 count_char = *regexp;
 940                 while (('0' <= count_char) && (count_char <= '9')) {
 941                         count = (10 * count) + (int)(count_char - '0');
 942                         count_length++;
 943                         regexp++;
 944                         count_char = *regexp;
 945                 }
 946         }
 947         *countp = count;
 948         return (count_length);
 949 }
 950
 951 static int
 952 get_digit(const char *regexp)
 953 {
 954         char digit;
 955
 956         if (regexp == NULL) {
 957                 return ((int)-1);
 958         } else {
 959                 digit = *regexp;
 960                 if (('0' <= digit) && (digit <= '9')) {
 961                         return ((int)(digit - '0'));
 962                 } else {
 963                         return ((int)-1);
 964                 }
 965         }
 966 }
 967
 968 static int
 969 get_wchar(wchar_t *wcharp, const char *regexp)
 970 {
 971         int char_size;
 972
 973         if (regexp == NULL) {
 974                 char_size = 0;
 975                 *wcharp = (wchar_t)((unsigned int)'\0');
 976         } else if (*regexp == '\0') {
 977                 char_size = 0;
 978                 *wcharp = (wchar_t)((unsigned int)*regexp);
 979         } else if ((unsigned char)*regexp <= (unsigned char)0x7f) {
 980                 char_size = 1;
 981                 *wcharp = (wchar_t)((unsigned int)*regexp);
 982         } else {
 983                 char_size = mbtowc(wcharp, regexp, MB_LEN_MAX);
 984         }
 985         return (char_size);
 986 }
 987
 988 static char *
 989 pop_compilep(void)
 990 {
 991         char *compilep;
 992
 993         if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
 994                 return (NULL);
 995         } else {
 996                 compilep = *compilep_stackp;
 997                 compilep_stackp++;
 998                 return (compilep);
 999         }
1000 }
1001
1002 static char *
1003 push_compilep(char *compilep)
1004 {
1005         if (compilep_stackp <= &compilep_stack[0]) {
1006                 return (NULL);
1007         } else {
1008                 compilep_stackp--;
1009                 *compilep_stackp = compilep;
1010                 return (compilep);
1011         }
1012 }
1013
1014 static boolean_t
1015 valid_range(wchar_t lower_char, wchar_t upper_char)
1016 {
1017         return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
1018             !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
1019             (lower_char < upper_char)) ||
1020             (((lower_char & WCHAR_CSMASK) ==
1021             (upper_char & WCHAR_CSMASK)) &&
1022             (lower_char < upper_char)));
1023 }