usr/src/lib/libc/port/regex/regcmp.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29
  30 /*
  31  * IMPORTANT NOTE:
  32  *
  33  * regcmp() WORKS **ONLY** WITH THE ASCII AND THE Solaris EUC CHARACTER SETS.
  34  * IT IS **NOT** CHARACTER SET INDEPENDENT.
  35  *
  36  */
  37
  38 #pragma weak _regcmp = regcmp
  39
  40 #include "lint.h"
  41 #include "mtlib.h"
  42 #include <limits.h>
  43 #include <stdarg.h>
  44 #include <stdlib.h>
  45 #include <thread.h>
  46 #include <wctype.h>
  47 #include <widec.h>
  48 #include <string.h>
  49 #include "tsd.h"
  50
  51
  52 /* CONSTANTS SHARED WITH regex() */
  53
  54 #include "regex.h"
  55
  56 /* PRIVATE CONSTANTS */
  57
  58 #define BACKSLASH               '\\'
  59 #define CIRCUMFLEX              '^'
  60 #define COMMA                   ','
  61 #define DASH                    '-'
  62 #define DOLLAR_SIGN             '$'
  63 #define DOT                     '.'
  64 #define LEFT_CURLY_BRACE        '{'
  65 #define LEFT_PAREN              '('
  66 #define LEFT_SQUARE_BRACKET     '['
  67 #define PLUS                    '+'
  68 #define RIGHT_CURLY_BRACE       '}'
  69 #define RIGHT_PAREN             ')'
  70 #define RIGHT_SQUARE_BRACKET    ']'
  71 #define SINGLE_BYTE_MASK        0xff
  72 #define STRINGP_STACK_SIZE      50
  73 #define STAR                    '*'
  74
  75 /* PRIVATE GLOBAL VARIABLES */
  76
  77 static char     *compilep_stack[STRINGP_STACK_SIZE];
  78 static char     **compilep_stackp;
  79 static mutex_t  regcmp_lock = DEFAULTMUTEX;
  80
  81 /* DECLARATIONS OF PRIVATE FUNCTIONS */
  82
  83 static int add_char(char *compilep, wchar_t wchar);
  84 static int add_single_char_expr(char *compilep, wchar_t wchar);
  85
  86 #define ERROR_EXIT(mutex_lockp, arg_listp, compile_startp) \
  87 \
  88         va_end(arg_listp); \
  89         lmutex_unlock(mutex_lockp); \
  90         if ((compile_startp) != (char *)0) \
  91                 free((void *)compile_startp); \
  92         return ((char *)0)
  93
  94 static int get_count(int *countp, const char *regexp);
  95 static int get_digit(const char *regexp);
  96 static int get_wchar(wchar_t *wchar, const char *regexp);
  97 static char *pop_compilep(void);
  98 static char *push_compilep(char *compilep);
  99 static boolean_t valid_range(wchar_t lower_char, wchar_t upper_char);
 100
 101
 102 /* DEFINITIONS OF PUBLIC VARIABLES */
 103
 104 int __i_size;
 105
 106 /*
 107  * define thread-specific storage for __i_size
 108  *
 109  */
 110 int *
 111 ___i_size(void)
 112 {
 113         if (thr_main())
 114                 return (&__i_size);
 115         return ((int *)tsdalloc(_T_REGCMP_ISIZE, sizeof (int), NULL));
 116 }
 117
 118 #define         __i_size (*(___i_size()))
 119
 120 /* DEFINITION OF regcmp() */
 121
 122 extern char *
 123 regcmp(const char *regexp, ...)
 124 {
 125         va_list         arg_listp;
 126         size_t          arg_strlen;
 127         boolean_t       can_repeat;
 128         int             char_size;
 129         unsigned int    class_length;
 130         char            *compilep;
 131         char            *compile_startp = (char *)0;
 132         int             count_length;
 133         wchar_t         current_char;
 134         int             expr_length;
 135         int             groupn;
 136         unsigned int    group_length;
 137         unsigned int    high_bits;
 138         boolean_t       dash_indicates_range;
 139         unsigned int    low_bits;
 140         int             max_count;
 141         int             min_count;
 142         const char      *next_argp;
 143         wchar_t         first_char_in_range;
 144         char            *regex_typep;
 145         int             return_arg_number;
 146         int             substringn;
 147
 148         if (___i_size() == (int *)0)
 149                 return ((char *)0);
 150
 151         /*
 152          * When compiling a regular expression, regcmp() generates at most
 153          * two extra single-byte characters for each character in the
 154          * expression, so allocating three times the number of bytes in all
 155          * the strings that comprise the regular expression will ensure that
 156          * regcmp() won't overwrite the end of the allocated block when
 157          * compiling the expression.
 158          */
 159
 160         va_start(arg_listp, regexp);
 161         next_argp = regexp;
 162         arg_strlen = 0;
 163         while (next_argp != (char *)0) {
 164                 arg_strlen += strlen(next_argp);
 165                 next_argp = va_arg(arg_listp, /* const */ char *);
 166         }
 167         va_end(arg_listp);
 168
 169         if (arg_strlen == 0)
 170                 return ((char *)0);
 171         compile_startp = (char *)malloc(3 * arg_strlen + 1);
 172         if (compile_startp == (char *)0)
 173                 return ((char *)0);
 174
 175         lmutex_lock(&regcmp_lock);
 176         __i_size = 0;
 177         compilep = compile_startp;
 178         compilep_stackp = &compilep_stack[STRINGP_STACK_SIZE];
 179
 180         /* GET THE FIRST CHARACTER IN THE REGULAR EXPRESSION */
 181         va_start(arg_listp, regexp);
 182         next_argp = va_arg(arg_listp, /* const */ char *);
 183         char_size = get_wchar(&current_char, regexp);
 184         if (char_size < 0) {
 185                 ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 186         } else if (char_size > 0) {
 187                 regexp += char_size;
 188         } else /* (char_size == 0 ) */ {
 189                 regexp = next_argp;
 190                 next_argp = va_arg(arg_listp, /* const */ char *);
 191                 char_size = get_wchar(&current_char, regexp);
 192                 if (char_size <= 0) {
 193                         ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 194                 } else {
 195                         regexp += char_size;
 196                 }
 197         }
 198
 199         /* FIND OUT IF THE EXPRESSION MUST START AT THE START OF A STRING */
 200
 201         if (current_char == CIRCUMFLEX) {
 202                 char_size = get_wchar(&current_char, regexp);
 203                 if (char_size < 0) {
 204                         ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 205                 } else if (char_size > 0) {
 206                         regexp += char_size;
 207                         *compilep = (unsigned char)START_OF_STRING_MARK;
 208                         compilep++;
 209                 } else if /* (char_size == 0) && */ (next_argp != (char *)0) {
 210                         regexp = next_argp;
 211                         next_argp = va_arg(arg_listp, /* const */ char *);
 212                         char_size = get_wchar(&current_char, regexp);
 213                         if (char_size <= 0) {
 214                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 215                                     compile_startp);
 216                         } else {
 217                                 regexp += char_size;
 218                         }
 219                         *compilep = (unsigned char)START_OF_STRING_MARK;
 220                         compilep++;
 221                 } else {
 222                         /* ((char_size==0) && (next_argp==(char *)0)) */
 223                         /*
 224                          * the regular expression is "^"
 225                          */
 226                         *compilep = (unsigned char)START_OF_STRING_MARK;
 227                         compilep++;
 228                         *compilep = (unsigned char)END_REGEX;
 229                         compilep++;
 230                         *compilep = '\0';
 231                         compilep++;
 232                         __i_size = (int)(compilep - compile_startp);
 233                         va_end(arg_listp);
 234                         lmutex_unlock(&regcmp_lock);
 235                         return (compile_startp);
 236                 }
 237         }
 238
 239         /* COMPILE THE REGULAR EXPRESSION */
 240
 241         groupn = 0;
 242         substringn = 0;
 243         can_repeat = B_FALSE;
 244         for (;;) {
 245
 246                 /*
 247                  * At the end of each iteration get the next character
 248                  * from the regular expression and increment regexp to
 249                  * point to the following character.  Exit when all
 250                  * the characters in all the strings in the argument
 251                  * list have been read.
 252                  */
 253
 254                 switch (current_char) {
 255
 256                         /*
 257                          * No fall-through.  Each case ends with either
 258                          * a break or an error exit.  Each case starts
 259                          * with compilep addressing the next location to
 260                          * be written in the compiled regular expression,
 261                          * and with regexp addressing the next character
 262                          * to be read from the regular expression being
 263                          * compiled.  Each case that doesn't return
 264                          * increments regexp to address the next character
 265                          * to be read from the regular expression and
 266                          * increments compilep to address the next
 267                          * location to be written in the compiled
 268                          * regular expression.
 269                          *
 270                          * NOTE: The comments for each case give the meaning
 271                          * of the regular expression compiled by the case
 272                          * and the character string written to the compiled
 273                          * regular expression by the case.  Each single
 274                          * character
 275                          * written to the compiled regular expression is
 276                          * shown enclosed in angle brackets (<>).  Each
 277                          * compiled regular expression begins with a marker
 278                          * character which is shown as a named constant
 279                          * (e.g. <ASCII_CHAR>). Character constants are
 280                          * shown enclosed in single quotes (e.g. <'$'>).
 281                          * All other single characters written to the
 282                          * compiled regular expression are shown as lower
 283                          * case variable names (e.g. <ascii_char> or
 284                          * <multibyte_char>). Multicharacter
 285                          * strings written to the compiled regular expression
  286                          * are shown as variable names followed by ellipses
 287                          * (e.g. <regex...>).
 288                          */
 289
 290                 case DOLLAR_SIGN:
 291                         /* end of string marker or simple dollar sign */
 292                         /* compiles to <END_OF_STRING_MARK> or */
 293                         /* <ASCII_CHAR><'$'> */
 294
 295                         char_size = get_wchar(&current_char, regexp);
 296                         if ((char_size == 0) && (next_argp == (char *)0)) {
 297                                 can_repeat = B_FALSE;
 298                                 *compilep = (unsigned char)END_OF_STRING_MARK;
 299                                 compilep++;
 300                         } else {
 301                                 can_repeat = B_TRUE;
 302                                 *compilep = (unsigned char)ASCII_CHAR;
 303                                 regex_typep = compilep;
 304                                 compilep++;
 305                                 *compilep = DOLLAR_SIGN;
 306                                 compilep++;
 307                         }
 308                         break; /* end case DOLLAR_SIGN */
 309
 310                 case DOT: /* any character */
 311
 312                         /* compiles to <ANY_CHAR> */
 313
 314                         can_repeat = B_TRUE;
 315                         *compilep = (unsigned char)ANY_CHAR;
 316                         regex_typep = compilep;
 317                         compilep++;
 318
 319                         break; /* end case DOT */
 320
 321                 case BACKSLASH: /* escaped character */
 322
 323                         /*
 324                          * compiles to <ASCII_CHAR><ascii_char> or
 325                          * <MULTIBYTE_CHAR><multibyte_char>
 326                          */
 327
 328                         char_size = get_wchar(&current_char, regexp);
 329                         if (char_size <= 0) {
 330                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 331                                     compile_startp);
 332                         } else {
 333                                 regexp += char_size;
 334                                 can_repeat = B_TRUE;
 335                                 expr_length = add_single_char_expr(
 336                                     compilep, current_char);
 337                                 regex_typep = compilep;
 338                                 compilep += expr_length;
 339                         }
 340                         break; /* end case '\\' */
 341
 342                 case LEFT_SQUARE_BRACKET:
 343                         /* start of a character class expression */
 344
 345                         /*
 346                          * [^...c...] compiles to
 347                          * <NOT_IN_CLASS><class_length><...c...>
 348                          * [^...a-z...] compiles to
 349                          * <NOT_IN_CLASS><class_length><...a<THRU>z...>
 350                          * [...c...] compiles to
 351                          * <IN_CLASS><class_length><...c...>
 352                          * [...a-z...] compiles to
 353                          * <IN_CLASS><class_length><...a<THRU>z...>
 354                          *
 355                          * NOTE: <class_length> includes the
 356                          * <class_length> byte
 357                          */
 358
 359                         can_repeat = B_TRUE;
 360                         regex_typep = compilep;
 361
 362                         /* DETERMINE THE CLASS TYPE */
 363
 364                         /*
 365                          * NOTE: This algorithm checks the value of the
 366                          * "multibyte"
 367                          * macro in <euc.h> (included in <widec.h> )
 368                          * to find out if regcmp()
 369                          * is compiling the regular expression in a
 370                          * multibyte locale.
 371                          */
 372                         char_size = get_wchar(&current_char, regexp);
 373                         if (char_size <= 0) {
 374                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 375                                     compile_startp);
 376                         } else if (current_char == CIRCUMFLEX) {
 377                                 regexp++;
 378                                 char_size = get_wchar(&current_char, regexp);
 379                                 if (char_size <= 0) {
 380                                         ERROR_EXIT(&regcmp_lock,
 381                                             arg_listp, compile_startp);
 382                                 } else {
 383                                         regexp += char_size;
 384                                         if (!multibyte) {
 385                                                 *compilep = (unsigned char)
 386                                                     NOT_IN_ASCII_CHAR_CLASS;
 387                                         } else {
 388                                                 *compilep = (unsigned char)
 389                                                     NOT_IN_MULTIBYTE_CHAR_CLASS;
 390                                         }
 391                                         /* leave space for <class_length> */
 392                                         compilep += 2;
 393                                 }
 394                         } else {
 395                                 regexp += char_size;
 396                                 if (!multibyte) {
 397                                         *compilep = (unsigned char)
 398                                             IN_ASCII_CHAR_CLASS;
 399                                 } else {
 400                                         *compilep = (unsigned char)
 401                                             IN_MULTIBYTE_CHAR_CLASS;
 402                                 }
 403                                 /* leave space for <class_length> */
 404                                 compilep += 2;
 405                         }
 406
 407                         /* COMPILE THE CLASS */
 408                         /*
 409                          * check for a leading right square bracket,
 410                          * which is allowed
 411                          */
 412
 413                         if (current_char == RIGHT_SQUARE_BRACKET) {
 414                                 /*
 415                                  * the leading RIGHT_SQUARE_BRACKET may
 416                                  * be part of a character range
 417                                  * expression like "[]-\]"
 418                                  */
 419                                 dash_indicates_range = B_TRUE;
 420                                 first_char_in_range = current_char;
 421                                 char_size = get_wchar(&current_char, regexp);
 422                                 if (char_size <= 0) {
 423                                         ERROR_EXIT(&regcmp_lock,
 424                                             arg_listp, compile_startp);
 425                                 } else {
 426                                         regexp += char_size;
 427                                         *compilep = RIGHT_SQUARE_BRACKET;
 428                                         compilep++;
 429                                 }
 430                         } else {
 431                                 /*
 432                                  * decode the character in the following
 433                                  * while loop and decide then if it can
 434                                  * be the first character
 435                                  * in a character range expression
 436                                  */
 437                                 dash_indicates_range = B_FALSE;
 438                         }
 439
 440                         while (current_char != RIGHT_SQUARE_BRACKET) {
 441                                 if (current_char != DASH) {
 442                                         /*
 443                                          * if a DASH follows current_char,
 444                                          *  current_char, the DASH and the
 445                                          * character that follows the DASH
 446                                          * may form a character range
 447                                          * expression
 448                                          */
 449                                         dash_indicates_range = B_TRUE;
 450                                         first_char_in_range = current_char;
 451                                         expr_length = add_char(
 452                                             compilep, current_char);
 453                                         compilep += expr_length;
 454
 455                                 } else if /* (current_char == DASH) && */
 456                                     (dash_indicates_range == B_FALSE) {
 457                                         /*
 458                                          * current_char is a DASH, but
 459                                          * either begins the entire
 460                                          * character class or follows a
 461                                          * character that's already
 462                                          * part of a character range
 463                                          * expression, so it simply
 464                                          * represents the DASH character
 465                                          * itself
 466                                          */
 467                                         *compilep = DASH;
 468                                         compilep ++;
 469                                         /*
 470                                          * if another DASH follows this
 471                                          * one, this DASH is part
 472                                          * of a character range expression
 473                                          * like "[--\]"
 474                                          */
 475                                         dash_indicates_range = B_TRUE;
 476                                         first_char_in_range = current_char;
 477
 478                                 } else {
 479                                         /*
  480                                          * ((current_char == DASH) &&
 481                                          * (dash_indicates_range == B_TRUE))
 482                                          */
 483
 484                                         /*
 485                                          * the DASH appears after a single
 486                                          * character that isn't
 487                                          * already part of a character
 488                                          * range expression, so it
 489                                          * and the characters preceding
 490                                          * and following it can form a
 491                                          * character range expression
 492                                          * like "[a-z]"
 493                                          */
 494                                         char_size = get_wchar(
 495                                             &current_char, regexp);
 496                                         if (char_size <= 0) {
 497                                                 ERROR_EXIT(&regcmp_lock,
 498                                                     arg_listp, compile_startp);
 499
 500                                         } else if (current_char ==
 501                                             RIGHT_SQUARE_BRACKET) {
 502                                                 /*
 503                                                  * the preceding DASH is
 504                                                  * the last character in the
 505                                                  * class and represents the
 506                                                  * DASH character itself
 507                                                  */
 508                                                 *compilep = DASH;
 509                                                 compilep++;
 510
 511                                         } else if (valid_range(
 512                                             first_char_in_range,
 513                                             current_char) == B_FALSE) {
 514                                                 ERROR_EXIT(&regcmp_lock,
 515                                                     arg_listp, compile_startp);
 516                                         } else {
 517                                                 /*
 518                                                  * the DASH is part of a
 519                                                  * character range
 520                                                  * expression; encode the
 521                                                  * rest of the expression
 522                                                  */
 523                                                 regexp += char_size;
 524                                                 *compilep = (unsigned char)
 525                                                     THRU;
 526                                                 compilep++;
 527                                                 expr_length = add_char(
 528                                                     compilep, current_char);
 529                                                 compilep += expr_length;
 530                                                 /*
 531                                                  * if a DASH follows this
 532                                                  * character range
 533                                                  * expression,
 534                                                  * it represents the DASH
 535                                                  * character itself
 536                                                  */
 537                                                 dash_indicates_range =
 538                                                     B_FALSE;
 539                                         }
 540                                 }
 541
 542                                 /* GET THE NEXT CHARACTER */
 543
 544                                 char_size = get_wchar(&current_char, regexp);
 545                                 if (char_size <= 0) {
 546                                         ERROR_EXIT(&regcmp_lock,
 547                                             arg_listp, compile_startp);
 548                                 } else {
 549                                         regexp += char_size;
 550                                 }
 551
 552                         }
 553                         /* end while (current_char != RIGHT_SQUARE_BRACKET) */
 554
 555                         /* INSERT THE LENGTH OF THE CLASS INTO THE */
 556                         /* COMPILED EXPRESSION */
 557
 558                         class_length = (unsigned int)
 559                             (compilep - regex_typep - 1);
 560                         if ((class_length < 2) ||
 561                             (class_length > MAX_SINGLE_BYTE_INT)) {
 562                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 563                                     compile_startp);
 564                         } else {
 565                                 *(regex_typep + 1) = (unsigned char)
 566                                     class_length;
 567                         }
 568                         break; /* end case LEFT_SQUARE_BRACKET */
 569
 570                 case LEFT_PAREN:
 571
 572                         /*
 573                          * start of a parenthesized group of regular
 574                          * expressions compiles to <'\0'><'\0'>, leaving
 575                          * space in the compiled regular expression for
 576                          * <group_type|ADDED_LENGTH_BITS><group_length>
 577                          */
 578
 579                         if (push_compilep(compilep) == (char *)0) {
 580                                 /*
 581                                  * groups can contain groups, so group
 582                                  * start pointers
 583                                  * must be saved and restored in sequence
 584                                  */
 585                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 586                                     compile_startp);
 587                         } else {
 588                                 can_repeat = B_FALSE;
 589                                 *compilep = '\0';       /* for debugging */
 590                                 compilep++;
 591                                 *compilep = '\0';       /* for debugging */
 592                                 compilep++;
 593                         }
 594                         break; /* end case LEFT_PAREN */
 595
 596                 case RIGHT_PAREN:
 597                         /* end of a marked group of regular expressions */
 598
 599                         /*
 600                          * (<regex>)$0-9 compiles to
 601                          * <SAVED_GROUP><substringn><compiled_regex...>\
 602                          * <END_SAVED_GROUP><substringn><return_arg_number>
 603                          * (<regex>)* compiles to
 604                          * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>
 605                          * <group_length> <compiled_regex...>
 606                          * <END_GROUP|ZERO_OR_MORE><groupn>
 607                          * (<regex>)+ compiles to
 608                          * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>
 609                          * <group_length>\
 610                          * <compiled_regex...><END_GROUP|ONE_OR_MORE>
 611                          * <groupn>
 612                          * (<regex>){...} compiles to
 613                          * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
 614                          * <compiled_regex...><END_GROUP|COUNT><groupn>\
 615                          * <minimum_repeat_count><maximum_repeat_count>
 616                          * otherwise (<regex>) compiles to
 617                          * <SIMPLE_GROUP><blank><compiled_regex...>
 618                          * <END_GROUP><groupn>
 619                          *
 620                          * NOTE:
 621                          *
 622                          * group_length + (256 * ADDED_LENGTH_BITS) ==
 623                          * length_of(<compiled_regex...><END_GROUP|...>
 624                          * <groupn>)
 625                          * which also ==
 626                          * length_of(<group_type|ADDED_LENGTH_BITS>
 627                          * <group_length>\ <compiled_regex...>)
 628                          * groupn no longer seems to be used, but the code
 629                          * still computes it to preserve backward
 630                          * compatibility
 631                          * with earlier versions of regex().
 632                          */
 633
 634                         /* RETRIEVE THE ADDRESS OF THE START OF THE GROUP */
 635
 636                         regex_typep = pop_compilep();
 637                         if (regex_typep == (char *)0) {
 638                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 639                                     compile_startp);
 640                         }
 641                         char_size = get_wchar(&current_char, regexp);
 642                         if (char_size < 0) {
 643                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 644                                     compile_startp);
 645                         } else if (char_size == 0) {
 646                                 *regex_typep = SIMPLE_GROUP;
 647                                 can_repeat = B_TRUE;
 648                                 *compilep = (unsigned char)END_GROUP;
 649                                 regex_typep = compilep;
 650                                 compilep++;
 651                                 *compilep = (unsigned char)groupn;
 652                                 groupn++;
 653                                 compilep++;
 654                         } else if (current_char == DOLLAR_SIGN) {
 655                                 *regex_typep = SAVED_GROUP;
 656                                 regex_typep++;
 657                                 *regex_typep = (char)substringn;
 658                                 can_repeat = B_FALSE;
 659                                 regexp ++;
 660                                 return_arg_number = get_digit(regexp);
 661                                 if ((return_arg_number < 0) ||
 662                                     (substringn >= NSUBSTRINGS)) {
 663                                         ERROR_EXIT(&regcmp_lock, arg_listp,
 664                                             compile_startp);
 665                                 }
 666                                 regexp++;
 667                                 *compilep = (unsigned char)END_SAVED_GROUP;
 668                                 compilep++;
 669                                 *compilep = (unsigned char)substringn;
 670                                 substringn++;
 671                                 compilep++;
 672                                 *compilep = (unsigned char)return_arg_number;
 673                                 compilep++;
 674                         } else {
 675                                 switch (current_char) {
 676                                 case STAR:
 677                                         *regex_typep = ZERO_OR_MORE_GROUP;
 678                                         break;
 679                                 case PLUS:
 680                                         *regex_typep = ONE_OR_MORE_GROUP;
 681                                         break;
 682                                 case LEFT_CURLY_BRACE:
 683                                         *regex_typep = COUNTED_GROUP;
 684                                         break;
 685                                 default:
 686                                         *regex_typep = SIMPLE_GROUP;
 687                                 }
 688                                 if (*regex_typep != SIMPLE_GROUP) {
 689                                         group_length = (unsigned int)
 690                                             (compilep - regex_typep);
 691                                         if (group_length >= 1024) {
 692                                                 ERROR_EXIT(&regcmp_lock,
 693                                                     arg_listp, compile_startp);
 694                                         }
 695                                         high_bits = group_length >>
 696                                             TIMES_256_SHIFT;
 697                                         low_bits = group_length &
 698                                             SINGLE_BYTE_MASK;
 699                                         *regex_typep =
 700                                             (unsigned char)
 701                                             ((unsigned int)
 702                                             *regex_typep | high_bits);
 703                                         regex_typep++;
 704                                         *regex_typep =
 705                                             (unsigned char)low_bits;
 706                                 }
 707                                 can_repeat = B_TRUE;
 708                                 *compilep = (unsigned char)END_GROUP;
 709                                 regex_typep = compilep;
 710                                 compilep++;
 711                                 *compilep = (unsigned char)groupn;
 712                                 groupn++;
 713                                 compilep++;
 714                         }
 715
 716                         break; /* end case RIGHT_PAREN */
 717
 718                 case STAR: /* zero or more repetitions of the */
 719                                 /* preceding expression */
 720
 721                         /*
 722                          * <regex...>* compiles to <regex_type|ZERO_OR_MORE>\
 723                          * <compiled_regex...>
 724                          * (<regex...>)* compiles to
 725                          * <ZERO_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
 726                          * <group_length><compiled_regex...>\
 727                          * <END_GROUP|ZERO_OR_MORE><groupn>
 728                          */
 729
 730                         if (can_repeat == B_FALSE) {
 731                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 732                                     compile_startp);
 733                         } else {
 734                                 can_repeat = B_FALSE;
 735                                 *regex_typep = (unsigned char)
 736                                     ((unsigned int)*regex_typep | ZERO_OR_MORE);
 737                         }
 738                         break; /* end case '*' */
 739
 740                 case PLUS:
 741                         /* one or more repetitions of the preceding */
 742                                 /* expression */
 743
 744                         /*
 745                          * <regex...>+ compiles to <regex_type|ONE_OR_MORE>\
 746                          * <compiled_regex...> (<regex...>)+ compiles to
 747                          * <ONE_OR_MORE_GROUP|ADDED_LENGTH_BITS>\
 748                          * <group_length><compiled_regex...>\
 749                          * <END_GROUP|ONE_OR_MORE><groupn>
 750                          */
 751
 752                         if (can_repeat == B_FALSE) {
 753                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 754                                     compile_startp);
 755                         } else {
 756                                 can_repeat = B_FALSE;
 757                                 *regex_typep =
 758                                     (unsigned char)((unsigned int)*
 759                                     regex_typep | ONE_OR_MORE);
 760                         }
 761                         break; /* end case '+' */
 762
 763                 case LEFT_CURLY_BRACE:
 764
 765                         /*
 766                          * repeat the preceding regular expression
 767                          * at least min_count times
 768                          * and at most max_count times
 769                          *
 770                          * <regex...>{min_count} compiles to
 771                          * <regex type|COUNT><compiled_regex...>
 772                          * <min_count><min_count>
 773                          *
 774                          * <regex...>{min_count,} compiles to
 775                          * <regex type|COUNT><compiled_regex...>
 776                          * <min_count><UNLIMITED>
 777                          *
 778                          * <regex...>{min_count,max_count} compiles to
 779                          * <regex type>|COUNT><compiled_regex...>
 780                          * <min_count><max_count>
 781                          *
 782                          * (<regex...>){min_count,max_count} compiles to
 783                          * <COUNTED_GROUP|ADDED_LENGTH_BITS><group_length>\
 784                          * <compiled_regex...><END_GROUP|COUNT><groupn>\
 785                          * <minimum_match_count><maximum_match_count>
 786                          */
 787
 788                         if (can_repeat == B_FALSE) {
 789                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 790                                     compile_startp);
 791                         }
 792                         can_repeat = B_FALSE;
 793                         *regex_typep = (unsigned char)((unsigned int)*
 794                             regex_typep | COUNT);
 795                         count_length = get_count(&min_count, regexp);
 796                         if (count_length <= 0) {
 797                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 798                                     compile_startp);
 799                         }
 800                         regexp += count_length;
 801
 802                         if (*regexp == RIGHT_CURLY_BRACE) { /* {min_count} */
 803                                 regexp++;
 804                                 max_count = min_count;
 805                         } else if (*regexp == COMMA) { /* {min_count,..} */
 806                                 regexp++;
 807                                 /* {min_count,}   */
 808                                 if (*regexp == RIGHT_CURLY_BRACE) {
 809                                         regexp++;
 810                                         max_count = UNLIMITED;
 811                                 } else { /* {min_count,max_count} */
 812                                         count_length = get_count(
 813                                             &max_count, regexp);
 814                                         if (count_length <= 0) {
 815                                                 ERROR_EXIT(&regcmp_lock,
 816                                                     arg_listp, compile_startp);
 817                                         }
 818                                         regexp += count_length;
 819                                         if (*regexp != RIGHT_CURLY_BRACE) {
 820                                                 ERROR_EXIT(&regcmp_lock,
 821                                                     arg_listp, compile_startp);
 822                                         }
 823                                         regexp++;
 824                                 }
 825                         } else { /* invalid expression */
 826                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 827                                     compile_startp);
 828                         }
 829
 830                         if ((min_count > MAX_SINGLE_BYTE_INT) ||
 831                             ((max_count != UNLIMITED) &&
 832                             (min_count > max_count))) {
 833                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 834                                     compile_startp);
 835                         } else {
 836                                 *compilep = (unsigned char)min_count;
 837                                 compilep++;
 838                                 *compilep = (unsigned char)max_count;
 839                                 compilep++;
 840                         }
 841                         break; /* end case LEFT_CURLY_BRACE */
 842
 843                 default: /* a single non-special character */
 844
 845                         /*
 846                          * compiles to <ASCII_CHAR><ascii_char> or
 847                          * <MULTIBYTE_CHAR><multibyte_char>
 848                          */
 849
 850                         can_repeat = B_TRUE;
 851                         regex_typep = compilep;
 852                         expr_length = add_single_char_expr(compilep,
 853                             current_char);
 854                         compilep += expr_length;
 855
 856                 } /* end switch (current_char) */
 857
 858                 /* GET THE NEXT CHARACTER FOR THE WHILE LOOP */
 859
 860                 char_size = get_wchar(&current_char, regexp);
 861                 if (char_size < 0) {
 862                         ERROR_EXIT(&regcmp_lock, arg_listp, compile_startp);
 863                 } else if (char_size > 0) {
 864                         regexp += char_size;
 865                 } else if /* (char_size == 0) && */ (next_argp != (char *)0) {
 866                         regexp = next_argp;
 867                         next_argp = va_arg(arg_listp, /* const */ char *);
 868                         char_size = get_wchar(&current_char, regexp);
 869                         if (char_size <= 0) {
 870                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 871                                     compile_startp);
 872                         } else {
 873                                 regexp += char_size;
 874                         }
 875                 } else /* ((char_size == 0) && (next_argp == (char *)0)) */ {
 876                         if (pop_compilep() != (char *)0) {
 877                                 /* unmatched parentheses */
 878                                 ERROR_EXIT(&regcmp_lock, arg_listp,
 879                                     compile_startp);
 880                         }
 881                         *compilep = (unsigned char)END_REGEX;
 882                         compilep++;
 883                         *compilep = '\0';
 884                         compilep++;
 885                         __i_size = (int)(compilep - compile_startp);
 886                         va_end(arg_listp);
 887                         lmutex_unlock(&regcmp_lock);
 888                         return (compile_startp);
 889                 }
 890         } /* end for (;;) */
 891
 892 } /* regcmp() */
 893
 894
 895 /* DEFINITIONS OF PRIVATE FUNCTIONS */
 896
 897 static int
 898 add_char(char *compilep, wchar_t wchar)
 899 {
 900         int expr_length;
 901
 902         if ((unsigned int)wchar <= (unsigned int)0x7f) {
 903                 *compilep = (unsigned char)wchar;
 904                 expr_length = 1;
 905         } else {
 906                 expr_length = wctomb(compilep, wchar);
 907         }
 908         return (expr_length);
 909 }
 910
 911 static int
 912 add_single_char_expr(char *compilep, wchar_t wchar)
 913 {
 914         int expr_length = 0;
 915
 916         if ((unsigned int)wchar <= (unsigned int)0x7f) {
 917                 *compilep = (unsigned char)ASCII_CHAR;
 918                 compilep++;
 919                 *compilep = (unsigned char)wchar;
 920                 expr_length += 2;
 921         } else {
 922                 *compilep = (unsigned char)MULTIBYTE_CHAR;
 923                 compilep++;
 924                 expr_length++;
 925                 expr_length += wctomb(compilep, wchar);
 926         }
 927         return (expr_length);
 928 }
 929
 930 static int
 931 get_count(int *countp, const char *regexp)
 932 {
 933         char count_char = '0';
 934         int count = 0;
 935         int count_length = 0;
 936
 937         if (regexp == (char *)0) {
 938                 return ((int)0);
 939         } else {
 940                 count_char = *regexp;
 941                 while (('0' <= count_char) && (count_char <= '9')) {
 942                         count = (10 * count) + (int)(count_char - '0');
 943                         count_length++;
 944                         regexp++;
 945                         count_char = *regexp;
 946                 }
 947         }
 948         *countp = count;
 949         return (count_length);
 950 }
 951
 952 static int
 953 get_digit(const char *regexp)
 954 {
 955         char digit;
 956
 957         if (regexp == (char *)0) {
 958                 return ((int)-1);
 959         } else {
 960                 digit = *regexp;
 961                 if (('0' <= digit) && (digit <= '9')) {
 962                         return ((int)(digit - '0'));
 963                 } else {
 964                         return ((int)-1);
 965                 }
 966         }
 967 }
 968
 969 static int
 970 get_wchar(wchar_t *wcharp, const char *regexp)
 971 {
 972         int char_size;
 973
 974         if (regexp == (char *)0) {
 975                 char_size = 0;
 976                 *wcharp = (wchar_t)((unsigned int)'\0');
 977         } else if (*regexp == '\0') {
 978                 char_size = 0;
 979                 *wcharp = (wchar_t)((unsigned int)*regexp);
 980         } else if ((unsigned char)*regexp <= (unsigned char)0x7f) {
 981                 char_size = 1;
 982                 *wcharp = (wchar_t)((unsigned int)*regexp);
 983         } else {
 984                 char_size = mbtowc(wcharp, regexp, MB_LEN_MAX);
 985         }
 986         return (char_size);
 987 }
 988
 989 static char *
 990 pop_compilep(void)
 991 {
 992         char *compilep;
 993
 994         if (compilep_stackp >= &compilep_stack[STRINGP_STACK_SIZE]) {
 995                 return ((char *)0);
 996         } else {
 997                 compilep = *compilep_stackp;
 998                 compilep_stackp++;
 999                 return (compilep);
1000         }
1001 }
1002
1003 static char *
1004 push_compilep(char *compilep)
1005 {
1006         if (compilep_stackp <= &compilep_stack[0]) {
1007                 return ((char *)0);
1008         } else {
1009                 compilep_stackp--;
1010                 *compilep_stackp = compilep;
1011                 return (compilep);
1012         }
1013 }
1014
1015 static boolean_t
1016 valid_range(wchar_t lower_char, wchar_t upper_char)
1017 {
1018         return (((lower_char <= 0x7f) && (upper_char <= 0x7f) &&
1019             !iswcntrl(lower_char) && !iswcntrl(upper_char) &&
1020             (lower_char < upper_char)) ||
1021             (((lower_char & WCHAR_CSMASK) ==
1022             (upper_char & WCHAR_CSMASK)) &&
1023             (lower_char < upper_char)));
1024 }