regexp/source/reclass.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2000, 2010 Oracle and/or its affiliates.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * This file is part of OpenOffice.org.
  10  *
  11  * OpenOffice.org is free software: you can redistribute it and/or modify
  12  * it under the terms of the GNU Lesser General Public License version 3
  13  * only, as published by the Free Software Foundation.
  14  *
  15  * OpenOffice.org is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU Lesser General Public License version 3 for more details
  19  * (a copy is included in the LICENSE file that accompanied this code).
  20  *
  21  * You should have received a copy of the GNU Lesser General Public License
  22  * version 3 along with OpenOffice.org.  If not, see
  23  * <http://www.openoffice.org/license.html>
  24  * for a copy of the LGPLv3 License.
  25  *
  26  ************************************************************************/
  27
  28
  29 // MARKER(update_precomp.py): autogen include statement, do not remove
  30 #include "precompiled_regexp.hxx"
  31 /* Extended regular expression matching and search library,
  32    version 0.12.
  33    (Implements POSIX draft P1003.2/D11.2, except for some of the
  34    internationalization features.)
  35    Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
  36
  37    The GNU C Library is free software; you can redistribute it and/or
  38    modify it under the terms of the GNU Library General Public License as
  39    published by the Free Software Foundation; either version 2 of the
  40    License, or (at your option) any later version.
  41
  42    The GNU C Library is distributed in the hope that it will be useful,
  43    but WITHOUT ANY WARRANTY; without even the implied warranty of
  44    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  45    Library General Public License for more details.
  46
  47    You should have received a copy of the GNU Library General Public
  48    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  49    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  50    Boston, MA 02111-1307, USA.  */
  51
  52 /*
  53     Modified for OpenOffice.org to use sal_Unicode and Transliteration service.
  54  */
  55
  56
  57 #if 0
/* If for any reason (porting, debugging) we can't use alloca(), use malloc()
   instead.  Use alloca() whenever possible for performance reasons; the
   difference _is_ significant: with malloc(), re_match2() makes code that
   relies heavily on regexps through the TextSearch interface up to three
   times slower.  This is _the_ bottleneck in some spreadsheet documents.  */
  63 #define REGEX_MALLOC
  64 #endif
  65
  66 /* AIX requires this to be the first thing in the file. */
  67 #if defined _AIX && !defined REGEX_MALLOC
  68   #pragma alloca
  69 #endif
  70
  71 #include <string.h>
  72 #include <assert.h>
  73
  74 #include <rtl/ustring.hxx>
  75 #include <com/sun/star/i18n/TransliterationModules.hpp>
  76
  77 #include "reclass.hxx"
  78
  79
  80 /* Maximum number of duplicates an interval can allow.  Some systems
  81    (erroneously) define this in other header files, but we want our
  82    value, so remove any previous define.  */
  83 #ifdef RE_DUP_MAX
  84 # undef RE_DUP_MAX
  85 #endif
  86 /* If sizeof(int) == 2, then ((1 << 15) - 1) overflows.  */
  87 #define RE_DUP_MAX (0x7fff)
  88
  89
  90 /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
  91    `re_match_2' returns information about at least this many registers
  92    the first time a `regs' structure is passed.  */
  93 #ifndef RE_NREGS
  94 # define RE_NREGS 30
  95 #endif
  96
  97
  98 // Macros
  99 #define INIT_COMPILE_STACK_SIZE     32
 100 #define INIT_BUF_SIZE           ((1 << BYTEWIDTH)/BYTEWIDTH)
 101 #define MAX_BUF_SIZE            65535L
 102 #define NO_HIGHEST_ACTIVE_REG       (1 << BYTEWIDTH)
 103 #define NO_LOWEST_ACTIVE_REG        (NO_HIGHEST_ACTIVE_REG + 1)
 104
 105 /* Since we have one byte reserved for the register number argument to
 106    {start,stop}_memory, the maximum number of groups we can report
 107    things about is what fits in that byte.  */
 108 #define MAX_REGNUM 255
 109
 110 #define MIN(x, y) ( (x) < (y) ? (x) : (y) )
 111 #define MAX(x, y) ( (x) > (y) ? (x) : (y) )
 112
 113
 114 // Always. We're not in Emacs and don't use relocating allocators.
 115 #define MATCH_MAY_ALLOCATE
 116
 117 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
 118    use `alloca' instead of `malloc'.  This is because malloc is slower and
 119    causes storage fragmentation.  On the other hand, malloc is more portable,
 120    and easier to debug.
 121
 122    Because we sometimes use alloca, some routines have to be macros,
 123    not functions -- `alloca'-allocated space disappears at the end of the
 124    function it is called in.  */
 125
 126 #ifdef REGEX_MALLOC
 127
 128 # define REGEX_ALLOCATE malloc
 129 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
 130 # define REGEX_FREE free
 131
 132 #else /* not REGEX_MALLOC  */
 133
 134 /* Emacs already defines alloca, sometimes. So does MSDEV.  */
 135 # ifndef alloca
 136
 137 /* Make alloca work the best possible way.  */
 138 #  ifdef __GNUC__
 139 #   define alloca __builtin_alloca
 140 #  else /* not __GNUC__ */
 141 #   include <sal/alloca.h>
 142 #  endif /* not __GNUC__ */
 143
 144 # endif /* not alloca */
 145
 146 # define REGEX_ALLOCATE alloca
 147
 148 /* Assumes a `char *destination' variable.  */
 149 # define REGEX_REALLOCATE(source, osize, nsize)             \
 150   (destination = (char *) alloca (nsize),               \
 151    memcpy (destination, source, osize))
 152
 153 /* No need to do anything to free, after alloca.  */
 154 # define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
 155
 156 #endif /* not REGEX_MALLOC */
 157
 158
 159 /* Define how to allocate the failure stack.  */
 160
 161 #ifdef REGEX_MALLOC
 162
 163 # define REGEX_ALLOCATE_STACK malloc
 164 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
 165 # define REGEX_FREE_STACK free
 166
 167 #else /* not REGEX_MALLOC */
 168
 169 # define REGEX_ALLOCATE_STACK alloca
 170
 171 # define REGEX_REALLOCATE_STACK(source, osize, nsize)           \
 172    REGEX_REALLOCATE (source, osize, nsize)
 173 /* No need to explicitly free anything.  */
 174 # define REGEX_FREE_STACK(arg)
 175
 176 #endif /* not REGEX_MALLOC */
 177
 178
 179 /* (Re)Allocate N items of type T using malloc, or fail.  */
 180 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
 181 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
 182 #define RETALLOC_IF(addr, n, t) \
 183   if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
 184 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
 185
 186 #define BYTEWIDTH 16    /* In bits (assuming sizeof(sal_Unicode)*8) */
 187
 188
 189 #define CHAR_CLASS_MAX_LENGTH 256
 190
/* Fetch the next character in the uncompiled pattern into C, with no
   translation.  Uses the variables `p' and `pend' of the enclosing
   function; when `p' has reached `pend' it makes the *enclosing function*
   return REG_EEND instead of fetching.  */
#define PATFETCH_RAW(c)                                                 \
    do {                                \
        if (p == pend) return REG_EEND;                         \
        c = (sal_Unicode) *p++;                                 \
    } while (0)

/* Go backwards one character in the pattern (undoes one PATFETCH_RAW).  */
#define PATUNFETCH p--

/* Free the storage of the `compile_stack' variable and make the enclosing
   function return VALUE.  The comma expression lets both happen in a
   single return statement.  */
#define FREE_STACK_RETURN(value)                    \
    return(free(compile_stack.stack), value)

/* Make sure the compiled-pattern buffer has room for N more elements after
   the write cursor `b', calling EXTEND_BUFFER until it does.  Because
   EXTEND_BUFFER can return from the enclosing function, so can this.  */
#define GET_BUFFER_SPACE(n)                     \
    while ((sal_uInt32)(b - bufp->buffer + (n)) > bufp->allocated)  \
        EXTEND_BUFFER()
 208
 209 /* Extend the buffer by twice its current size via realloc and
 210    reset the pointers that pointed into the old block to point to the
 211    correct places in the new one.  If extending the buffer results in it
 212    being larger than MAX_BUF_SIZE, then flag memory exhausted.  */
 213 #define EXTEND_BUFFER()                                                 \
 214   do {                                                                  \
 215     sal_Unicode *old_buffer = bufp->buffer;                           \
 216     if (bufp->allocated == MAX_BUF_SIZE)                                \
 217       return REG_ESIZE;                                                 \
 218     bufp->allocated <<= 1;                                              \
 219     if (bufp->allocated > MAX_BUF_SIZE)                                 \
 220       bufp->allocated = MAX_BUF_SIZE;                                   \
 221     bufp->buffer = (sal_Unicode *) realloc(bufp->buffer,        \
 222                        bufp->allocated *        \
 223                        sizeof(sal_Unicode));    \
 224     if (bufp->buffer == NULL)                                           \
 225       return REG_ESPACE;                                                \
 226     /* If the buffer moved, move all the pointers into it.  */          \
 227     if (old_buffer != bufp->buffer) {                                   \
 228         b = (b - old_buffer) + bufp->buffer;                            \
 229         begalt = (begalt - old_buffer) + bufp->buffer;                  \
 230         if (fixup_alt_jump)                                             \
 231           fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
 232         if (laststart)                                                  \
 233           laststart = (laststart - old_buffer) + bufp->buffer;          \
 234         if (pending_exact)                                              \
 235           pending_exact = (pending_exact - old_buffer) + bufp->buffer;  \
 236       }                                                                 \
 237   } while (0)
 238
/* Append the single element C at the write cursor `b', first making sure
   (via GET_BUFFER_SPACE) that the buffer has room for it.  */
#define BUF_PUSH(c)                         \
    do {                                \
        GET_BUFFER_SPACE(1);                    \
        *b++ = (sal_Unicode)(c);                \
    } while(0)

/* Ensure we have two more sal_Unicode elements of buffer space and then
   append C1 and C2.  */
#define BUF_PUSH_2(c1, c2)                                              \
  do {                                                                  \
    GET_BUFFER_SPACE(2);                        \
    *b++ = (sal_Unicode) (c1);                                          \
    *b++ = (sal_Unicode) (c2);                                          \
  } while (0)

/* As with BUF_PUSH_2, except for three elements.  */
#define BUF_PUSH_3(c1, c2, c3)                                          \
  do {                                                                  \
    GET_BUFFER_SPACE(3);                        \
    *b++ = (sal_Unicode) (c1);                                          \
    *b++ = (sal_Unicode) (c2);                                          \
    *b++ = (sal_Unicode) (c3);                                          \
  } while (0)
 261
/* Store a jump with opcode OP at LOC to location TO.  We store a
   relative address offset by the three sal_Unicode elements the jump
   itself occupies (store_op1 writes the opcode at LOC and the two-element
   encoded number right after it).  */
#define STORE_JUMP(op, loc, to)                     \
    store_op1(op, loc, (int) ((to) - (loc) - 3))

/* Likewise, for a two-argument jump: ARG is stored by store_op2 as the
   second number, after the relative offset.  */
#define STORE_JUMP2(op, loc, to, arg)                   \
    store_op2(op, loc, (int) ((to) - (loc) - 3), arg)
 270
 271 /* Store NUMBER in two contiguous sal_Unicode starting at DESTINATION.  */
 272
 273 inline
 274 void
 275 Regexpr::store_number( sal_Unicode * destination, sal_Int32 number )
 276 {
 277   (destination)[0] = sal_Unicode((number) & 0xffff);
 278   (destination)[1] = sal_Unicode((number) >> 16);
 279 }
 280
 281 /* Same as STORE_NUMBER, except increment DESTINATION to
 282    the byte after where the number is stored.  Therefore, DESTINATION
 283    must be an lvalue.  */
 284
 285 inline
 286 void
 287 Regexpr::store_number_and_incr( sal_Unicode *& destination, sal_Int32 number )
 288 {
 289   store_number( destination, number );
 290   (destination) += 2;
 291 }
 292
 293 /* Put into DESTINATION a number stored in two contiguous sal_Unicode starting
 294    at SOURCE.  */
 295
 296 inline void Regexpr::extract_number( sal_Int32 & dest, sal_Unicode *source )
 297 {
 298   dest = (((sal_Int32) source[1]) << 16) | (source[0] & 0xffff);
 299 }
 300
/* Like `STORE_JUMP', but for inserting: insert_op1 first moves everything
   from LOC up to the buffer end three elements upwards.  Assume `b' is
   the buffer end.  */
#define INSERT_JUMP(op, loc, to)                    \
    insert_op1(op, loc, (sal_Int32) ((to) - (loc) - 3), b)

/* Like `STORE_JUMP2', but for inserting (insert_op2 opens a gap of five
   elements).  Assume `b' is the buffer end.  */
#define INSERT_JUMP2(op, loc, to, arg)                  \
    insert_op2(op, loc, (sal_Int32) ((to) - (loc) - 3), arg, b)
 308
 309 #define STREQ(s1, s2) (rtl_ustr_compare((s1), (s2)) ? (0) : (1))
 310
 311 #define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
 312 #define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)
 313
 314 /* The next available element.  */
 315 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
 316
 317 /* Get the next unsigned number in the uncompiled pattern.  */
 318 #define GET_UNSIGNED_NUMBER(num) {                                      \
 319     if (p != pend) {                                                \
 320         PATFETCH_RAW(c);                    \
 321         while (c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9') {    \
 322             if (num < 0)                                    \
 323                 num = 0;                                \
 324             num = num * 10 + c - (sal_Unicode)'0';      \
 325             if (p == pend)                  \
 326                 break;                  \
 327             PATFETCH_RAW(c);                \
 328         }                                                       \
 329     }                                                               \
 330 }
 331
/* Get the next hex number in the uncompiled pattern.

   Uses the variables `p', `pend' and `c' of the enclosing function.  At
   most four hex digits (0-9, a-f, A-F) are consumed and accumulated into
   NUM.  If scanning stopped because a non-hex character was fetched, that
   character is pushed back with PATUNFETCH.  Finally NUM is set to -1
   unless it lies in 0..0xffff; NUM is only reset to 0 when a digit is
   seen, so this presumably relies on the caller initialising NUM to a
   negative value to flag "no digits" -- confirm at the call sites.
   Nothing happens when the pattern is already exhausted.  */
#define GET_HEX_NUMBER(num) {                                       \
    if (p != pend) {                                                \
        /* `stop' records that the loop ended without having fetched \
           a character beyond the number.  */                       \
        sal_Bool stop = false;                  \
        /* Counts the characters fetched so far, digit or not.  */  \
        sal_Int16 hexcnt = 1;                   \
        PATFETCH_RAW(c);                    \
        while ( (c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9') || (c >= (sal_Unicode)'a' && c <= (sal_Unicode)'f') || (c >= (sal_Unicode)'A' && c <= (sal_Unicode)'F') ) {    \
            if (num < 0)                                    \
                num = 0;                                \
            if ( c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9' ) \
                num = num * 16 + c - (sal_Unicode)'0';      \
            else if ( c >= (sal_Unicode)'a' && c <= (sal_Unicode)'f' ) \
                num = num * 16 + (10 + c - (sal_Unicode)'a');       \
            else                        \
                num = num * 16 + (10 + c - (sal_Unicode)'A');       \
            /* Stop at the end of the pattern or after four digits. */ \
            if (p == pend || hexcnt == 4) {         \
                stop = true;                \
                break;                  \
            }                       \
            PATFETCH_RAW(c);                \
            hexcnt++;                   \
        }                                                       \
                                    \
        /* The loop ended on a non-hex character: give it back.  */ \
        if ( ! stop ) {                     \
            PATUNFETCH;                 \
            hexcnt--;                   \
        }                           \
        /* hexcnt cannot exceed 4 given the break above; the test is \
           purely defensive.  */                                    \
        if ( hexcnt > 4 || (num < 0 || num > 0xffff) ) num = -1;\
    }                                                               \
}
 362
 363
/* Number of failure points for which to initially allocate space
   when matching.  If this number is exceeded, we allocate more
   space, so it is not a hard limit.  */
#ifndef INIT_FAILURE_ALLOC
# define INIT_FAILURE_ALLOC 5
#endif

/* Allocate the initial failure stack (INIT_FAILURE_ALLOC elements) with
   REGEX_ALLOCATE_STACK and mark it empty.  Uses the variable `fail_stack'
   of the enclosing function and makes that function return -2 if the
   allocation yields NULL.  */
#define INIT_FAIL_STACK()                       \
  do {                                  \
    fail_stack.stack = (fail_stack_elt_t *)             \
      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
                                    \
    if (fail_stack.stack == NULL)                   \
      return -2;                            \
                                    \
    fail_stack.size = INIT_FAILURE_ALLOC;               \
    fail_stack.avail = 0;                       \
  } while (0)

/* Release the failure stack storage.  With the alloca configuration
   REGEX_FREE_STACK expands to nothing; with REGEX_MALLOC it is `free'.  */
#define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)

/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
   Growth is refused once the current size already exceeds
   re_max_failures * MAX_FAILURE_ITEMS elements.

   Return 1 if succeeds, and 0 if either ran out of memory
   allocating space for it or it was already too large.

   REGEX_REALLOCATE_STACK requires `destination' be declared.   */

#define DOUBLE_FAIL_STACK(fail_stack)                   \
  ((fail_stack).size > (sal_uInt32) (re_max_failures * MAX_FAILURE_ITEMS)   \
   ? 0                                  \
   : ((fail_stack).stack = (fail_stack_elt_t *)             \
        REGEX_REALLOCATE_STACK ((fail_stack).stack,             \
          (fail_stack).size * sizeof (fail_stack_elt_t),        \
          ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)),    \
                                    \
      (fail_stack).stack == NULL                    \
      ? 0                               \
      : ((fail_stack).size <<= 1,                   \
         1)))
 404
 405
 406 #define REG_UNSET_VALUE (&reg_unset_dummy)
 407 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
 408
 409 #define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
 410 #define IS_ACTIVE(R)  ((R).bits.is_active)
 411 #define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
 412 #define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)
 413
/* Call this when have matched a real character; it sets `matched' flags
   for the subexpressions which we are currently inside.  Also records
   that those subexprs have matched.

   Uses the variables `set_regs_matched_done', `lowest_active_reg',
   `highest_active_reg' and `reg_info' of the enclosing function.  The
   `set_regs_matched_done' flag makes repeated invocations cheap: the
   register loop is skipped until something resets the flag to 0.  */
#define SET_REGS_MATCHED()                                              \
  do {                                  \
      if (!set_regs_matched_done) {                 \
          sal_uInt32 r;                         \
          set_regs_matched_done = 1;                                    \
          for (r = lowest_active_reg; r <= highest_active_reg; r++) {   \
              MATCHED_SOMETHING(reg_info[r])                \
                = EVER_MATCHED_SOMETHING(reg_info[r])           \
                = 1;                                                    \
            }                                                           \
        }                                                               \
    }                                                                   \
  while (0)
 430
 431 #define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
 432
 433 /* This converts PTR, a pointer into the search string `string2' into an offset from the beginning of that string.  */
 434 #define POINTER_TO_OFFSET(ptr) ((sal_Int32) ((ptr) - string2))
 435
 436 /* This is the number of items that are pushed and popped on the stack
 437    for each register.  */
 438 #define NUM_REG_ITEMS  3
 439
 440 /* Individual items aside from the registers.  */
 441 # define NUM_NONREG_ITEMS 4
 442
 443 /* We push at most this many items on the stack.  */
 444 /* We used to use (num_regs - 1), which is the number of registers
 445    this regexp will save; but that was changed to 5
 446    to avoid stack overflow for a regexp with lots of parens.  */
 447 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
 448
 449 /* We actually push this many items.  */
 450 #define NUM_FAILURE_ITEMS                               \
 451   (((0                                                  \
 452      ? 0 : highest_active_reg - lowest_active_reg + 1)  \
 453     * NUM_REG_ITEMS)                                    \
 454    + NUM_NONREG_ITEMS)
 455
 456 /* How many items can still be added to the stack without overflowing it.  */
 457 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
 458
 459 /* Push a pointer value onto the failure stack.
 460    Assumes the variable `fail_stack'.  Probably should only
 461    be called from within `PUSH_FAILURE_POINT'.  */
 462 #define PUSH_FAILURE_POINTER(item)                                      \
 463   fail_stack.stack[fail_stack.avail++].pointer = (sal_Unicode *) (item)
 464
 465 /* This pushes an integer-valued item onto the failure stack.
 466    Assumes the variable `fail_stack'.  Probably should only
 467    be called from within `PUSH_FAILURE_POINT'.  */
 468 #define PUSH_FAILURE_INT(item)                                  \
 469   fail_stack.stack[fail_stack.avail++].integer = (item)
 470
 471 /* Push a fail_stack_elt_t value onto the failure stack.
 472    Assumes the variable `fail_stack'.  Probably should only
 473    be called from within `PUSH_FAILURE_POINT'.  */
 474 #define PUSH_FAILURE_ELT(item)                                  \
 475   fail_stack.stack[fail_stack.avail++] =  (item)
 476
 477 /* These three POP... operations complement the three PUSH... operations.
 478    All assume that `fail_stack' is nonempty.  */
 479 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
 480 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
 481 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
 482
 483 /* Test if at very beginning or at very end of `string2'. */
 484 #define AT_STRINGS_BEG(d) ((d) == string2 || !size2)
 485 #define AT_STRINGS_END(d) ((d) == end2)
 486
 487 /* Checking for end of string */
 488 #define PREFETCH() \
 489 do { \
 490     if ( d == end2 ) { \
 491         goto fail; \
 492     } \
 493 } while (0)
 494
 495
 496 sal_Bool
 497 Regexpr::iswordbegin(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize)
 498 {
 499    if ( d == string || ! ssize ) return true;
 500
 501    if ( !unicode::isAlphaDigit(d[-1]) && unicode::isAlphaDigit(d[0])) {
 502     return true;
 503    }
 504    return false;
 505 }
 506
 507 sal_Bool
 508 Regexpr::iswordend(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize)
 509 {
 510    if ( d == (string+ssize) ) return true;
 511
 512    if ( !unicode::isAlphaDigit(d[0]) && unicode::isAlphaDigit(d[-1])) {
 513     return true;
 514    }
 515    return false;
 516 }
 517
/* Push the information about the state we will need
   if we ever fail back to it.

   Requires variables fail_stack, regstart, regend, and reg_info
   be declared (plus lowest_active_reg and highest_active_reg, which
   bound the registers that get saved).  DOUBLE_FAIL_STACK requires
   `destination' be declared; this macro declares it itself.

   Items are pushed in this order: for each active register its start
   pointer, end pointer and info word; then lowest_active_reg,
   highest_active_reg, PATTERN_PLACE and STRING_PLACE.

   Does `return FAILURE_CODE' if runs out of memory.  */

#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)   \
  do {                                                                  \
    /* Scratch variable used by the alloca flavour of                   \
       REGEX_REALLOCATE_STACK.  */                                      \
    char *destination;                          \
    /* The register counter is unsigned here, so the register loop      \
       below is skipped only while lowest_active_reg is greater than    \
       highest_active_reg.  NOTE(review): presumably that is what the   \
       NO_LOWEST_ACTIVE_REG / NO_HIGHEST_ACTIVE_REG values guarantee    \
       when no group is active -- confirm in the matcher.  */           \
    sal_uInt32 this_reg;                                                \
                                                                        \
    /* Ensure we have enough space allocated for what we will push.  */ \
    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) {                  \
        if (!DOUBLE_FAIL_STACK(fail_stack))                            \
          return failure_code;                                          \
      }                                                                 \
                                                                        \
    /* Push the info, starting with the registers.  The `if (1)' is a   \
       leftover wrapper with no effect.  */                             \
    if (1)                                                              \
      for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
           this_reg++) {                                                 \
          PUSH_FAILURE_POINTER(regstart[this_reg]);                    \
                                                                        \
          PUSH_FAILURE_POINTER (regend[this_reg]);                      \
                                                                        \
          PUSH_FAILURE_ELT(reg_info[this_reg].word);                   \
        }                                                               \
                                                                        \
    PUSH_FAILURE_INT(lowest_active_reg);                               \
                                                                        \
    PUSH_FAILURE_INT(highest_active_reg);                              \
                                                                        \
    PUSH_FAILURE_POINTER(pattern_place);                               \
                                                                        \
    PUSH_FAILURE_POINTER(string_place);                                \
                                                                        \
  } while (0)
 563
 564 /* Pops what PUSH_FAIL_STACK pushes.
 565
 566    We restore into the parameters, all of which should be lvalues:
 567      STR -- the saved data position.
 568      PAT -- the saved pattern position.
 569      LOW_REG, HIGH_REG -- the highest and lowest active registers.
 570      REGSTART, REGEND -- arrays of string positions.
 571      REG_INFO -- array of information about each subexpression.
 572
 573    Also assumes the variables `fail_stack' and (if debugging), `bufp',
 574    `pend', `string2', and `size2'.  */
 575
 576 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info) {\
 577     sal_uInt32 this_reg;                                                \
 578     sal_Unicode *string_temp;                                     \
 579                                                                         \
 580   assert(!FAIL_STACK_EMPTY());                                        \
 581                                                                         \
 582   /* Remove failure points and point to how many regs pushed.  */       \
 583   assert(fail_stack.avail >= NUM_NONREG_ITEMS);                        \
 584                                                                         \
 585   /* If the saved string location is NULL, it came from an              \
 586      on_failure_keep_string_jump opcode, and we want to throw away the  \
 587      saved NULL, thus retaining our current position in the string.  */ \
 588   string_temp = POP_FAILURE_POINTER();                                 \
 589   if (string_temp != NULL)                                              \
 590     str = (const sal_Unicode *) string_temp;                                   \
 591                                                                         \
 592   pat = (sal_Unicode *) POP_FAILURE_POINTER();                       \
 593                                                                         \
 594   /* Restore register info.  */                                         \
 595   high_reg = (sal_uInt32) POP_FAILURE_INT();                         \
 596                                                                         \
 597   low_reg = (sal_uInt32) POP_FAILURE_INT();                          \
 598                                                                         \
 599   if (1)                                                                \
 600     for (this_reg = high_reg; this_reg >= low_reg; this_reg--) {         \
 601                                                                         \
 602         reg_info[this_reg].word = POP_FAILURE_ELT();                   \
 603                                                                         \
 604         regend[this_reg] = (const sal_Unicode *) POP_FAILURE_POINTER();       \
 605                                                                         \
 606         regstart[this_reg] = (const sal_Unicode *) POP_FAILURE_POINTER();     \
 607       } else {                                                          \
 608       for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) {\
 609           reg_info[this_reg].word.integer = 0;                          \
 610           regend[this_reg] = 0;                                         \
 611           regstart[this_reg] = 0;                                       \
 612         }                                                               \
 613       highest_active_reg = high_reg;                                    \
 614     }                                                                   \
 615                                                                         \
 616   set_regs_matched_done = 0;                                            \
 617 } /* POP_FAILURE_POINT */
 618
 619 inline
 620 void
 621 Regexpr::extract_number_and_incr( sal_Int32 & destination, sal_Unicode *& source )
 622 {
 623   extract_number(destination, source);
 624   source += 2;
 625 }
 626
 627
 628 inline
 629 void
 630 Regexpr::store_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg)
 631 {
 632   *loc = (sal_Unicode) op;
 633   store_number(loc + 1, arg);
 634 }
 635
 636 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2.  */
 637
 638 inline
 639 void
 640 Regexpr::store_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2)
 641 {
 642   *loc = (sal_Unicode) op;
 643   store_number(loc + 1, arg1);
 644   store_number(loc + 3, arg2);
 645 }
 646
 647 void
 648 Regexpr::insert_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg, sal_Unicode *end)
 649 {
 650   register sal_Unicode *pfrom = end;
 651   register sal_Unicode *pto = end + 3;
 652
 653   while (pfrom != loc) {
 654     *--pto = *--pfrom;
 655   }
 656
 657   store_op1(op, loc, arg);
 658 }
 659
 660
 661 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2.  */
 662
 663 void
 664 Regexpr::insert_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2, sal_Unicode *end)
 665 {
 666   register sal_Unicode *pfrom = end;
 667   register sal_Unicode *pto = end + 5;
 668
 669   while (pfrom != loc)
 670     *--pto = *--pfrom;
 671
 672   store_op2 (op, loc, arg1, arg2);
 673 }
 674
 675 /* P points to just after a ^ in PATTERN.  Return true if that ^ comes
 676    after an alternative or a begin-subexpression.  We assume there is at
 677    least one character before the ^.  */
 678
 679 sal_Bool
 680 Regexpr::at_begline_loc_p(const sal_Unicode *local_pattern, const sal_Unicode *p)
 681 {
 682   const sal_Unicode *prev = p - 2;
 683   sal_Bool prev_prev_backslash = prev > local_pattern && prev[-1] == '\\';
 684
 685   return(
 686      /* After a subexpression?  */
 687      (*prev == (sal_Unicode)'(' && prev_prev_backslash)
 688      /* After an alternative?  */
 689      || (*prev == (sal_Unicode)'|' && prev_prev_backslash));
 690 }
 691
 692 /* The dual of at_begline_loc_p.  This one is for $.  We assume there is
 693    at least one character after the $, i.e., `P < PEND'.  */
 694
 695 sal_Bool
 696 Regexpr::at_endline_loc_p(const sal_Unicode *p, const sal_Unicode * /* pend */ )
 697 {
 698   const sal_Unicode *next = p;
 699   //sal_Bool next_backslash = *next == (sal_Unicode)'\\';
 700   //const sal_Unicode *next_next = p + 1 < pend ? p + 1 : 0;
 701
 702   return(
 703      /* Before a subexpression?  */
 704      *next == (sal_Unicode)')'
 705      // (next_backslash && next_next && *next_next == (sal_Unicode)')')
 706      /* Before an alternative?  */
 707      || *next == (sal_Unicode)'|' );
 708   //    || (next_backslash && next_next && *next_next == (sal_Unicode)'|'));
 709 }
 710
 711 reg_errcode_t
 712 Regexpr::compile_range(sal_Unicode range_start, sal_Unicode range_end, sal_Unicode *b)
 713 {
 714   sal_uInt32 this_char;
 715
 716   /* If the start is after the end, the range is empty.  */
 717   if (range_start > range_end)
 718     return REG_NOERROR;
 719
 720   /* Here we see why `this_char' has to be larger than an `sal_Unicode'
 721      -- the range is inclusive, so if `range_end' == 0xffff
 722      (assuming 16-bit characters), we would otherwise go into an infinite
 723      loop, since all characters <= 0xffff.  */
 724   for (this_char = range_start; this_char <= range_end; this_char++) {
 725     set_list_bit( sal_Unicode(this_char), b);
 726   }
 727
 728   return REG_NOERROR;
 729 }
 730
 731 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
 732    false if it's not.  */
 733
 734 sal_Bool
 735 Regexpr::group_in_compile_stack(compile_stack_type compile_stack, sal_uInt32 regnum)
 736 {
 737   sal_Int32 this_element;
 738
 739   for (this_element = compile_stack.avail - 1;
 740        this_element >= 0;
 741        this_element--) {
 742     if (compile_stack.stack[this_element].regnum == regnum) {
 743       return true;
 744     }
 745   }
 746
 747   return false;
 748 }
 749
 750
 751 Regexpr::Regexpr( const ::com::sun::star::util::SearchOptions & rOptions,
 752          ::com::sun::star::uno::Reference<
 753          ::com::sun::star::i18n::XExtendedTransliteration > XTrans)
 754 {
 755   bufp = NULL;
 756   pattern = NULL;
 757
 758   if ( rOptions.algorithmType != ::com::sun::star::util::SearchAlgorithms_REGEXP ) {
 759     return;
 760   }
 761
 762   if ( rOptions.searchString == NULL ||
 763        rOptions.searchString.getLength() <= 0) {
 764     return;
 765   }
 766
 767   pattern = (sal_Unicode *)rOptions.searchString.getStr();
 768   patsize = rOptions.searchString.getLength();
 769
 770   re_max_failures = 2000;
 771
 772   translit = XTrans;
 773   translate = translit.is() ? 1 : 0;
 774
 775   bufp = NULL;
 776
 777   isIgnoreCase = ((rOptions.transliterateFlags &
 778         ::com::sun::star::i18n::TransliterationModules_IGNORE_CASE) != 0);
 779
 780   // Compile Regular expression pattern
 781   if ( regcomp() != REG_NOERROR )
 782     {
 783       if ( bufp )
 784     {
 785       if ( bufp->buffer )
 786         free(bufp->buffer);
 787       if( bufp->fastmap )
 788         free(bufp->fastmap);
 789
 790       free(bufp);
 791       bufp = NULL;
 792         }
 793     }
 794 }
 795
 796 Regexpr::~Regexpr()
 797 {
 798   //    translit->remove();
 799   if( bufp )
 800     {
 801       if( bufp->buffer )
 802     free(bufp->buffer);
 803       if( bufp->fastmap )
 804     free(bufp->fastmap);
 805
 806       free(bufp);
 807       bufp = NULL;
 808     }
 809
 810 }
 811
 812 // sets a new line to search in (restore start/end_ptr)
 813 void
 814 Regexpr::set_line(const sal_Unicode *new_line, sal_Int32 len)
 815 {
 816   line = new_line;
 817   linelen = len;
 818 }
 819
 820 // main function for searching the pattern
 821 // returns negative or startpos and sets regs
 822 sal_Int32
 823 Regexpr::re_search(struct re_registers *regs, sal_Int32 pOffset)
 824 {
 825   // Check if pattern buffer is NULL
 826   if ( bufp == NULL ) {
 827     return(-3);
 828   }
 829
 830   sal_Int32 range;
 831   sal_Int32 startpos;
 832   sal_Int32 stoppos;
 833
 834   startpos = pOffset;
 835   if ( linelen < 0 ) {
 836     range = linelen + 1;
 837     linelen = -(linelen);
 838     stoppos = pOffset + 1;
 839   } else {
 840     range = linelen - 1;
 841     stoppos = linelen;
 842   }
 843   for ( ; ; ) {
 844     sal_Int32 val = re_match2(regs, startpos, stoppos);
 845
 846 #ifndef REGEX_MALLOC
 847 # ifdef C_ALLOCA
 848     alloca (0);
 849 # endif
 850 #endif
 851
 852     // Return success if match found
 853     if (val == 0) {
 854       break;
 855     }
 856
 857     if (val == -2) {
 858       return(-2);
 859     }
 860
 861     // If match only beginning of string (startpos)
 862     if (!range) {
 863       break;
 864     }
 865
 866     // If search match from startpos to startpos+range
 867     else if (range > 0) {   // Forward string search
 868       range--;
 869       startpos++;
 870     } else {        // Reverse string search
 871       range++;
 872       startpos--;
 873     }
 874   }
 875
 876   if ( regs->num_of_match > 0 )
 877     return(0);
 878   else
 879     return(-1);
 880 }
 881
 882 sal_Int32
 883 Regexpr::regcomp()
 884 {
 885   bufp = (struct re_pattern_buffer *)malloc(sizeof(struct re_pattern_buffer));
 886   if ( bufp == NULL ) {
 887     return(-1);
 888   }
 889
 890   bufp->buffer = 0;
 891   bufp->allocated = 0;
 892   bufp->used = 0;
 893
 894   //bufp->fastmap = (sal_Unicode*) malloc((1 << BYTEWIDTH) * sizeof(sal_Unicode));
 895   // No fastmap with Unicode
 896   bufp->fastmap = NULL;
 897
 898   return(regex_compile());
 899 }
 900
 901 sal_Int32
 902 Regexpr::regex_compile()
 903 {
 904   register sal_Unicode c, c1;
 905   const sal_Unicode *p1;
 906   register sal_Unicode *b;
 907
 908   /* Keeps track of unclosed groups.  */
 909   compile_stack_type compile_stack;
 910
 911   /* Points to the current (ending) position in the pattern.  */
 912   const sal_Unicode *p = pattern;
 913   const sal_Unicode *pend = pattern + patsize;
 914
 915   /* Address of the count-byte of the most recently inserted `exactn'
 916      command.  This makes it possible to tell if a new exact-match
 917      character can be added to that command or if the character requires
 918      a new `exactn' command.  */
 919   sal_Unicode *pending_exact = 0;
 920
 921   /* Address of start of the most recently finished expression.
 922      This tells, e.g., postfix * where to find the start of its
 923      operand.  Reset at the beginning of groups and alternatives.  */
 924   sal_Unicode *laststart = 0;
 925
 926   /* Address of beginning of regexp, or inside of last group.  */
 927   sal_Unicode *begalt;
 928
 929   /* Place in the uncompiled pattern (i.e., the {) to
 930      which to go back if the interval is invalid.  */
 931   const sal_Unicode *beg_interval;
 932
 933   /* Address of the place where a forward jump should go to the end of
 934      the containing expression.  Each alternative of an `or' -- except the
 935      last -- ends with a forward jump of this sort.  */
 936   sal_Unicode *fixup_alt_jump = 0;
 937
 938   /* Counts open-groups as they are encountered.  Remembered for the
 939      matching close-group on the compile stack, so the same register
 940      number is put in the stop_memory as the start_memory.  */
 941   sal_Int32 regnum = 0;
 942
 943   /* Initialize the compile stack.  */
 944   compile_stack.stack = (compile_stack_elt_t *)malloc(INIT_COMPILE_STACK_SIZE * sizeof(compile_stack_elt_t));
 945   if (compile_stack.stack == NULL)
 946     return(REG_ESPACE);
 947
 948   compile_stack.size = INIT_COMPILE_STACK_SIZE;
 949   compile_stack.avail = 0;
 950
 951   /* Initialize the pattern buffer.  */
 952   bufp->fastmap_accurate = 0;
 953   bufp->not_bol = 0;
 954   bufp->not_eol = 0;
 955   bufp->newline_anchor = 1;
 956
 957   /* Set `used' to zero, so that if we return an error, the pattern
 958      printer (for debugging) will think there's no pattern.  We reset it
 959      at the end.  */
 960   bufp->used = 0;
 961
 962   /* Always count groups. */
 963   bufp->re_nsub = 0;
 964
 965   if (bufp->allocated == 0) {
 966     if (bufp->buffer) {
 967       /* If zero allocated, but buffer is non-null, try to realloc
 968      enough space.  This loses if buffer's address is bogus, but
 969      that is the user's responsibility.  */
 970       bufp->buffer = (sal_Unicode *)realloc(bufp->buffer, INIT_BUF_SIZE * sizeof(sal_Unicode));
 971     } else { /* Caller did not allocate a buffer.  Do it for them.  */
 972       bufp->buffer = (sal_Unicode *)malloc(INIT_BUF_SIZE * sizeof(sal_Unicode));
 973     }
 974     if (!bufp->buffer) FREE_STACK_RETURN(REG_ESPACE);
 975
 976     bufp->allocated = INIT_BUF_SIZE;
 977   }
 978
 979   begalt = b = bufp->buffer;
 980
 981   /* Loop through the uncompiled pattern until we're at the end.  */
 982   while (p != pend) {
 983     PATFETCH_RAW(c);
 984
 985     switch (c) {
 986     case (sal_Unicode)'^': {
 987       if (   /* If at start of pattern, it's an operator.  */
 988       p == pattern + 1
 989       /* Otherwise, depends on what's come before.  */
 990       || at_begline_loc_p(pattern, p))
 991     BUF_PUSH(begline);
 992       else
 993     goto normal_char;
 994     }
 995     break;
 996
 997     case (sal_Unicode)'$': {
 998       if (   /* If at end of pattern, it's an operator.  */
 999       p == pend
1000       /* Otherwise, depends on what's next.  */
1001       || at_endline_loc_p(p, pend)) {
1002     BUF_PUSH(endline);
1003       } else {
1004     goto normal_char;
1005       }
1006     }
1007     break;
1008
1009     case (sal_Unicode)'+':
1010     case (sal_Unicode)'?':
1011     case (sal_Unicode)'*':
1012       /* If there is no previous pattern... */
1013       if (!laststart) {
1014     goto normal_char;
1015       }
1016
1017       {
1018     /* Are we optimizing this jump?  */
1019     sal_Bool keep_string_p = false;
1020
1021     /* 1 means zero (many) matches is allowed.  */
1022     sal_Unicode zero_times_ok = 0, many_times_ok = 0;
1023
1024     /* If there is a sequence of repetition chars, collapse it
1025        down to just one (the right one).  We can't combine
1026        interval operators with these because of, e.g., `a{2}*',
1027        which should only match an even number of `a's.  */
1028
1029     for (;;) {
1030       zero_times_ok |= c != (sal_Unicode)'+';
1031       many_times_ok |= c != (sal_Unicode)'?';
1032
1033       if (p == pend)
1034         break;
1035
1036       PATFETCH_RAW(c);
1037
1038       if (c == (sal_Unicode)'*' || (c == (sal_Unicode)'+'
1039                     || c == (sal_Unicode)'?')) {
1040       } else {
1041         PATUNFETCH;
1042         break;
1043       }
1044
1045       /* If we get here, we found another repeat character.  */
1046     }
1047
1048     /* Star, etc. applied to an empty pattern is equivalent
1049        to an empty pattern.  */
1050     if (!laststart) {
1051       break;
1052     }
1053
1054     /* Now we know whether or not zero matches is allowed
1055        and also whether or not two or more matches is allowed.  */
1056     if (many_times_ok) {
1057       /* More than one repetition is allowed, so put in at the
1058          end a backward relative jump from `b' to before the next
1059          jump we're going to put in below (which jumps from
1060          laststart to after this jump).
1061
1062          But if we are at the `*' in the exact sequence `.*\n',
1063          insert an unconditional jump backwards to the .,
1064          instead of the beginning of the loop.  This way we only
1065          push a failure point once, instead of every time
1066          through the loop.  */
1067       assert(p - 1 > pattern);
1068
1069       /* Allocate the space for the jump.  */
1070       GET_BUFFER_SPACE(3);
1071
1072       /* We know we are not at the first character of the pattern,
1073          because laststart was nonzero.  And we've already
1074          incremented `p', by the way, to be the character after
1075          the `*'.  Do we have to do something analogous here
1076          for null bytes, because of RE_DOT_NOT_NULL?  */
1077       if (*(p - 2) == (sal_Unicode)'.'
1078           && zero_times_ok
1079           && p < pend && *p == (sal_Unicode)'\n') {
1080         /* We have .*\n.  */
1081         STORE_JUMP(jump, b, laststart);
1082         keep_string_p = true;
1083       } else {
1084         /* Anything else.  */
1085         STORE_JUMP(maybe_pop_jump, b, laststart - 3);
1086       }
1087
1088       /* We've added more stuff to the buffer.  */
1089       b += 3;
1090     }
1091
1092     /* On failure, jump from laststart to b + 3, which will be the
1093        end of the buffer after this jump is inserted.  */
1094     GET_BUFFER_SPACE(3);
1095     INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump
1096             : on_failure_jump,
1097             laststart, b + 3);
1098     pending_exact = 0;
1099     b += 3;
1100
1101     if (!zero_times_ok) {
1102       /* At least one repetition is required, so insert a
1103          `dummy_failure_jump' before the initial
1104          `on_failure_jump' instruction of the loop. This
1105          effects a skip over that instruction the first time
1106          we hit that loop.  */
1107       GET_BUFFER_SPACE(3);
1108       INSERT_JUMP(dummy_failure_jump, laststart, laststart + 6);
1109       b += 3;
1110     }
1111       }
1112       break;
1113
1114     case (sal_Unicode)'.':
1115       laststart = b;
1116       BUF_PUSH(anychar);
1117       break;
1118
1119
1120     case (sal_Unicode)'[': {
1121       sal_Bool have_range = false;
1122       sal_Unicode last_char = 0xffff;
1123       sal_Unicode first_range = 0xffff;
1124       sal_Unicode second_range = 0xffff;
1125       sal_Int16 bsiz;
1126
1127       if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1128
1129       /* Ensure that we have enough space to push a charset: the
1130      opcode, the length count, and the bitset;
1131      1 + 1 + (1 << BYTEWIDTH) / BYTEWIDTH "bytes" in all.  */
1132       bsiz = 2 + ((1 << BYTEWIDTH) / BYTEWIDTH);
1133       GET_BUFFER_SPACE(bsiz);
1134
1135       laststart = b;
1136
1137       /* We test `*p == '^' twice, instead of using an if
1138      statement, so we only need one BUF_PUSH.  */
1139       BUF_PUSH (*p == (sal_Unicode)'^' ? charset_not : charset);
1140       if (*p == (sal_Unicode)'^')
1141     p++;
1142
1143       /* Remember the first position in the bracket expression.  */
1144       p1 = p;
1145
1146       /* Push the number of "bytes" in the bitmap.  */
1147       BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
1148
1149       /* Clear the whole map.  */
1150       memset(b, 0, ((1 << BYTEWIDTH) / BYTEWIDTH) * sizeof(sal_Unicode));
1151
1152       /* Read in characters and ranges, setting map bits.  */
1153       for (;;) {
1154     if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1155
1156     PATFETCH_RAW(c);
1157
1158     if ( c == (sal_Unicode)'\\' ) {
1159
1160       PATFETCH_RAW(c);
1161
1162       if ( c == (sal_Unicode)'x' ) {
1163         sal_Int32 UniChar = -1;
1164
1165         GET_HEX_NUMBER(UniChar);
1166         if (UniChar < 0 || UniChar > 0xffff) FREE_STACK_RETURN(REG_BADPAT);
1167         c = (sal_Unicode) UniChar;
1168         last_char = c;
1169         set_list_bit(last_char, b);
1170       } else {
1171         last_char = c;
1172         set_list_bit(last_char, b);
1173       }
1174     } else if (c == (sal_Unicode)']') {
1175       /* Could be the end of the bracket expression.  If it's
1176          not (i.e., when the bracket expression is `[]' so
1177          far), the ']' character bit gets set way below.  */
1178         break;
1179     } else if ( c == (sal_Unicode)'-' ) {
1180       if ( !have_range ) {
1181         if ( last_char != 0xffff ) {
1182           first_range = last_char;
1183           have_range = true;
1184           continue;
1185         } else {
1186           last_char = (sal_Unicode)'-';
1187           set_list_bit(last_char, b);
1188         }
1189       }
1190         }
1191
1192     /* See if we're at the beginning of a possible character
1193        class.  */
1194     else if (c == (sal_Unicode)':' && p[-2] == (sal_Unicode)'[') {
1195       /* Leave room for the null.  */
1196       sal_Unicode str[CHAR_CLASS_MAX_LENGTH + 1];
1197
1198       PATFETCH_RAW(c);
1199       c1 = 0;
1200
1201       /* If pattern is `[[:'.  */
1202       if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1203
1204       str[c1++] = c;
1205       for (;;) {
1206         PATFETCH_RAW(c);
1207         if ((c == (sal_Unicode)':' && *p == (sal_Unicode)']') || p == pend)
1208           break;
1209         if (c1 < CHAR_CLASS_MAX_LENGTH)
1210           str[c1++] = c;
1211         else
1212                 /* This is in any case an invalid class name.  */
1213           str[0] = (sal_Unicode)'\0';
1214       }
1215       str[c1] = (sal_Unicode)'\0';
1216
1217       /* If isn't a word bracketed by `[:' and `:]':
1218          undo the ending character, the letters, and leave
1219          the leading `:' and `[' (but set bits for them).  */
1220       if (c == (sal_Unicode)':' && *p == (sal_Unicode)']') {
1221         sal_Int32 ch;
1222         // no support for GRAPH, PUNCT, or XDIGIT yet
1223         sal_Bool is_alnum = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"alnum").getStr());
1224         sal_Bool is_alpha = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"alpha").getStr());
1225         sal_Bool is_cntrl = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"cntrl").getStr());
1226         sal_Bool is_digit = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"digit").getStr());
1227         sal_Bool is_lower = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"lower").getStr());
1228         sal_Bool is_print = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"print").getStr());
1229         sal_Bool is_space = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"space").getStr());
1230         sal_Bool is_upper = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"upper").getStr());
1231
1232         if (!(is_alnum || is_alpha || is_cntrl ||
1233           is_digit || is_lower || is_print || is_space || is_upper) )
1234           FREE_STACK_RETURN(REG_ECTYPE);
1235
1236         /* Throw away the ] at the end of the character
1237            class.  */
1238         PATFETCH_RAW(c);
1239
1240         if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1241
1242         for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
1243                 /* This was split into 3 if's to
1244                    avoid an arbitrary limit in some compiler.  */
1245           if (   (is_alnum  && unicode::isAlphaDigit(sal_Unicode(ch))) ||
1246              (is_alpha  && unicode::isAlpha(sal_Unicode(ch))) ||
1247              (is_cntrl  && unicode::isControl(sal_Unicode(ch))))
1248         set_list_bit(sal_Unicode(ch), b);
1249           if (   (is_digit  && unicode::isDigit(sal_Unicode(ch))) ||
1250              (is_lower  && unicode::isLower(sal_Unicode(ch))) ||
1251              (is_print  && unicode::isPrint(sal_Unicode(ch))))
1252         set_list_bit(sal_Unicode(ch), b);
1253           if (   (is_space  && unicode::isSpace(sal_Unicode(ch))) ||
1254              (is_upper  && unicode::isUpper(sal_Unicode(ch))) )
1255         set_list_bit(sal_Unicode(ch), b);
1256           if ( isIgnoreCase && (is_upper || is_lower) &&
1257              (unicode::isUpper(sal_Unicode(ch)) || unicode::isLower(sal_Unicode(ch))))
1258         set_list_bit(sal_Unicode(ch), b);
1259         }
1260         break;
1261       } else {
1262         p = p1+1;
1263         last_char = (sal_Unicode)':';
1264         set_list_bit(last_char, b);
1265       }
1266     } else {
1267       last_char = c;
1268       set_list_bit(last_char, b);
1269     }
1270     if ( have_range ) {
1271       if ( last_char != 0xffff ) {
1272         second_range = last_char;
1273         have_range = false;
1274         compile_range(first_range, second_range, b);
1275       } else FREE_STACK_RETURN(REG_EBRACK);
1276     } else {
1277       if ( last_char != 0xffff ) {
1278         set_list_bit(last_char, b);
1279       } else FREE_STACK_RETURN(REG_EBRACK);
1280     }
1281       }
1282
1283       /* Discard any (non)matching list bytes that are all 0 at the
1284      end of the map.  Decrease the map-length byte too.  */
1285       bsiz = b[-1];
1286       while ((sal_Int16) bsiz > 0 && b[bsiz - 1] == 0)
1287     bsiz--;
1288       b[-1] = (sal_Unicode)bsiz;
1289       b += bsiz;
1290     }
1291     break;
1292
1293     case (sal_Unicode)'(':
1294       goto handle_open;
1295
1296     case (sal_Unicode)')':
1297       goto handle_close;
1298
1299     case (sal_Unicode)'\n':
1300       goto normal_char;
1301
1302     case (sal_Unicode)'|':
1303       goto handle_alt;
1304
1305     case (sal_Unicode)'{':
1306       goto handle_interval;
1307
1308     case (sal_Unicode)'\\':
1309       if (p == pend) FREE_STACK_RETURN(REG_EESCAPE);
1310
1311       /* Do not translate the character after the \, so that we can
1312      distinguish, e.g., \B from \b, even if we normally would
1313      translate, e.g., B to b.  */
1314       PATFETCH_RAW(c);
1315
1316       switch (c) {
1317       case (sal_Unicode)'(':
1318     goto normal_backslash;
1319
1320       handle_open:
1321     bufp->re_nsub++;
1322     regnum++;
1323
1324     if (COMPILE_STACK_FULL) {
1325       compile_stack.stack = (compile_stack_elt_t *)realloc(compile_stack.stack, (compile_stack.size << 1) * sizeof(compile_stack_elt_t));
1326       if (compile_stack.stack == NULL) return(REG_ESPACE);
1327
1328       compile_stack.size <<= 1;
1329     }
1330
1331     /* These are the values to restore when we hit end of this
1332        group.  They are all relative offsets, so that if the
1333        whole pattern moves because of realloc, they will still
1334        be valid.  */
1335     COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1336     COMPILE_STACK_TOP.fixup_alt_jump
1337       = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1338     COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1339     COMPILE_STACK_TOP.regnum = regnum;
1340
1341     /* We will eventually replace the 0 with the number of
1342        groups inner to this one.  But do not push a
1343        start_memory for groups beyond the last one we can
1344        represent in the compiled pattern.  */
1345     if (regnum <= MAX_REGNUM) {
1346       COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1347       BUF_PUSH_3 (start_memory, regnum, 0);
1348     }
1349
1350     compile_stack.avail++;
1351
1352     fixup_alt_jump = 0;
1353     laststart = 0;
1354     begalt = b;
1355     /* If we've reached MAX_REGNUM groups, then this open
1356        won't actually generate any code, so we'll have to
1357        clear pending_exact explicitly.  */
1358     pending_exact = 0;
1359     break;
1360
1361
1362       case (sal_Unicode)')':
1363     goto normal_backslash;
1364
1365     // unreachable (after goto):
1366 #if 0
1367     if (COMPILE_STACK_EMPTY) {
1368       FREE_STACK_RETURN(REG_ERPAREN);
1369     }
1370 #endif
1371
1372       handle_close:
1373     if (fixup_alt_jump) {
1374       /* Push a dummy failure point at the end of the
1375          alternative for a possible future
1376          `pop_failure_jump' to pop.  See comments at
1377          `push_dummy_failure' in `re_match2'.  */
1378       BUF_PUSH(push_dummy_failure);
1379
1380       /* We allocated space for this jump when we assigned
1381          to `fixup_alt_jump', in the `handle_alt' case below.  */
1382       STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1);
1383     }
1384
1385     /* See similar code for backslashed left paren above.  */
1386     if (COMPILE_STACK_EMPTY) {
1387       FREE_STACK_RETURN(REG_ERPAREN);
1388     }
1389
1390     /* Since we just checked for an empty stack above, this
1391        ``can't happen''.  */
1392     assert (compile_stack.avail != 0);
1393
1394     {
1395       /* We don't just want to restore into `regnum', because
1396          later groups should continue to be numbered higher,
1397          as in `(ab)c(de)' -- the second group is #2.  */
1398       sal_Int32 this_group_regnum;
1399
1400       compile_stack.avail--;
1401       begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1402       fixup_alt_jump
1403         = COMPILE_STACK_TOP.fixup_alt_jump
1404         ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1405         : 0;
1406       laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1407       this_group_regnum = COMPILE_STACK_TOP.regnum;
1408       /* If we've reached MAX_REGNUM groups, then this open
1409          won't actually generate any code, so we'll have to
1410          clear pending_exact explicitly.  */
1411       pending_exact = 0;
1412
1413       /* We're at the end of the group, so now we know how many
1414          groups were inside this one.  */
1415       if (this_group_regnum <= MAX_REGNUM) {
1416         sal_Unicode *inner_group_loc
1417           = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1418
1419         *inner_group_loc = sal::static_int_cast<sal_Unicode>( regnum - this_group_regnum );
1420         BUF_PUSH_3 (stop_memory, this_group_regnum,
1421             regnum - this_group_regnum);
1422       }
1423     }
1424     break;
1425
1426
1427       case (sal_Unicode)'|':            /* `\|'.
1428                          * */
1429     goto normal_backslash;
1430       handle_alt:
1431
1432     /* Insert before the previous alternative a jump which
1433        jumps to this alternative if the former fails.  */
1434     GET_BUFFER_SPACE (3);
1435     INSERT_JUMP (on_failure_jump, begalt, b + 6);
1436     pending_exact = 0;
1437     b += 3;
1438
1439     /* The alternative before this one has a jump after it
1440        which gets executed if it gets matched.  Adjust that
1441        jump so it will jump to this alternative's analogous
1442        jump (put in below, which in turn will jump to the next
1443        (if any) alternative's such jump, etc.).  The last such
1444        jump jumps to the correct final destination.  A picture:
1445        _____ _____
1446        |   | |   |
1447        |   v |   v
1448        a | b   | c
1449
1450        If we are at `b', then fixup_alt_jump right now points to a
1451        three-byte space after `a'.  We'll put in the jump, set
1452        fixup_alt_jump to right after `b', and leave behind three
1453        bytes which we'll fill in when we get to after `c'.  */
1454
1455     if (fixup_alt_jump)
1456       STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1457
1458     /* Mark and leave space for a jump after this alternative,
1459        to be filled in later either by next alternative or
1460        when know we're at the end of a series of alternatives.  */
1461     fixup_alt_jump = b;
1462     GET_BUFFER_SPACE (3);
1463     b += 3;
1464
1465     laststart = 0;
1466     begalt = b;
1467     break;
1468
1469
1470       case (sal_Unicode)'{':
1471     goto normal_backslash;
1472
1473       handle_interval:
1474     {
1475       /* allows intervals.  */
1476       /* At least (most) this many matches must be made.  */
1477       sal_Int32 lower_bound = -1, upper_bound = -1;
1478
1479       beg_interval = p - 1;
1480
1481       if (p == pend) {
1482         goto unfetch_interval;
1483       }
1484
1485       GET_UNSIGNED_NUMBER(lower_bound);
1486
1487       if (c == (sal_Unicode)',') {
1488         GET_UNSIGNED_NUMBER(upper_bound);
1489         if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1490       } else
1491         /* Interval such as `{1}' => match exactly once. */
1492         upper_bound = lower_bound;
1493
1494       if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1495           || lower_bound > upper_bound) {
1496         goto unfetch_interval;
1497       }
1498
1499       if (c != (sal_Unicode)'}') {
1500         goto unfetch_interval;
1501       }
1502
1503       /* We just parsed a valid interval.  */
1504
1505       /* If it's invalid to have no preceding re.  */
1506       if (!laststart) {
1507         goto unfetch_interval;
1508       }
1509
1510       /* If the upper bound is zero, don't want to succeed at
1511          all; jump from `laststart' to `b + 3', which will be
1512          the end of the buffer after we insert the jump.  */
1513       if (upper_bound == 0) {
1514         GET_BUFFER_SPACE(3);
1515         INSERT_JUMP(jump, laststart, b + 3);
1516         b += 3;
1517       }
1518
1519       /* Otherwise, we have a nontrivial interval.  When
1520          we're all done, the pattern will look like:
1521          set_number_at <jump count> <upper bound>
1522          set_number_at <succeed_n count> <lower bound>
1523          succeed_n <after jump addr> <succeed_n count>
1524          <body of loop>
1525          jump_n <succeed_n addr> <jump count>
1526          (The upper bound and `jump_n' are omitted if
1527          `upper_bound' is 1, though.)  */
1528       else {
1529         /* If the upper bound is > 1, we need to insert
1530            more at the end of the loop.  */
1531         unsigned nbytes = 10 + (upper_bound > 1) * 10;
1532
1533         GET_BUFFER_SPACE(nbytes);
1534
1535         /* Initialize lower bound of the `succeed_n', even
1536            though it will be set during matching by its
1537            attendant `set_number_at' (inserted next),
1538            because `re_compile_fastmap' needs to know.
1539            Jump to the `jump_n' we might insert below.  */
1540         INSERT_JUMP2(succeed_n, laststart,
1541              b + 5 + (upper_bound > 1) * 5,
1542              lower_bound);
1543         b += 5;
1544
1545         /* Code to initialize the lower bound.  Insert
1546            before the `succeed_n'.  The `5' is the last two
1547            bytes of this `set_number_at', plus 3 bytes of
1548            the following `succeed_n'.  */
1549         insert_op2(set_number_at, laststart, 5, lower_bound, b);
1550         b += 5;
1551
1552         if (upper_bound > 1) {
1553                 /* More than one repetition is allowed, so
1554                    append a backward jump to the `succeed_n'
1555                    that starts this interval.
1556
1557                    When we've reached this during matching,
1558                    we'll have matched the interval once, so
1559                    jump back only `upper_bound - 1' times.  */
1560           STORE_JUMP2(jump_n, b, laststart + 5,
1561               upper_bound - 1);
1562           b += 5;
1563
1564                 /* The location we want to set is the second
1565                    parameter of the `jump_n'; that is `b-2' as
1566                    an absolute address.  `laststart' will be
1567                    the `set_number_at' we're about to insert;
1568                    `laststart+3' the number to set, the source
1569                    for the relative address.  But we are
1570                    inserting into the middle of the pattern --
1571                    so everything is getting moved up by 5.
1572                    Conclusion: (b - 2) - (laststart + 3) + 5,
1573                    i.e., b - laststart.
1574
1575                    We insert this at the beginning of the loop
1576                    so that if we fail during matching, we'll
1577                    reinitialize the bounds.  */
1578           insert_op2(set_number_at, laststart, b - laststart,
1579              upper_bound - 1, b);
1580           b += 5;
1581         }
1582       }
1583       pending_exact = 0;
1584       beg_interval = NULL;
1585     }
1586     break;
1587
1588       unfetch_interval:
1589     /* If an invalid interval, match the characters as literals.  */
1590     assert (beg_interval);
1591     p = beg_interval;
1592     beg_interval = NULL;
1593
1594     /* normal_char and normal_backslash need `c'.  */
1595     PATFETCH_RAW(c);
1596
1597     goto normal_char;
1598
1599       case (sal_Unicode)'`':
1600     BUF_PUSH(begbuf);
1601     break;
1602
1603       case (sal_Unicode)'\'':
1604     BUF_PUSH(endbuf);
1605     break;
1606
1607       case (sal_Unicode)'1': case (sal_Unicode)'2':
1608       case (sal_Unicode)'3': case (sal_Unicode)'4':
1609       case (sal_Unicode)'5': case (sal_Unicode)'6':
1610       case (sal_Unicode)'7': case (sal_Unicode)'8':
1611       case (sal_Unicode)'9':
1612     c1 = c - (sal_Unicode)'0';
1613
1614     if (c1 > regnum)
1615       FREE_STACK_RETURN(REG_ESUBREG);
1616
1617     /* Can't back reference to a subexpression if inside of it.  */
1618     if (group_in_compile_stack(compile_stack, (sal_uInt32) c1)) {
1619       goto normal_char;
1620     }
1621
1622     laststart = b;
1623     BUF_PUSH_2(duplicate, c1);
1624     break;
1625
1626
1627       case (sal_Unicode)'+':
1628       case (sal_Unicode)'?':
1629     goto normal_backslash;
1630
1631       case (sal_Unicode)'x':        // Unicode char
1632     {
1633       sal_Int32 UniChar = -1;
1634
1635       GET_HEX_NUMBER(UniChar);
1636       if (UniChar < 0 || UniChar > 0xffff) FREE_STACK_RETURN(REG_BADPAT);
1637       c = (sal_Unicode) UniChar;
1638       goto normal_char;
1639     }
1640     // break;   // unreachable - see goto above
1641
1642       case (sal_Unicode)'<':        // begin Word boundary
1643     BUF_PUSH(wordbeg);
1644     break;
1645
1646       case (sal_Unicode)'>':        // end Word boundary
1647     BUF_PUSH(wordend);
1648     break;
1649
1650       case (sal_Unicode)'n':
1651     c = 0x0a;
1652     goto normal_char;
1653
1654       case (sal_Unicode)'t':
1655     c = 0x09;
1656     goto normal_char;
1657
1658       default:
1659       normal_backslash:
1660     goto normal_char;
1661       }
1662       break;
1663
1664     default:
1665       /* Expects the character in `c'.  */
1666     normal_char:
1667       /* If no exactn currently being built.  */
1668       if ( pending_exact == NULL
1669
1670        /* If last exactn not at current position.  */
1671        || pending_exact + *pending_exact + 1 != b
1672
1673        /* We have only one sal_Unicode char following the
1674           exactn for the count.  */
1675        || *pending_exact == (1 << BYTEWIDTH) - 1
1676
1677        /* If followed by a repetition operator.  */
1678        || *p == (sal_Unicode)'*' || *p == (sal_Unicode)'^'
1679        || *p == (sal_Unicode)'+' || *p == (sal_Unicode)'?'
1680        || *p == (sal_Unicode) '{' ) {
1681     /* Start building a new exactn.  */
1682     laststart = b;
1683     BUF_PUSH_2(exactn, 0);
1684     pending_exact = b - 1;
1685       }
1686
1687       if ( translate ) {
1688         try {
1689             sal_Unicode tmp = translit->transliterateChar2Char(c);
1690             BUF_PUSH(tmp);
1691             (*pending_exact)++;
1692         } catch (::com::sun::star::i18n::MultipleCharsOutputException e) {
1693             ::rtl::OUString o2( translit->transliterateChar2String( c));
1694             sal_Int32 len2 = o2.getLength();
1695             const sal_Unicode * k2 = o2.getStr();
1696             for (sal_Int32 nmatch = 0; nmatch < len2; nmatch++) {
1697               BUF_PUSH(k2[nmatch]);
1698               (*pending_exact)++;
1699             }
1700         }
1701       } else {
1702     BUF_PUSH(c);
1703     (*pending_exact)++;
1704       }
1705       break;
1706     } /* switch (c) */
1707   } /* while p != pend */
1708
1709   /* Through the pattern now.  */
1710
1711   if (fixup_alt_jump)
1712     STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
1713
1714   if (!COMPILE_STACK_EMPTY)
1715     FREE_STACK_RETURN(REG_EPAREN);
1716
1717   // Assumes no backtracking
1718   BUF_PUSH(succeed);
1719
1720   if ( compile_stack.stack )
1721     free(compile_stack.stack);
1722   compile_stack.stack = NULL;
1723
1724   /* We have succeeded; set the length of the buffer.  */
1725   bufp->used = b - bufp->buffer;
1726
1727   return REG_NOERROR;
1728 } /* regex_compile */
1729
1730 /* Compare the first LEN sal_Unicode units of S1 and S2 exactly as stored
1731    (no translation is applied here).  Returns 0 if all are equal, else 1.  */
1732
1733 sal_Int32
1734 Regexpr::bcmp_translate(const sal_Unicode *s1, const sal_Unicode *s2, sal_Int32 len)
1735 {
1736   const sal_Unicode *lhs = s1;
1737   const sal_Unicode *rhs = s2;
1738   /* Walk both buffers in lock step; a LEN of zero or less compares nothing.  */
1739   for (sal_Int32 left = len; left > 0; --left, ++lhs, ++rhs)
1740     if (*lhs != *rhs)
1741       return 1;   /* first differing unit decides the result */
1742   return 0;
1743 }
1744
1745
1746 /* Decide whether a whole group can match the empty string.
1747
1748    P:    in/out; on entry *P points at the register number that follows
1749          a start_memory opcode.
1750    END:  upper bound of the scan.  REG_INFO is passed on to the helpers.
1751    Returns true only on reaching the matching stop_memory, after setting *P
1752    to one past that stop_memory's register number; any false return leaves
1753    *P unchanged.  NOTE(review): stop_memory is described elsewhere as taking
1754    two arguments; confirm.  We don't handle duplicates properly (yet).  */
1755
1756 sal_Bool
1757 Regexpr::group_match_null_string_p(sal_Unicode **p, sal_Unicode *end, register_info_type *reg_info)
1758 {
1759   sal_Int32 mcnt;
1760 /* Start scanning two units past *P, i.e. after the args to the start_memory.  */
1761     sal_Unicode *p1 = *p + 2;
1762
1763     while (p1 < end) {
1764     /* Skip over opcodes that can match nothing, and return true or
1765        false, as appropriate, when we get to one that can't, or to the
1766        matching stop_memory.  */
1767
1768       switch ((re_opcode_t) *p1) {
1769     /* Could be either a loop or a series of alternatives.  */
1770       case on_failure_jump:
1771     p1++;   /* step over the on_failure_jump opcode itself */
1772     extract_number_and_incr(mcnt, p1);   /* mcnt = the number stored after the opcode */
1773
1774     /* A non-negative number means the next operation is not a jump
1775        backwards in the pattern, so treat it as alternatives.  */
1776
1777     if (mcnt >= 0) {
1778       /* Go through the on_failure_jumps of the alternatives,
1779          seeing if any of the alternatives cannot match nothing.
1780          The last alternative starts with only a jump,
1781          whereas the rest start with on_failure_jump and end
1782          with a jump, e.g., here is the pattern for `a|b|c':
1783
1784          /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
1785          /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
1786          /exactn/1/c
1787
1788          So, we have to first go through the first (n-1)
1789          alternatives and then deal with the last one separately.  */
1790
1791
1792       /* Deal with the first (n-1) alternatives, which start
1793          with an on_failure_jump (see above) that jumps to right
1794          past a jump_past_alt.  */
1795
1796       while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) {
1797         /* `mcnt' holds how many units long the alternative
1798            is, including the ending `jump_past_alt' and
1799            its number.  */
1800
1801         if (!alt_match_null_string_p(p1, p1 + mcnt - 3, reg_info))
1802           return false;   /* one alternative that cannot be empty decides it */
1803
1804         /* Move to right after this alternative, including the
1805            jump_past_alt.  */
1806         p1 += mcnt;
1807
1808         /* Break if it's the beginning of an n-th alternative
1809            that doesn't begin with an on_failure_jump.  */
1810         if ((re_opcode_t) *p1 != on_failure_jump)
1811           break;
1812
1813         /* Still have to check that it's not an n-th
1814            alternative that starts with an on_failure_jump.  */
1815         p1++;
1816         extract_number_and_incr(mcnt, p1);
1817         if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) {
1818           /* Get to the beginning of the n-th alternative.  */
1819           p1 -= 3;
1820           break;
1821         }
1822       }
1823
1824       /* Deal with the last alternative: go back and get number
1825          of the `jump_past_alt' just before it.  `mcnt' contains
1826          the length of the alternative.  */
1827       extract_number(mcnt, p1 - 2);
1828
1829       if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
1830         return false;
1831
1832       p1 += mcnt;       /* Get past the n-th alternative.  */
1833     } /* if mcnt >= 0 */
1834     break;   /* a negative number only has opcode and number consumed */
1835
1836
1837       case stop_memory:
1838     assert (p1[1] == **p);   /* must close the register we were asked about */
1839     *p = p1 + 2;   /* one past the stop_memory's register number */
1840     return true;
1841
1842
1843       default:
1844     if (!common_op_match_null_string_p(&p1, end, reg_info))   /* advances p1 on success */
1845       return false;
1846       }
1847     } /* while p1 < end */
1848
1849  return false;   /* reached END without meeting the stop_memory */
1850 } /* group_match_null_string_p */
1851
1852 /* Empty-match test for a single alternative.  P is the first unit of the
1853    alternative and END is one past its last unit; alternation itself is
1854    not handled here, although the alternative may contain groups.  */
1855
1856 sal_Bool
1857 Regexpr::alt_match_null_string_p(sal_Unicode *p, sal_Unicode *end, register_info_type *reg_info)
1858 {
1859   sal_Unicode *cursor = p;
1860   sal_Int32 skip;   /* number read from the pattern after on_failure_jump */
1861
1862   /* Every opcode before END has to be able to match nothing; the
1863      first one that cannot settles the answer as false.  */
1864   while (cursor < end) {
1865     if ((re_opcode_t) *cursor == on_failure_jump) {
1866       /* It's a loop: step over the opcode, read its number and
1867          move the cursor on by that amount.  */
1868       ++cursor;
1869       extract_number_and_incr(skip, cursor);
1870       cursor += skip;
1871       continue;
1872     }
1873
1874     /* Any other opcode is judged by the shared helper, which moves
1875        `cursor' on when it reports success.  */
1876     if (!common_op_match_null_string_p(&cursor, end, reg_info))
1877       return false;
1878   }  /* while cursor < end */
1879
1880   return true;
1881 } /* alt_match_null_string_p */
1882
1883
1884 /* Deals with the ops common to group_match_null_string_p and
1885    alt_match_null_string_p.  Returns true when the opcode at *P can match
1886    the empty string and then stores the advanced scan position in *P; on a
1887    false return *P is left unchanged.  END is only passed on for groups.  */
1888
1889 sal_Bool
1890 Regexpr::common_op_match_null_string_p(sal_Unicode **p, sal_Unicode *end, register_info_type *reg_info)
1891 {
1892   sal_Int32 mcnt;
1893   sal_Bool ret;
1894   sal_Int32 reg_no;
1895   sal_Unicode *p1 = *p;   /* local cursor; written back to *P only on success */
1896
1897   switch ((re_opcode_t) *p1++) {   /* p1 now points just past the opcode */
1898   case no_op:
1899   case begline:
1900   case endline:
1901   case begbuf:
1902   case endbuf:
1903     break;   /* accepted as-is; nothing further is skipped */
1904
1905   case start_memory:
1906     reg_no = *p1;   /* register number of the nested group */
1907     assert (reg_no > 0 && reg_no <= MAX_REGNUM);
1908     ret = group_match_null_string_p(&p1, end, reg_info);   /* moves p1 when it returns true */
1909     /* Have to set this here in case we're checking a group which
1910        contains a group and a back reference to it.  */
1911
1912     if (REG_MATCH_NULL_STRING_P(reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
1913       REG_MATCH_NULL_STRING_P(reg_info[reg_no]) = ret;   /* record the answer only once */
1914
1915     if (!ret)
1916       return false;
1917     break;
1918
1919     /* If this is an optimized succeed_n for zero times, make the jump.  */
1920   case jump:
1921     extract_number_and_incr(mcnt, p1);
1922     if (mcnt >= 0)
1923       p1 += mcnt;
1924     else
1925       return false;   /* a backward jump is reported as not nullable */
1926     break;
1927
1928   case succeed_n:
1929     /* Get to the number of times to succeed.  */
1930     p1 += 2;
1931     extract_number_and_incr(mcnt, p1);
1932
1933     if (mcnt == 0)
1934       {
1935     p1 -= 4;   /* presumably back to the first number of the succeed_n -- confirm */
1936     extract_number_and_incr(mcnt, p1);
1937     p1 += mcnt;
1938       }
1939     else
1940       return false;   /* a non-zero count is treated as unable to match empty */
1941     break;
1942
1943   case duplicate:
1944     if (!REG_MATCH_NULL_STRING_P(reg_info[*p1]))   /* flag of the referenced register */
1945       return false;
1946     break;   /* NOTE(review): p1 is not advanced past the register number here */
1947
1948   case set_number_at:
1949     p1 += 4;   /* has no visible effect: p1 is local and we return false below */
1950     /* falls through to the default case */
1951   default:
1952     /* All other opcodes mean we cannot match the empty string.  */
1953     return false;
1954   }
1955
1956   *p = p1;   /* publish the new position only on success */
1957   return true;
1958 } /* common_op_match_null_string_p */
1959
1960
1961
1962 /* Free everything we malloc.  FREE_VAR frees one pointer via REGEX_FREE when it is non-null and then resets it to NULL; FREE_VARIABLES releases the fail stack and each register array listed below.  */
1963 #ifdef MATCH_MAY_ALLOCATE
1964 # define FREE_VAR(var) do { if (var) REGEX_FREE (var); (var) = NULL; } while (0) /* single statement, so it is safe under an unbraced if/else */
1965 # define FREE_VARIABLES()                       \
1966   do {                                  \
1967     REGEX_FREE_STACK (fail_stack.stack);                \
1968     FREE_VAR (regstart);                        \
1969     FREE_VAR (regend);                          \
1970     FREE_VAR (old_regstart);                        \
1971     FREE_VAR (old_regend);                      \
1972     FREE_VAR (best_regstart);                       \
1973     FREE_VAR (best_regend);                     \
1974     FREE_VAR (reg_info);                        \
1975     FREE_VAR (reg_dummy);                       \
1976     FREE_VAR (reg_info_dummy);                      \
1977   } while (0)
1978 #else
1979 # define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning. */
1980 #endif /* not MATCH_MAY_ALLOCATE */
1981
1982 /* This is a separate function so that we can force an alloca cleanup
1983    afterwards.  */
1984 sal_Int32
1985 Regexpr::re_match2(struct re_registers *regs, sal_Int32 pos, sal_Int32 range)
1986 {
1987   /* General temporaries.  */
1988   sal_Int32 mcnt;
1989   sal_Unicode *p1;
1990
1991   /* Just past the end of the corresponding string.  */
1992   sal_Unicode *end2;
1993
1994   /* Pointers into string2, just past the last characters in
1995        each to consider matching.  */
1996   sal_Unicode *end_match_2;
1997
1998   /* Where we are in the data, and the end of the current string.  */
1999   const sal_Unicode *d, *dend;
2000
2001   /* Where we are in the compiled pattern, and the end of the compiled
2002        pattern.  */
2003   sal_Unicode *p = bufp->buffer;
2004   register sal_Unicode *pend = p + bufp->used;
2005
2006     /* Mark the opcode just after a start_memory, so we can test for an
2007        empty subpattern when we get to the stop_memory.  */
2008   sal_Unicode *just_past_start_mem = 0;
2009
2010   /* Failure point stack.  Each place that can handle a failure further
2011      down the line pushes a failure point on this stack.  It consists of
2012      restart, regend, and reg_info for all registers corresponding to
2013      the subexpressions we're currently inside, plus the number of such
2014      registers, and, finally, two sal_Unicode *'s.  The first
2015      sal_Unicode * is where to resume scanning the pattern; the second
2016      one is where to resume scanning the strings.  If the latter is
2017      zero, the failure point is a ``dummy''; if a failure happens and
2018      the failure point is a dummy, it gets discarded and the next
2019      one is tried.  */
2020 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
2021   fail_stack_type fail_stack;
2022 #endif
2023
2024   /* We fill all the registers internally, independent of what we
2025      return, for use in backreferences.  The number here includes
2026      an element for register zero.  */
2027   size_t num_regs = bufp->re_nsub + 1;
2028
2029   /* The currently active registers.  */
2030   sal_uInt32 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2031   sal_uInt32 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2032
2033   /* Information on the contents of registers. These are pointers into
2034      the input strings; they record just what was matched (on this
2035      attempt) by a subexpression part of the pattern, that is, the
2036      regnum-th regstart pointer points to where in the input we began
2037      matching and the regnum-th regend points to right after where we
2038      stopped matching the regnum-th subexpression.  (The zeroth register
2039      keeps track of what the whole pattern matches.)  */
2040 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2041   const sal_Unicode **regstart, **regend;
2042 #endif
2043
2044   /* If a group that's operated upon by a repetition operator fails to
2045      match anything, then the register for its start will need to be
2046      restored because it will have been set to wherever in the string we
2047      are when we last see its open-group operator.  Similarly for a
2048      register's end.  */
2049 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2050   const sal_Unicode **old_regstart, **old_regend;
2051 #endif
2052
2053   /* The is_active field of reg_info helps us keep track of which (possibly
2054      nested) subexpressions we are currently in. The matched_something
2055      field of reg_info[reg_num] helps us tell whether or not we have
2056      matched any of the pattern so far this time through the reg_num-th
2057      subexpression.  These two fields get reset each time through any
2058      loop their register is in.  */
2059 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
2060   register_info_type *reg_info;
2061 #endif
2062
2063   /* The following record the register info as found in the above
2064      variables when we find a match better than any we've seen before.
2065      This happens as we backtrack through the failure points, which in
2066      turn happens only if we have not yet matched the entire string. */
2067   //unsigned best_regs_set = false;
2068 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2069   const sal_Unicode **best_regstart, **best_regend;
2070 #endif
2071
2072   /* Logically, this is `best_regend[0]'.  But we don't want to have to
2073      allocate space for that if we're not allocating space for anything
2074      else (see below).  Also, we never need info about register 0 for
2075      any of the other register vectors, and it seems rather a kludge to
2076      treat `best_regend' differently than the rest.  So we keep track of
2077      the end of the best match so far in a separate variable.  We
2078      initialize this to NULL so that when we backtrack the first time
2079      and need to test it, it's not garbage.  */
2080   //const sal_Unicode *match_end = NULL;
2081
2082   /* This helps SET_REGS_MATCHED avoid doing redundant work.  */
2083   sal_Int32 set_regs_matched_done = 0;
2084
2085   /* Used when we pop values we don't care about.  */
2086 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2087   const sal_Unicode **reg_dummy;
2088   register_info_type *reg_info_dummy;
2089 #endif
2090
2091   INIT_FAIL_STACK();
2092
2093 #ifdef MATCH_MAY_ALLOCATE
2094   /* Do not bother to initialize all the register variables if there are
2095      no groups in the pattern, as it takes a fair amount of time.  If
2096      there are groups, we include space for register 0 (the whole
2097      pattern), even though we never use it, since it simplifies the
2098      array indexing.  We should fix this.  */
2099   if (bufp->re_nsub)
2100     {
2101       regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2102       regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2103       old_regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2104       old_regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2105       best_regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2106       best_regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2107       reg_info = REGEX_TALLOC (num_regs, register_info_type);
2108       reg_dummy = REGEX_TALLOC (num_regs, const sal_Unicode *);
2109       reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
2110
2111       if (!(regstart && regend && old_regstart && old_regend && reg_info
2112             && best_regstart && best_regend && reg_dummy && reg_info_dummy))
2113         {
2114           FREE_VARIABLES ();
2115           return -2;
2116         }
2117     }
2118   else
2119     {
2120       /* We must initialize all our variables to NULL, so that
2121          `FREE_VARIABLES' doesn't try to free them.  */
2122       regstart = regend = old_regstart = old_regend = best_regstart
2123         = best_regend = reg_dummy = NULL;
2124       reg_info = reg_info_dummy = (register_info_type *) NULL;
2125     }
2126 #endif /* MATCH_MAY_ALLOCATE */
2127
2128   sal_Unicode *string2 = (sal_Unicode *)line;
2129   sal_Int32 size2 = linelen;
2130   sal_Int32 stop = range;
2131
2132   /* The starting position is bogus.  */
2133   if (pos < 0 || pos >= size2 || linelen <= 0 ) {
2134       FREE_VARIABLES ();
2135       return(-1);
2136   }
2137
2138   /* Initialize subexpression text positions to -1 to mark ones that no
2139      start_memory/stop_memory has been seen for. Also initialize the
2140      register information struct.  */
2141   for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) {
2142     regstart[mcnt] = regend[mcnt]
2143       = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
2144
2145     REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
2146     IS_ACTIVE (reg_info[mcnt]) = 0;
2147     MATCHED_SOMETHING (reg_info[mcnt]) = 0;
2148     EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
2149   }
2150
2151   end2 = (sal_Unicode *)(string2 + size2);
2152
2153   end_match_2 = (sal_Unicode *)(string2 + stop);
2154
2155   /* `p' scans through the pattern as `d' scans through the data.
2156      `dend' is the end of the input string that `d' points within.  `d'
2157      is advanced into the following input string whenever necessary, but
2158      this happens before fetching; therefore, at the beginning of the
2159      loop, `d' can be pointing at the end of a string, but it cannot
2160      equal `string2'.  */
2161   d = string2 + pos;
2162   dend = end_match_2;
2163
2164     /* This loops over pattern commands.  It exits by returning from the
2165        function if the match is complete, or it drops through if the match
2166        fails at this starting point in the input data.  */
2167   for (;;) {
2168     if (p == pend) {
2169       /* End of pattern means we might have succeeded.  */
2170
2171       /* If we haven't matched the entire string, and we want the
2172      longest match, try backtracking.  */
2173       if (d != end_match_2) {
2174     if (!FAIL_STACK_EMPTY()) {
2175       goto fail;
2176     }
2177       } /* d != end_match_2 */
2178
2179     succeed_label:
2180
2181       /* If caller wants register contents data back, do it.  */
2182       if (regs) {
2183     /* Have the register data arrays been allocated?  */
2184     if (regs->num_regs == 0) {
2185       /* No.  So allocate them with malloc.  We need one
2186          extra element beyond `num_regs' for the `-1' marker
2187          GNU code uses.  */
2188       regs->num_of_match = 0;
2189       regs->num_regs = MAX(RE_NREGS, num_regs + 1);
2190       regs->start = (sal_Int32 *) malloc(regs->num_regs * sizeof(sal_Int32));
2191       regs->end = (sal_Int32 *) malloc(regs->num_regs * sizeof(sal_Int32));
2192       if (regs->start == NULL || regs->end == NULL) {
2193         FREE_VARIABLES ();
2194         return(-2);
2195       }
2196     } else if ( regs->num_regs > 0 ) {
2197       /* Yes.  If we need more elements than were already
2198          allocated, reallocate them.  If we need fewer, just
2199          leave it alone.  */
2200       if (regs->num_regs < num_regs + 1) {
2201         regs->num_regs = num_regs + 1;
2202         regs->start = (sal_Int32 *) realloc(regs->start, regs->num_regs * sizeof(sal_Int32));
2203         regs->end = (sal_Int32 *) realloc(regs->end, regs->num_regs * sizeof(sal_Int32));
2204         if (regs->start == NULL || regs->end == NULL) {
2205           FREE_VARIABLES ();
2206           return(-2);
2207         }
2208       }
2209     } else {    // num_regs is negative
2210       FREE_VARIABLES ();
2211       return(-2);
2212     }
2213
2214     /* Convert the pointer data in `regstart' and `regend' to
2215        indices.  Register zero has to be set differently,
2216        since we haven't kept track of any info for it.  */
2217     if (regs->num_regs > 0) {
2218       // Make sure a valid location
2219       sal_Int32 dpos = d - string2;
2220       if (pos == dpos || (d - 1) >= dend ) {
2221         FREE_VARIABLES ();
2222         return(-1);
2223       }
2224       regs->start[regs->num_of_match] = pos;
2225       regs->end[regs->num_of_match] = ((sal_Int32) (d - string2));
2226       regs->num_of_match++;
2227     }
2228
2229     /* Go through the first `min (num_regs, regs->num_regs)'
2230        registers, since that is all we initialized.  */
2231         for (mcnt = regs->num_of_match; (unsigned) mcnt < MIN(num_regs, regs->num_regs);
2232          mcnt++) {
2233       regs->start[mcnt] = regs->end[mcnt] = -1;
2234       if( !(REG_UNSET(regstart[mcnt]) || REG_UNSET(regend[mcnt])) ) {
2235         regs->start[regs->num_of_match] = (sal_Int32) POINTER_TO_OFFSET(regstart[mcnt]);
2236         regs->end[regs->num_of_match] = (sal_Int32) POINTER_TO_OFFSET(regend[mcnt]);
2237             regs->num_of_match++;
2238       }
2239     }
2240
2241     /* If the regs structure we return has more elements than
2242        were in the pattern, set the extra elements to -1.  If
2243        we (re)allocated the registers, this is the case,
2244        because we always allocate enough to have at least one
2245        -1 at the end.  */
2246     for (mcnt = regs->num_of_match; (unsigned) mcnt < regs->num_regs; mcnt++)
2247       regs->start[mcnt] = regs->end[mcnt] = -1;
2248       } /* regs */
2249
2250       mcnt = d - pos - string2;
2251
2252       FREE_VARIABLES ();
2253       return(0);
2254     }
2255     /* Otherwise match next pattern command.  */
2256     switch ((re_opcode_t) *p++) {
2257       /* Ignore these.  Used to ignore the n of succeed_n's which
2258      currently have n == 0.  */
2259     case no_op:
2260       break;
2261
2262     case succeed:
2263       goto succeed_label;
2264
2265       /* Match the next n pattern characters exactly.  The following
2266      byte in the pattern defines n, and the n bytes after that
2267      are the characters to match.  */
2268     case exactn:
2269       mcnt = *p++;
2270
2271       do {
2272     PREFETCH();
2273     if ((sal_Unicode)*d++ != (sal_Unicode) *p++) goto fail;
2274       } while (--mcnt);
2275       SET_REGS_MATCHED();
2276       break;
2277
2278       /* Match any character except possibly a newline or a null.  */
2279     case anychar:
2280
2281       PREFETCH();
2282       if ( *d == (sal_Unicode)'\n' ||
2283        *d == (sal_Unicode)'\000' )
2284     goto fail;
2285
2286       SET_REGS_MATCHED();
2287       d++;
2288       break;
2289
2290     case charset:
2291     case charset_not: {
2292       register sal_Unicode c;
2293       sal_Bool knot = (re_opcode_t) *(p - 1) == charset_not;
2294
2295       PREFETCH();
2296       c = *d; /* The character to match.  */
2297       /* Cast to `sal_uInt32' instead of `sal_Unicode' in case the
2298      bit list is a full 32 bytes long.  */
2299       if ((c < (sal_uInt32) (*p * BYTEWIDTH)) && (p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))))
2300     knot = !knot;
2301
2302       p += 1 + *p;
2303
2304       if (!knot) {
2305     goto fail;
2306       }
2307
2308       SET_REGS_MATCHED();
2309       d++;
2310       break;
2311     }
2312
2313     /* The beginning of a group is represented by start_memory.
2314        The arguments are the register number in the next byte, and the
2315        number of groups inner to this one in the next.  The text
2316        matched within the group is recorded (in the internal
2317        registers data structure) under the register number.  */
2318     case start_memory:
2319
2320       /* Find out if this group can match the empty string.  */
2321       p1 = p;       /* To send to group_match_null_string_p.  */
2322
2323       if (REG_MATCH_NULL_STRING_P(reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
2324     REG_MATCH_NULL_STRING_P(reg_info[*p]) = group_match_null_string_p(&p1, pend, reg_info);
2325
2326       /* Save the position in the string where we were the last time
2327      we were at this open-group operator in case the group is
2328      operated upon by a repetition operator, e.g., with `(a*)*b'
2329      against `ab'; then we want to ignore where we are now in
2330      the string in case this attempt to match fails.  */
2331       old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
2332     ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
2333     : regstart[*p];
2334
2335       regstart[*p] = d;
2336
2337       IS_ACTIVE (reg_info[*p]) = 1;
2338       MATCHED_SOMETHING(reg_info[*p]) = 0;
2339
2340       /* Clear this whenever we change the register activity status.  */
2341       set_regs_matched_done = 0;
2342
2343       /* This is the new highest active register.  */
2344       highest_active_reg = *p;
2345
2346       /* If nothing was active before, this is the new lowest active
2347      register.  */
2348       if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
2349     lowest_active_reg = *p;
2350
2351       /* Move past the register number and inner group count.  */
2352       p += 2;
2353       just_past_start_mem = p;
2354
2355       break;
2356
2357       /* The stop_memory opcode represents the end of a group.  Its
2358      arguments are the same as start_memory's: the register
2359      number, and the number of inner groups.  */
2360     case stop_memory:
2361
2362       /* We need to save the string position the last time we were at
2363      this close-group operator in case the group is operated
2364      upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
2365      against `aba'; then we want to ignore where we are now in
2366      the string in case this attempt to match fails.  */
2367       old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
2368     ? REG_UNSET(regend[*p]) ? d : regend[*p]
2369     : regend[*p];
2370
2371       regend[*p] = d;
2372
2373       /* This register isn't active anymore.  */
2374       IS_ACTIVE(reg_info[*p]) = 0;
2375
2376       /* Clear this whenever we change the register activity status.  */
2377       set_regs_matched_done = 0;
2378
2379       /* If this was the only register active, nothing is active
2380      anymore.  */
2381       if (lowest_active_reg == highest_active_reg) {
2382     lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2383     highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2384       } else { /* We must scan for the new highest active register, since
2385           it isn't necessarily one less than now: consider
2386           (a(b)c(d(e)f)g).  When group 3 ends, after the f), the
2387           new highest active register is 1.  */
2388     sal_Unicode r = *p - 1;
2389     while (r > 0 && !IS_ACTIVE (reg_info[r]))
2390       r--;
2391
2392     /* If we end up at register zero, that means that we saved
2393        the registers as the result of an `on_failure_jump', not
2394        a `start_memory', and we jumped to past the innermost
2395        `stop_memory'.  For example, in ((.)*) we save
2396        registers 1 and 2 as a result of the *, but when we pop
2397        back to the second ), we are at the stop_memory 1.
2398        Thus, nothing is active.  */
2399     if (r == 0) {
2400       lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2401       highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2402     } else
2403       highest_active_reg = r;
2404       }
2405
2406       /* If just failed to match something this time around with a
2407      group that's operated on by a repetition operator, try to
2408      force exit from the ``loop'', and restore the register
2409      information for this group that we had before trying this
2410      last match.  */
2411       if ((!MATCHED_SOMETHING (reg_info[*p])
2412        || just_past_start_mem == p - 1)
2413       && (p + 2) < pend) {
2414     sal_Bool is_a_jump_n = false;
2415
2416     p1 = p + 2;
2417     mcnt = 0;
2418     switch ((re_opcode_t) *p1++) {
2419     case jump_n:
2420       is_a_jump_n = true;
2421     case pop_failure_jump:
2422     case maybe_pop_jump:
2423     case jump:
2424     case dummy_failure_jump:
2425       extract_number_and_incr(mcnt, p1);
2426       if (is_a_jump_n)
2427         p1 += 2;
2428       break;
2429
2430     default:
2431       /* do nothing */ ;
2432     }
2433     p1 += mcnt;
2434
2435     /* If the next operation is a jump backwards in the pattern
2436        to an on_failure_jump right before the start_memory
2437        corresponding to this stop_memory, exit from the loop
2438        by forcing a failure after pushing on the stack the
2439        on_failure_jump's jump in the pattern, and d.  */
2440     if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
2441         && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) {
2442       /* If this group ever matched anything, then restore
2443          what its registers were before trying this last
2444          failed match, e.g., with `(a*)*b' against `ab' for
2445          regstart[1], and, e.g., with `((a*)*(b*)*)*'
2446          against `aba' for regend[3].
2447
2448          Also restore the registers for inner groups for,
2449          e.g., `((a*)(b*))*' against `aba' (register 3 would
2450          otherwise get trashed).  */
2451
2452       if (EVER_MATCHED_SOMETHING (reg_info[*p])) {
2453         unsigned r;
2454
2455         EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
2456
2457         /* Restore this and inner groups' (if any) registers.  */
2458         for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
2459          r++) {
2460           regstart[r] = old_regstart[r];
2461
2462                 /* xx why this test?  */
2463           if (old_regend[r] >= regstart[r])
2464         regend[r] = old_regend[r];
2465         }
2466       }
2467       p1++;
2468       extract_number_and_incr(mcnt, p1);
2469       PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
2470
2471       goto fail;
2472     }
2473       }
2474
2475       /* Move past the register number and the inner group count.  */
2476       p += 2;
2477       break;
2478
2479
2480       /* \<digit> has been turned into a `duplicate' command which is
2481      followed by the numeric value of <digit> as the register number.  */
2482     case duplicate:
2483       {
2484     register const sal_Unicode *d2, *dend2;
2485     sal_Unicode regno = *p++;   /* Get which register to match against.  */
2486
2487     /* Can't back reference a group which we've never matched.  */
2488     if (REG_UNSET(regstart[regno]) || REG_UNSET(regend[regno])) {
2489       goto fail;
2490     }
2491
2492     /* Where in input to try to start matching.  */
2493     d2 = regstart[regno];
2494
2495     /* Where to stop matching; if both the place to start and
2496        the place to stop matching are in the same string, then
2497        set to the place to stop, otherwise, for now have to use
2498        the end of the first string.  */
2499
2500     dend2 = regend[regno];
2501     for (;;) {
2502       /* If necessary, advance to next segment in register
2503          contents.  */
2504       while (d2 == dend2) {
2505         if (dend2 == end_match_2) break;
2506         if (dend2 == regend[regno]) break;
2507       }
2508       /* At end of register contents => success */
2509       if (d2 == dend2) break;
2510
2511       PREFETCH();
2512
2513       /* How many characters left in this segment to match.  */
2514       mcnt = dend - d;
2515
2516       /* Want how many consecutive characters we can match in
2517          one shot, so, if necessary, adjust the count.  */
2518       if (mcnt > dend2 - d2)
2519         mcnt = dend2 - d2;
2520
2521       /* Compare that many; failure if mismatch, else move
2522          past them.  */
2523       if (translate
2524           ? bcmp_translate(d, d2, mcnt)
2525           : memcmp(d, d2, mcnt * sizeof(sal_Unicode))) {
2526         goto fail;
2527       }
2528       d += mcnt, d2 += mcnt;
2529       /* Do this because we've match some characters.  */
2530       SET_REGS_MATCHED();
2531     }
2532       }
2533       break;
2534
2535       /* begline matches the empty string at the beginning of the string
2536      (unless `not_bol' is set in `bufp'), and, if
2537      `newline_anchor' is set, after newlines.  */
2538     case begline:
2539
2540       if (AT_STRINGS_BEG (d)) {
2541     if (!bufp->not_bol) break;
2542       } else if (d[-1] == '\n' && bufp->newline_anchor) {
2543     break;
2544       }
2545       /* In all other cases, we fail.  */
2546       goto fail;
2547
2548       /* endline is the dual of begline.  */
2549     case endline:
2550
2551       if (AT_STRINGS_END(d))    {
2552     if (!bufp->not_eol) break;
2553       } else if (*d == '\n' && bufp->newline_anchor) {
2554     break;
2555       }
2556       goto fail;
2557
2558       /* Match at the very beginning of the data.  */
2559     case begbuf:
2560       if (AT_STRINGS_BEG (d))
2561     break;
2562       goto fail;
2563
2564
2565       /* Match at the very end of the data.  */
2566     case endbuf:
2567       if (AT_STRINGS_END (d))
2568     break;
2569       goto fail;
2570
2571
2572       /* on_failure_keep_string_jump is used to optimize `.*\n'.  It
2573      pushes NULL as the value for the string on the stack.  Then
2574      `pop_failure_point' will keep the current value for the
2575      string, instead of restoring it.  To see why, consider
2576      matching `foo\nbar' against `.*\n'.  The .* matches the foo;
2577      then the . fails against the \n.  But the next thing we want
2578      to do is match the \n against the \n; if we restored the
2579      string value, we would be back at the foo.
2580
2581      Because this is used only in specific cases, we don't need to
2582      check all the things that `on_failure_jump' does, to make
2583      sure the right things get saved on the stack.  Hence we don't
2584      share its code.  The only reason to push anything on the
2585      stack at all is that otherwise we would have to change
2586      `anychar's code to do something besides goto fail in this
2587      case; that seems worse than this.  */
2588     case on_failure_keep_string_jump:
2589
2590       extract_number_and_incr(mcnt, p);
2591
2592       PUSH_FAILURE_POINT(p + mcnt, NULL, -2);
2593       break;
2594
2595
2596       /* Uses of on_failure_jump:
2597
2598      Each alternative starts with an on_failure_jump that points
2599      to the beginning of the next alternative.  Each alternative
2600      except the last ends with a jump that in effect jumps past
2601      the rest of the alternatives.  (They really jump to the
2602      ending jump of the following alternative, because tensioning
2603      these jumps is a hassle.)
2604
2605      Repeats start with an on_failure_jump that points past both
2606      the repetition text and either the following jump or
2607      pop_failure_jump back to this on_failure_jump.  */
2608     case on_failure_jump:
2609     on_failure:
2610
2611     extract_number_and_incr(mcnt, p);
2612
2613     /* If this on_failure_jump comes right before a group (i.e.,
2614        the original * applied to a group), save the information
2615        for that group and all inner ones, so that if we fail back
2616        to this point, the group's information will be correct.
2617        For example, in \(a*\)*\1, we need the preceding group,
2618        and in \(zz\(a*\)b*\)\2, we need the inner group.  */
2619
2620     /* We can't use `p' to check ahead because we push
2621        a failure point to `p + mcnt' after we do this.  */
2622     p1 = p;
2623
2624     /* We need to skip no_op's before we look for the
2625        start_memory in case this on_failure_jump is happening as
2626        the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
2627        against aba.  */
2628     while (p1 < pend && (re_opcode_t) *p1 == no_op)
2629       p1++;
2630
2631     if (p1 < pend && (re_opcode_t) *p1 == start_memory) {
2632       /* We have a new highest active register now.  This will
2633      get reset at the start_memory we are about to get to,
2634      but we will have saved all the registers relevant to
2635      this repetition op, as described above.  */
2636       highest_active_reg = *(p1 + 1) + *(p1 + 2);
2637       if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
2638     lowest_active_reg = *(p1 + 1);
2639     }
2640
2641     PUSH_FAILURE_POINT(p + mcnt, d, -2);
2642     break;
2643
2644     /* A smart repeat ends with `maybe_pop_jump'.
2645        We change it to either `pop_failure_jump' or `jump'.  */
2646     case maybe_pop_jump:
2647       extract_number_and_incr(mcnt, p);
2648       {
2649     register sal_Unicode *p2 = p;
2650
2651     /* Compare the beginning of the repeat with what in the
2652        pattern follows its end. If we can establish that there
2653        is nothing that they would both match, i.e., that we
2654        would have to backtrack because of (as in, e.g., `a*a')
2655        then we can change to pop_failure_jump, because we'll
2656        never have to backtrack.
2657
2658        This is not true in the case of alternatives: in
2659        `(a|ab)*' we do need to backtrack to the `ab' alternative
2660        (e.g., if the string was `ab').  But instead of trying to
2661        detect that here, the alternative has put on a dummy
2662        failure point which is what we will end up popping.  */
2663
2664     /* Skip over open/close-group commands.
2665        If what follows this loop is a ...+ construct,
2666        look at what begins its body, since we will have to
2667        match at least one of that.  */
2668     while (1) {
2669       if (p2 + 2 < pend
2670           && ((re_opcode_t) *p2 == stop_memory
2671           || (re_opcode_t) *p2 == start_memory))
2672         p2 += 3;
2673       else if (p2 + 6 < pend
2674            && (re_opcode_t) *p2 == dummy_failure_jump)
2675         p2 += 6;
2676       else
2677         break;
2678     }
2679
2680     p1 = p + mcnt;
2681     /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
2682        to the `maybe_finalize_jump' of this case.  Examine what
2683        follows.  */
2684
2685     /* If we're at the end of the pattern, we can change.  */
2686     if (p2 == pend) {
2687                 /* Consider what happens when matching ":\(.*\)"
2688                    against ":/".  I don't really understand this code
2689                    yet.  */
2690       p[-3] = (sal_Unicode) pop_failure_jump;
2691     } else if ((re_opcode_t) *p2 == exactn
2692            || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) {
2693       register sal_Unicode c = *p2 == (sal_Unicode) endline ? (sal_Unicode)'\n' : p2[2];
2694
2695       if ((re_opcode_t) p1[3] == exactn && p1[5] != c) {
2696         p[-3] = (sal_Unicode) pop_failure_jump;
2697       } else if ((re_opcode_t) p1[3] == charset
2698              || (re_opcode_t) p1[3] == charset_not) {
2699         sal_Int32 knot = (re_opcode_t) p1[3] == charset_not;
2700
2701         if (c < (sal_Unicode) (p1[4] * BYTEWIDTH)
2702         && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2703           knot = !knot;
2704
2705         /* `not' is equal to 1 if c would match, which means
2706            that we can't change to pop_failure_jump.  */
2707         if (!knot) {
2708           p[-3] = (unsigned char) pop_failure_jump;
2709         }
2710       }
2711     } else if ((re_opcode_t) *p2 == charset) {
2712                 /* We win if the first character of the loop is not part
2713                    of the charset.  */
2714       if ((re_opcode_t) p1[3] == exactn
2715           && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
2716             && (p2[2 + p1[5] / BYTEWIDTH]
2717             & (1 << (p1[5] % BYTEWIDTH))))) {
2718         p[-3] = (sal_Unicode) pop_failure_jump;
2719       } else if ((re_opcode_t) p1[3] == charset_not) {
2720         sal_Int32 idx;
2721         /* We win if the charset_not inside the loop
2722            lists every character listed in the charset after.  */
2723         for (idx = 0; idx < (int) p2[1]; idx++)
2724           if (! (p2[2 + idx] == 0
2725              || (idx < (int) p1[4]
2726              && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
2727         break;
2728
2729         if (idx == p2[1]) {
2730           p[-3] = (sal_Unicode) pop_failure_jump;
2731         }
2732       } else if ((re_opcode_t) p1[3] == charset) {
2733         sal_Int32 idx;
2734         /* We win if the charset inside the loop
2735            has no overlap with the one after the loop.  */
2736         for (idx = 0;
2737          idx < (sal_Int32) p2[1] && idx < (sal_Int32) p1[4];
2738          idx++)
2739           if ((p2[2 + idx] & p1[5 + idx]) != 0)
2740         break;
2741
2742         if (idx == p2[1] || idx == p1[4]) {
2743           p[-3] = (sal_Unicode) pop_failure_jump;
2744         }
2745       }
2746     }
2747       }
2748       p -= 2;       /* Point at relative address again.  */
2749       if ((re_opcode_t) p[-1] != pop_failure_jump) {
2750     p[-1] = (sal_Unicode) jump;
2751     goto unconditional_jump;
2752       }
2753       /* Note fall through.  */
2754
2755
2756       /* The end of a simple repeat has a pop_failure_jump back to
2757      its matching on_failure_jump, where the latter will push a
2758      failure point.  The pop_failure_jump takes off failure
2759      points put on by this pop_failure_jump's matching
2760      on_failure_jump; we got through the pattern to here from the
2761      matching on_failure_jump, so didn't fail.  */
2762     case pop_failure_jump:
2763       {
2764     /* We need to pass separate storage for the lowest and
2765        highest registers, even though we don't care about the
2766        actual values.  Otherwise, we will restore only one
2767        register from the stack, since lowest will == highest in
2768        `pop_failure_point'.  */
2769     sal_uInt32 dummy_low_reg, dummy_high_reg;
2770     sal_Unicode *pdummy = NULL;
2771     const sal_Unicode *sdummy = NULL;
2772
2773     POP_FAILURE_POINT(sdummy, pdummy,
2774               dummy_low_reg, dummy_high_reg,
2775               reg_dummy, reg_dummy, reg_info_dummy);
2776       }
2777       /* Note fall through.  */
2778
2779     unconditional_jump:
2780     /* Note fall through.  */
2781
2782     /* Unconditionally jump (without popping any failure points).  */
2783     case jump:
2784       extract_number_and_incr(mcnt, p); /* Get the amount to jump.  */
2785       p += mcnt;                /* Do the jump.  */
2786       break;
2787
2788       /* We need this opcode so we can detect where alternatives end
2789      in `group_match_null_string_p' et al.  */
2790     case jump_past_alt:
2791       goto unconditional_jump;
2792
2793
2794       /* Normally, the on_failure_jump pushes a failure point, which
2795      then gets popped at pop_failure_jump.  We will end up at
2796      pop_failure_jump, also, and with a pattern of, say, `a+', we
2797      are skipping over the on_failure_jump, so we have to push
2798      something meaningless for pop_failure_jump to pop.  */
2799     case dummy_failure_jump:
2800       /* It doesn't matter what we push for the string here.  What
2801      the code at `fail' tests is the value for the pattern.  */
2802       PUSH_FAILURE_POINT(NULL, NULL, -2);
2803       goto unconditional_jump;
2804
2805
2806       /* At the end of an alternative, we need to push a dummy failure
2807      point in case we are followed by a `pop_failure_jump', because
2808      we don't want the failure point for the alternative to be
2809      popped.  For example, matching `(a|ab)*' against `aab'
2810      requires that we match the `ab' alternative.  */
2811     case push_dummy_failure:
2812       /* See comments just above at `dummy_failure_jump' about the
2813      two zeroes.  */
2814       PUSH_FAILURE_POINT(NULL, NULL, -2);
2815       break;
2816
2817       /* Have to succeed matching what follows at least n times.
2818      After that, handle like `on_failure_jump'.  */
2819     case succeed_n:
2820       extract_number(mcnt, p + 2);
2821
2822       assert (mcnt >= 0);
2823       /* Originally, this is how many times we HAVE to succeed.  */
2824       if (mcnt > 0) {
2825     mcnt--;
2826     p += 2;
2827     store_number_and_incr (p, mcnt);
2828       } else if (mcnt == 0) {
2829     p[2] = (sal_Unicode) no_op;
2830     p[3] = (sal_Unicode) no_op;
2831     goto on_failure;
2832       }
2833       break;
2834
2835     case jump_n:
2836       extract_number(mcnt, p + 2);
2837
2838       /* Originally, this is how many times we CAN jump.  */
2839       if (mcnt) {
2840     mcnt--;
2841     store_number (p + 2, mcnt);
2842     goto unconditional_jump;
2843       }
2844       /* If don't have to jump any more, skip over the rest of command.  */
2845       else
2846     p += 4;
2847       break;
2848
2849     case set_number_at:
2850       {
2851
2852     extract_number_and_incr(mcnt, p);
2853     p1 = p + mcnt;
2854     extract_number_and_incr(mcnt, p);
2855     store_number (p1, mcnt);
2856     break;
2857       }
2858
2859     case wordbeg:
2860       if (iswordbegin(d, string2, size2))
2861     break;
2862       goto fail;
2863
2864     case wordend:
2865       if (iswordend(d, string2, size2))
2866     break;
2867       goto fail;
2868
2869
2870     default:
2871       abort();
2872     }
2873     continue;  /* Successfully executed one pattern command; keep going.  */
2874
2875     /* We goto here if a matching operation fails. */
2876   fail:
2877     if (!FAIL_STACK_EMPTY()) {
2878       /* A restart point is known.  Restore to that state.  */
2879       POP_FAILURE_POINT(d, p,
2880             lowest_active_reg, highest_active_reg,
2881             regstart, regend, reg_info);
2882
2883       /* If this failure point is a dummy, try the next one.  */
2884       if (!p)
2885     goto fail;
2886
2887       /* If we failed to the end of the pattern, don't examine *p.  */
2888       assert(p <= pend);
2889       if (p < pend) {
2890     sal_Bool is_a_jump_n = false;
2891
2892     /* If failed to a backwards jump that's part of a repetition
2893        loop, need to pop this failure point and use the next
2894        one.  */
2895     switch ((re_opcode_t) *p) {
2896     case jump_n:
2897       is_a_jump_n = true;
2898     case maybe_pop_jump:
2899     case pop_failure_jump:
2900     case jump:
2901       p1 = p + 1;
2902       extract_number_and_incr(mcnt, p1);
2903       p1 += mcnt;
2904
2905       if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
2906           || (!is_a_jump_n
2907           && (re_opcode_t) *p1 == on_failure_jump)) {
2908         goto fail;
2909       }
2910       break;
2911     default:
2912       /* do nothing */ ;
2913     }
2914       }
2915
2916     } else {
2917       break;   /* Matching at this starting point really fails.  */
2918     }
2919   } /* for (;;) */
2920
2921   FREE_VARIABLES ();
2922
2923   return(-1);                   /* Failure to match.  */
2924 } /* re_match2 */
2925
2926 /* Set the bit for character C in a list.  */
2927 void
2928 Regexpr::set_list_bit(sal_Unicode c, sal_Unicode *b)
2929 {
2930   if ( translate ) {
2931     try {
2932         sal_Unicode tmp = translit->transliterateChar2Char(c);
2933         b[tmp / BYTEWIDTH] |= 1 << (tmp % BYTEWIDTH);
2934     } catch (::com::sun::star::i18n::MultipleCharsOutputException e) {
2935         ::rtl::OUString o2( translit->transliterateChar2String( c));
2936         sal_Int32 len2 = o2.getLength();
2937         const sal_Unicode * k2 = o2.getStr();
2938         for (sal_Int32 nmatch = 0; nmatch < len2; nmatch++) {
2939           b[k2[nmatch] / BYTEWIDTH] |= 1 << (k2[nmatch] % BYTEWIDTH);
2940         }
2941     }
2942   } else {
2943     b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
2944   }
2945 }
2946
2947 /* vim: set ts=8 sw=2 noexpandtab: */