regexp/source/reclass.cxx

   1 /*************************************************************************
   2  *
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * Copyright 2008 by Sun Microsystems, Inc.
   6  *
   7  * OpenOffice.org - a multi-platform office productivity suite
   8  *
   9  * $RCSfile: reclass.cxx,v $
  10  * $Revision: 1.7 $
  11  *
  12  * This file is part of OpenOffice.org.
  13  *
  14  * OpenOffice.org is free software: you can redistribute it and/or modify
  15  * it under the terms of the GNU Lesser General Public License version 3
  16  * only, as published by the Free Software Foundation.
  17  *
  18  * OpenOffice.org is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU Lesser General Public License version 3 for more details
  22  * (a copy is included in the LICENSE file that accompanied this code).
  23  *
  24  * You should have received a copy of the GNU Lesser General Public License
  25  * version 3 along with OpenOffice.org.  If not, see
  26  * <http://www.openoffice.org/license.html>
  27  * for a copy of the LGPLv3 License.
  28  *
  29  ************************************************************************/
  30
  31
  32 // MARKER(update_precomp.py): autogen include statement, do not remove
  33 #include "precompiled_regexp.hxx"
  34 /* Extended regular expression matching and search library,
  35    version 0.12.
  36    (Implements POSIX draft P1003.2/D11.2, except for some of the
  37    internationalization features.)
  38    Copyright (C) 1993, 94, 95, 96, 97, 98, 99 Free Software Foundation, Inc.
  39
  40    The GNU C Library is free software; you can redistribute it and/or
  41    modify it under the terms of the GNU Library General Public License as
  42    published by the Free Software Foundation; either version 2 of the
  43    License, or (at your option) any later version.
  44
  45    The GNU C Library is distributed in the hope that it will be useful,
  46    but WITHOUT ANY WARRANTY; without even the implied warranty of
  47    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  48    Library General Public License for more details.
  49
  50    You should have received a copy of the GNU Library General Public
  51    License along with the GNU C Library; see the file COPYING.LIB.  If not,
  52    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  53    Boston, MA 02111-1307, USA.  */
  54
  55 /*
  56     Modified for OpenOffice.org to use sal_Unicode and Transliteration service.
  57  */
  58
  59
  60 #if 0
/* If for any reason (porting, debugging) we can't use alloca(), use malloc()
   instead.  Use alloca() if possible for performance reasons; the difference
   _is_ significant: with malloc(), the re_match2() method makes heavy use of
   regexps through the TextSearch interface up to three times slower.  This
   is _the_ bottleneck in some spreadsheet documents.  */
  66 #define REGEX_MALLOC
  67 #endif
  68
  69 /* AIX requires this to be the first thing in the file. */
  70 #if defined _AIX && !defined REGEX_MALLOC
  71   #pragma alloca
  72 #endif
  73
  74 #include <string.h>
  75 #include <assert.h>
  76
  77 #include <rtl/ustring.hxx>
  78 #include <com/sun/star/i18n/TransliterationModules.hpp>
  79
  80 #include "reclass.hxx"
  81
  82
  83 /* Maximum number of duplicates an interval can allow.  Some systems
  84    (erroneously) define this in other header files, but we want our
  85    value, so remove any previous define.  */
  86 #ifdef RE_DUP_MAX
  87 # undef RE_DUP_MAX
  88 #endif
  89 /* If sizeof(int) == 2, then ((1 << 15) - 1) overflows.  */
  90 #define RE_DUP_MAX (0x7fff)
  91
  92
  93 /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
  94    `re_match_2' returns information about at least this many registers
  95    the first time a `regs' structure is passed.  */
  96 #ifndef RE_NREGS
  97 # define RE_NREGS 30
  98 #endif
  99
 100
 101 // Macros
 102 #define INIT_COMPILE_STACK_SIZE         32
 103 #define INIT_BUF_SIZE                   ((1 << BYTEWIDTH)/BYTEWIDTH)
 104 #define MAX_BUF_SIZE                    65535L
 105 #define NO_HIGHEST_ACTIVE_REG           (1 << BYTEWIDTH)
 106 #define NO_LOWEST_ACTIVE_REG            (NO_HIGHEST_ACTIVE_REG + 1)
 107
 108 /* Since we have one byte reserved for the register number argument to
 109    {start,stop}_memory, the maximum number of groups we can report
 110    things about is what fits in that byte.  */
 111 #define MAX_REGNUM 255
 112
 113 #define MIN(x, y) ( (x) < (y) ? (x) : (y) )
 114 #define MAX(x, y) ( (x) > (y) ? (x) : (y) )
 115
 116
 117 // Always. We're not in Emacs and don't use relocating allocators.
 118 #define MATCH_MAY_ALLOCATE
 119
 120 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
 121    use `alloca' instead of `malloc'.  This is because malloc is slower and
 122    causes storage fragmentation.  On the other hand, malloc is more portable,
 123    and easier to debug.
 124
 125    Because we sometimes use alloca, some routines have to be macros,
 126    not functions -- `alloca'-allocated space disappears at the end of the
 127    function it is called in.  */
 128
 129 #ifdef REGEX_MALLOC
 130
 131 # define REGEX_ALLOCATE malloc
 132 # define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
 133 # define REGEX_FREE free
 134
 135 #else /* not REGEX_MALLOC  */
 136
 137 /* Emacs already defines alloca, sometimes. So does MSDEV.  */
 138 # ifndef alloca
 139
 140 /* Make alloca work the best possible way.  */
 141 #  ifdef __GNUC__
 142 #   define alloca __builtin_alloca
 143 #  else /* not __GNUC__ */
 144 #   include <sal/alloca.h>
 145 #  endif /* not __GNUC__ */
 146
 147 # endif /* not alloca */
 148
 149 # define REGEX_ALLOCATE alloca
 150
 151 /* Assumes a `char *destination' variable.  */
 152 # define REGEX_REALLOCATE(source, osize, nsize)                         \
 153   (destination = (char *) alloca (nsize),                               \
 154    memcpy (destination, source, osize))
 155
 156 /* No need to do anything to free, after alloca.  */
 157 # define REGEX_FREE(arg) ((void)0) /* Do nothing!  But inhibit gcc warning.  */
 158
 159 #endif /* not REGEX_MALLOC */
 160
 161
 162 /* Define how to allocate the failure stack.  */
 163
 164 #ifdef REGEX_MALLOC
 165
 166 # define REGEX_ALLOCATE_STACK malloc
 167 # define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
 168 # define REGEX_FREE_STACK free
 169
 170 #else /* not REGEX_MALLOC */
 171
 172 # define REGEX_ALLOCATE_STACK alloca
 173
 174 # define REGEX_REALLOCATE_STACK(source, osize, nsize)                   \
 175    REGEX_REALLOCATE (source, osize, nsize)
 176 /* No need to explicitly free anything.  */
 177 # define REGEX_FREE_STACK(arg)
 178
 179 #endif /* not REGEX_MALLOC */
 180
 181
 182 /* (Re)Allocate N items of type T using malloc, or fail.  */
 183 #define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
 184 #define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
 185 #define RETALLOC_IF(addr, n, t) \
 186   if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
 187 #define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
 188
 189 #define BYTEWIDTH 16    /* In bits (assuming sizeof(sal_Unicode)*8) */
 190
 191
 192 #define CHAR_CLASS_MAX_LENGTH 256
 193
 194 /* Fetch the next character in the uncompiled pattern, with no
 195    translation.  */
 196 #define PATFETCH_RAW(c)                                                 \
 197     do {                                                                \
 198         if (p == pend) return REG_EEND;                         \
 199         c = (sal_Unicode) *p++;                                 \
 200     } while (0)
 201
 202 /* Go backwards one character in the pattern.  */
 203 #define PATUNFETCH p--
 204
 205 #define FREE_STACK_RETURN(value)                                        \
 206     return(free(compile_stack.stack), value)
 207
 208 #define GET_BUFFER_SPACE(n)                                             \
 209     while ((sal_uInt32)(b - bufp->buffer + (n)) > bufp->allocated)      \
 210         EXTEND_BUFFER()
 211
/* Extend the buffer to twice its current size via realloc and
   reset the pointers that pointed into the old block to point to the
   correct places in the new one.  The new size is capped at MAX_BUF_SIZE;
   if the buffer is already that large, return REG_ESIZE, and if realloc
   fails, return REG_ESPACE.  */
 216 #define EXTEND_BUFFER()                                                 \
 217   do {                                                                  \
 218     sal_Unicode *old_buffer = bufp->buffer;                           \
 219     if (bufp->allocated == MAX_BUF_SIZE)                                \
 220       return REG_ESIZE;                                                 \
 221     bufp->allocated <<= 1;                                              \
 222     if (bufp->allocated > MAX_BUF_SIZE)                                 \
 223       bufp->allocated = MAX_BUF_SIZE;                                   \
 224     bufp->buffer = (sal_Unicode *) realloc(bufp->buffer,                \
 225                        bufp->allocated *                \
 226                        sizeof(sal_Unicode));    \
 227     if (bufp->buffer == NULL)                                           \
 228       return REG_ESPACE;                                                \
 229     /* If the buffer moved, move all the pointers into it.  */          \
 230     if (old_buffer != bufp->buffer) {                                   \
 231         b = (b - old_buffer) + bufp->buffer;                            \
 232         begalt = (begalt - old_buffer) + bufp->buffer;                  \
 233         if (fixup_alt_jump)                                             \
 234           fixup_alt_jump = (fixup_alt_jump - old_buffer) + bufp->buffer;\
 235         if (laststart)                                                  \
 236           laststart = (laststart - old_buffer) + bufp->buffer;          \
 237         if (pending_exact)                                              \
 238           pending_exact = (pending_exact - old_buffer) + bufp->buffer;  \
 239       }                                                                 \
 240   } while (0)
 241
 242 #define BUF_PUSH(c)                                                     \
 243     do {                                                                \
 244         GET_BUFFER_SPACE(1);                                    \
 245         *b++ = (sal_Unicode)(c);                                \
 246     } while(0)
 247
 248 /* Ensure we have two more bytes of buffer space and then append C1 and C2.  */
 249 #define BUF_PUSH_2(c1, c2)                                              \
 250   do {                                                                  \
 251     GET_BUFFER_SPACE(2);                                                \
 252     *b++ = (sal_Unicode) (c1);                                          \
 253     *b++ = (sal_Unicode) (c2);                                          \
 254   } while (0)
 255
 256 /* As with BUF_PUSH_2, except for three bytes.  */
 257 #define BUF_PUSH_3(c1, c2, c3)                                          \
 258   do {                                                                  \
 259     GET_BUFFER_SPACE(3);                                                \
 260     *b++ = (sal_Unicode) (c1);                                          \
 261     *b++ = (sal_Unicode) (c2);                                          \
 262     *b++ = (sal_Unicode) (c3);                                          \
 263   } while (0)
 264
 265 /* Store a jump with opcode OP at LOC to location TO.  We store a
 266    relative address offset by the three bytes the jump itself occupies.  */
 267 #define STORE_JUMP(op, loc, to)                                         \
 268     store_op1(op, loc, (int) ((to) - (loc) - 3))
 269
 270 /* Likewise, for a two-argument jump.  */
 271 #define STORE_JUMP2(op, loc, to, arg)                                   \
 272     store_op2(op, loc, (int) ((to) - (loc) - 3), arg)
 273
 274 /* Store NUMBER in two contiguous sal_Unicode starting at DESTINATION.  */
 275
 276 inline
 277 void
 278 Regexpr::store_number( sal_Unicode * destination, sal_Int32 number )
 279 {
 280   (destination)[0] = sal_Unicode((number) & 0xffff);
 281   (destination)[1] = sal_Unicode((number) >> 16);
 282 }
 283
 284 /* Same as STORE_NUMBER, except increment DESTINATION to
 285    the byte after where the number is stored.  Therefore, DESTINATION
 286    must be an lvalue.  */
 287
 288 inline
 289 void
 290 Regexpr::store_number_and_incr( sal_Unicode *& destination, sal_Int32 number )
 291 {
 292   store_number( destination, number );
 293   (destination) += 2;
 294 }
 295
 296 /* Put into DESTINATION a number stored in two contiguous sal_Unicode starting
 297    at SOURCE.  */
 298
 299 inline void Regexpr::extract_number( sal_Int32 & dest, sal_Unicode *source )
 300 {
 301   dest = (((sal_Int32) source[1]) << 16) | (source[0] & 0xffff);
 302 }
 303
 304 /* Like `STORE_JUMP', but for inserting.  Assume `b' is the buffer end.  */
 305 #define INSERT_JUMP(op, loc, to)                                        \
 306     insert_op1(op, loc, (sal_Int32) ((to) - (loc) - 3), b)
 307
 308 /* Like `STORE_JUMP2', but for inserting.  Assume `b' is the buffer end.  */
 309 #define INSERT_JUMP2(op, loc, to, arg)                                  \
 310     insert_op2(op, loc, (sal_Int32) ((to) - (loc) - 3), arg, b)
 311
 312 #define STREQ(s1, s2) (rtl_ustr_compare((s1), (s2)) ? (0) : (1))
 313
 314 #define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
 315 #define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)
 316
 317 /* The next available element.  */
 318 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
 319
 320 /* Get the next unsigned number in the uncompiled pattern.  */
 321 #define GET_UNSIGNED_NUMBER(num) {                                      \
 322     if (p != pend) {                                                \
 323         PATFETCH_RAW(c);                                        \
 324         while (c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9') {        \
 325             if (num < 0)                                    \
 326                 num = 0;                                \
 327             num = num * 10 + c - (sal_Unicode)'0';              \
 328             if (p == pend)                                      \
 329                 break;                                  \
 330             PATFETCH_RAW(c);                            \
 331         }                                                       \
 332     }                                                               \
 333 }
 334
 335 /* Get the next hex number in the uncompiled pattern.  */
 336 #define GET_HEX_NUMBER(num) {                                           \
 337     if (p != pend) {                                                \
 338         sal_Bool stop = false;                                  \
 339         sal_Int16 hexcnt = 1;                                   \
 340         PATFETCH_RAW(c);                                        \
 341         while ( (c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9') || (c >= (sal_Unicode)'a' && c <= (sal_Unicode)'f') || (c >= (sal_Unicode)'A' && c <= (sal_Unicode)'F') ) {    \
 342             if (num < 0)                                    \
 343                 num = 0;                                \
 344             if ( c >= (sal_Unicode)'0' && c <= (sal_Unicode)'9' ) \
 345                 num = num * 16 + c - (sal_Unicode)'0';          \
 346             else if ( c >= (sal_Unicode)'a' && c <= (sal_Unicode)'f' ) \
 347                 num = num * 16 + (10 + c - (sal_Unicode)'a');           \
 348             else                                                \
 349                 num = num * 16 + (10 + c - (sal_Unicode)'A');           \
 350             if (p == pend || hexcnt == 4) {                     \
 351                 stop = true;                            \
 352                 break;                                  \
 353             }                                           \
 354             PATFETCH_RAW(c);                            \
 355             hexcnt++;                                   \
 356         }                                                       \
 357                                     \
 358         if ( ! stop ) {                                         \
 359             PATUNFETCH;                                 \
 360             hexcnt--;                                   \
 361         }                                                       \
 362         if ( hexcnt > 4 || (num < 0 || num > 0xffff) ) num = -1;\
 363     }                                                               \
 364 }
 365
 366
 367 /* Number of failure points for which to initially allocate space
 368    when matching.  If this number is exceeded, we allocate more
 369    space, so it is not a hard limit.  */
 370 #ifndef INIT_FAILURE_ALLOC
 371 # define INIT_FAILURE_ALLOC 5
 372 #endif
 373
 374 #define INIT_FAIL_STACK()                                               \
 375   do {                                                                  \
 376     fail_stack.stack = (fail_stack_elt_t *)                             \
 377       REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \
 378                                     \
 379     if (fail_stack.stack == NULL)                                       \
 380       return -2;                                                        \
 381                                     \
 382     fail_stack.size = INIT_FAILURE_ALLOC;                               \
 383     fail_stack.avail = 0;                                               \
 384   } while (0)
 385
 386 #define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
 387
 388 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
 389
 390    Return 1 if succeeds, and 0 if either ran out of memory
 391    allocating space for it or it was already too large.
 392
 393    REGEX_REALLOCATE_STACK requires `destination' be declared.   */
 394
 395 #define DOUBLE_FAIL_STACK(fail_stack)                                   \
 396   ((fail_stack).size > (sal_uInt32) (re_max_failures * MAX_FAILURE_ITEMS)       \
 397    ? 0                                                                  \
 398    : ((fail_stack).stack = (fail_stack_elt_t *)                         \
 399         REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
 400           (fail_stack).size * sizeof (fail_stack_elt_t),                \
 401           ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)),        \
 402                                     \
 403       (fail_stack).stack == NULL                                        \
 404       ? 0                                                               \
 405       : ((fail_stack).size <<= 1,                                       \
 406          1)))
 407
 408
 409 #define REG_UNSET_VALUE (&reg_unset_dummy)
 410 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
 411
 412 #define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
 413 #define IS_ACTIVE(R)  ((R).bits.is_active)
 414 #define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
 415 #define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)
 416
 417 /* Call this when have matched a real character; it sets `matched' flags
 418    for the subexpressions which we are currently inside.  Also records
 419    that those subexprs have matched.  */
 420 #define SET_REGS_MATCHED()                                              \
 421   do {                                                                  \
 422       if (!set_regs_matched_done) {                                     \
 423           sal_uInt32 r;                                                 \
 424           set_regs_matched_done = 1;                                    \
 425           for (r = lowest_active_reg; r <= highest_active_reg; r++) {   \
 426               MATCHED_SOMETHING(reg_info[r])                            \
 427                 = EVER_MATCHED_SOMETHING(reg_info[r])                   \
 428                 = 1;                                                    \
 429             }                                                           \
 430         }                                                               \
 431     }                                                                   \
 432   while (0)
 433
 434 #define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
 435
 436 /* This converts PTR, a pointer into the search string `string2' into an offset from the beginning of that string.  */
 437 #define POINTER_TO_OFFSET(ptr) ((sal_Int32) ((ptr) - string2))
 438
 439 /* This is the number of items that are pushed and popped on the stack
 440    for each register.  */
 441 #define NUM_REG_ITEMS  3
 442
 443 /* Individual items aside from the registers.  */
 444 # define NUM_NONREG_ITEMS 4
 445
 446 /* We push at most this many items on the stack.  */
 447 /* We used to use (num_regs - 1), which is the number of registers
 448    this regexp will save; but that was changed to 5
 449    to avoid stack overflow for a regexp with lots of parens.  */
 450 #define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)
 451
 452 /* We actually push this many items.  */
 453 #define NUM_FAILURE_ITEMS                               \
 454   (((0                                                  \
 455      ? 0 : highest_active_reg - lowest_active_reg + 1)  \
 456     * NUM_REG_ITEMS)                                    \
 457    + NUM_NONREG_ITEMS)
 458
 459 /* How many items can still be added to the stack without overflowing it.  */
 460 #define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)
 461
 462 /* Push a pointer value onto the failure stack.
 463    Assumes the variable `fail_stack'.  Probably should only
 464    be called from within `PUSH_FAILURE_POINT'.  */
 465 #define PUSH_FAILURE_POINTER(item)                                      \
 466   fail_stack.stack[fail_stack.avail++].pointer = (sal_Unicode *) (item)
 467
 468 /* This pushes an integer-valued item onto the failure stack.
 469    Assumes the variable `fail_stack'.  Probably should only
 470    be called from within `PUSH_FAILURE_POINT'.  */
 471 #define PUSH_FAILURE_INT(item)                                  \
 472   fail_stack.stack[fail_stack.avail++].integer = (item)
 473
 474 /* Push a fail_stack_elt_t value onto the failure stack.
 475    Assumes the variable `fail_stack'.  Probably should only
 476    be called from within `PUSH_FAILURE_POINT'.  */
 477 #define PUSH_FAILURE_ELT(item)                                  \
 478   fail_stack.stack[fail_stack.avail++] =  (item)
 479
 480 /* These three POP... operations complement the three PUSH... operations.
 481    All assume that `fail_stack' is nonempty.  */
 482 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
 483 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
 484 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
 485
 486 /* Test if at very beginning or at very end of `string2'. */
 487 #define AT_STRINGS_BEG(d) ((d) == string2 || !size2)
 488 #define AT_STRINGS_END(d) ((d) == end2)
 489
 490 /* Checking for end of string */
 491 #define PREFETCH() \
 492 do { \
 493     if ( d == end2 ) { \
 494         goto fail; \
 495     } \
 496 } while (0)
 497
 498
 499 sal_Bool
 500 Regexpr::iswordbegin(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize)
 501 {
 502    if ( d == string || ! ssize ) return true;
 503
 504    if ( !unicode::isAlphaDigit(d[-1]) && unicode::isAlphaDigit(d[0])) {
 505     return true;
 506    }
 507    return false;
 508 }
 509
 510 sal_Bool
 511 Regexpr::iswordend(const sal_Unicode *d, sal_Unicode *string, sal_Int32 ssize)
 512 {
 513    if ( d == (string+ssize) ) return true;
 514
 515    if ( !unicode::isAlphaDigit(d[0]) && unicode::isAlphaDigit(d[-1])) {
 516     return true;
 517    }
 518    return false;
 519 }
 520
 521 /* Push the information about the state we will need
 522    if we ever fail back to it.
 523
 524    Requires variables fail_stack, regstart, regend, and reg_info
 525    be declared.  DOUBLE_FAIL_STACK requires `destination'
 526    be declared.
 527
 528    Does `return FAILURE_CODE' if runs out of memory.  */
 529
 530 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)   \
 531   do {                                                                  \
 532     char *destination;                                                  \
 533     /* Must be int, so when we don't save any registers, the arithmetic \
 534        of 0 + -1 isn't done as unsigned.  */                            \
 535     /* Can't be int, since there is not a shred of a guarantee that int \
 536        is wide enough to hold a value of something to which pointer can \
 537        be assigned */                                                   \
 538     sal_uInt32 this_reg;                                                \
 539                                                                         \
 540     /* Ensure we have enough space allocated for what we will push.  */ \
 541     while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) {                  \
 542         if (!DOUBLE_FAIL_STACK(fail_stack))                            \
 543           return failure_code;                                          \
 544       }                                                                 \
 545                                                                         \
 546     /* Push the info, starting with the registers.  */                  \
 547     if (1)                                                              \
 548       for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \
 549            this_reg++) {                                                 \
 550           PUSH_FAILURE_POINTER(regstart[this_reg]);                    \
 551                                                                         \
 552           PUSH_FAILURE_POINTER (regend[this_reg]);                      \
 553                                                                         \
 554           PUSH_FAILURE_ELT(reg_info[this_reg].word);                   \
 555         }                                                               \
 556                                                                         \
 557     PUSH_FAILURE_INT(lowest_active_reg);                               \
 558                                                                         \
 559     PUSH_FAILURE_INT(highest_active_reg);                              \
 560                                                                         \
 561     PUSH_FAILURE_POINTER(pattern_place);                               \
 562                                                                         \
 563     PUSH_FAILURE_POINTER(string_place);                                \
 564                                                                         \
 565   } while (0)
 566
/* Pops what PUSH_FAILURE_POINT pushes.

   We restore into the parameters, all of which should be lvalues:
     STR -- the saved data position.
     PAT -- the saved pattern position.
     LOW_REG, HIGH_REG -- the lowest and highest active registers.
     REGSTART, REGEND -- arrays of string positions.
     REG_INFO -- array of information about each subexpression.

   Also assumes the variables `fail_stack' and (if debugging), `bufp',
   `pend', `string2', and `size2'.  */

 578
 579 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info) {\
 580     sal_uInt32 this_reg;                                                \
 581     sal_Unicode *string_temp;                                     \
 582                                                                         \
 583   assert(!FAIL_STACK_EMPTY());                                        \
 584                                                                         \
 585   /* Remove failure points and point to how many regs pushed.  */       \
 586   assert(fail_stack.avail >= NUM_NONREG_ITEMS);                        \
 587                                                                         \
 588   /* If the saved string location is NULL, it came from an              \
 589      on_failure_keep_string_jump opcode, and we want to throw away the  \
 590      saved NULL, thus retaining our current position in the string.  */ \
 591   string_temp = POP_FAILURE_POINTER();                                 \
 592   if (string_temp != NULL)                                              \
 593     str = (const sal_Unicode *) string_temp;                                   \
 594                                                                         \
 595   pat = (sal_Unicode *) POP_FAILURE_POINTER();                       \
 596                                                                         \
 597   /* Restore register info.  */                                         \
 598   high_reg = (sal_uInt32) POP_FAILURE_INT();                         \
 599                                                                         \
 600   low_reg = (sal_uInt32) POP_FAILURE_INT();                          \
 601                                                                         \
 602   if (1)                                                                \
 603     for (this_reg = high_reg; this_reg >= low_reg; this_reg--) {         \
 604                                                                         \
 605         reg_info[this_reg].word = POP_FAILURE_ELT();                   \
 606                                                                         \
 607         regend[this_reg] = (const sal_Unicode *) POP_FAILURE_POINTER();       \
 608                                                                         \
 609         regstart[this_reg] = (const sal_Unicode *) POP_FAILURE_POINTER();     \
 610       } else {                                                          \
 611       for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) {\
 612           reg_info[this_reg].word.integer = 0;                          \
 613           regend[this_reg] = 0;                                         \
 614           regstart[this_reg] = 0;                                       \
 615         }                                                               \
 616       highest_active_reg = high_reg;                                    \
 617     }                                                                   \
 618                                                                         \
 619   set_regs_matched_done = 0;                                            \
 620 } /* POP_FAILURE_POINT */
 621
 622 inline
 623 void
 624 Regexpr::extract_number_and_incr( sal_Int32 & destination, sal_Unicode *& source )
 625 {
 626   extract_number(destination, source);
 627   source += 2;
 628 }
 629
 630
 631 inline
 632 void
 633 Regexpr::store_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg)
 634 {
 635   *loc = (sal_Unicode) op;
 636   store_number(loc + 1, arg);
 637 }
 638
 639 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2.  */
 640
 641 inline
 642 void
 643 Regexpr::store_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2)
 644 {
 645   *loc = (sal_Unicode) op;
 646   store_number(loc + 1, arg1);
 647   store_number(loc + 3, arg2);
 648 }
 649
 650 void
 651 Regexpr::insert_op1(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg, sal_Unicode *end)
 652 {
 653   register sal_Unicode *pfrom = end;
 654   register sal_Unicode *pto = end + 3;
 655
 656   while (pfrom != loc) {
 657     *--pto = *--pfrom;
 658   }
 659
 660   store_op1(op, loc, arg);
 661 }
 662
 663
 664 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2.  */
 665
 666 void
 667 Regexpr::insert_op2(re_opcode_t op, sal_Unicode *loc, sal_Int32 arg1, sal_Int32 arg2, sal_Unicode *end)
 668 {
 669   register sal_Unicode *pfrom = end;
 670   register sal_Unicode *pto = end + 5;
 671
 672   while (pfrom != loc)
 673     *--pto = *--pfrom;
 674
 675   store_op2 (op, loc, arg1, arg2);
 676 }
 677
 678 /* P points to just after a ^ in PATTERN.  Return true if that ^ comes
 679    after an alternative or a begin-subexpression.  We assume there is at
 680    least one character before the ^.  */
 681
 682 sal_Bool
 683 Regexpr::at_begline_loc_p(const sal_Unicode *local_pattern, const sal_Unicode *p)
 684 {
 685   const sal_Unicode *prev = p - 2;
 686   sal_Bool prev_prev_backslash = prev > local_pattern && prev[-1] == '\\';
 687
 688   return(
 689      /* After a subexpression?  */
 690      (*prev == (sal_Unicode)'(' && prev_prev_backslash)
 691      /* After an alternative?  */
 692      || (*prev == (sal_Unicode)'|' && prev_prev_backslash));
 693 }
 694
 695 /* The dual of at_begline_loc_p.  This one is for $.  We assume there is
 696    at least one character after the $, i.e., `P < PEND'.  */
 697
 698 sal_Bool
 699 Regexpr::at_endline_loc_p(const sal_Unicode *p, const sal_Unicode * /* pend */ )
 700 {
 701   const sal_Unicode *next = p;
 702   //sal_Bool next_backslash = *next == (sal_Unicode)'\\';
 703   //const sal_Unicode *next_next = p + 1 < pend ? p + 1 : 0;
 704
 705   return(
 706      /* Before a subexpression?  */
 707      *next == (sal_Unicode)')'
 708      // (next_backslash && next_next && *next_next == (sal_Unicode)')')
 709      /* Before an alternative?  */
 710      || *next == (sal_Unicode)'|' );
 711   //    || (next_backslash && next_next && *next_next == (sal_Unicode)'|'));
 712 }
 713
 714 reg_errcode_t
 715 Regexpr::compile_range(sal_Unicode range_start, sal_Unicode range_end, sal_Unicode *b)
 716 {
 717   sal_uInt32 this_char;
 718
 719   /* If the start is after the end, the range is empty.  */
 720   if (range_start > range_end)
 721     return REG_NOERROR;
 722
 723   /* Here we see why `this_char' has to be larger than an `sal_Unicode'
 724      -- the range is inclusive, so if `range_end' == 0xffff
 725      (assuming 16-bit characters), we would otherwise go into an infinite
 726      loop, since all characters <= 0xffff.  */
 727   for (this_char = range_start; this_char <= range_end; this_char++) {
 728     set_list_bit( sal_Unicode(this_char), b);
 729   }
 730
 731   return REG_NOERROR;
 732 }
 733
 734 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
 735    false if it's not.  */
 736
 737 sal_Bool
 738 Regexpr::group_in_compile_stack(compile_stack_type compile_stack, sal_uInt32 regnum)
 739 {
 740   sal_Int32 this_element;
 741
 742   for (this_element = compile_stack.avail - 1;
 743        this_element >= 0;
 744        this_element--) {
 745     if (compile_stack.stack[this_element].regnum == regnum) {
 746       return true;
 747     }
 748   }
 749
 750   return false;
 751 }
 752
 753
 754 Regexpr::Regexpr( const ::com::sun::star::util::SearchOptions & rOptions,
 755          ::com::sun::star::uno::Reference<
 756          ::com::sun::star::i18n::XExtendedTransliteration > XTrans)
 757 {
 758   bufp = NULL;
 759   pattern = NULL;
 760
 761   if ( rOptions.algorithmType != ::com::sun::star::util::SearchAlgorithms_REGEXP ) {
 762     return;
 763   }
 764
 765   if ( rOptions.searchString == NULL ||
 766        rOptions.searchString.getLength() <= 0) {
 767     return;
 768   }
 769
 770   pattern = (sal_Unicode *)rOptions.searchString.getStr();
 771   patsize = rOptions.searchString.getLength();
 772
 773   re_max_failures = 2000;
 774
 775   translit = XTrans;
 776   translate = translit.is() ? 1 : 0;
 777
 778   bufp = NULL;
 779
 780   isIgnoreCase = ((rOptions.transliterateFlags &
 781         ::com::sun::star::i18n::TransliterationModules_IGNORE_CASE) != 0);
 782
 783   // Compile Regular expression pattern
 784   if ( regcomp() != REG_NOERROR )
 785     {
 786       if ( bufp )
 787     {
 788       if ( bufp->buffer )
 789         free(bufp->buffer);
 790       if( bufp->fastmap )
 791         free(bufp->fastmap);
 792
 793       free(bufp);
 794       bufp = NULL;
 795         }
 796     }
 797 }
 798
 799 Regexpr::~Regexpr()
 800 {
 801   //    translit->remove();
 802   if( bufp )
 803     {
 804       if( bufp->buffer )
 805     free(bufp->buffer);
 806       if( bufp->fastmap )
 807     free(bufp->fastmap);
 808
 809       free(bufp);
 810       bufp = NULL;
 811     }
 812
 813 }
 814
 815 // sets a new line to search in (restore start/end_ptr)
 816 void
 817 Regexpr::set_line(const sal_Unicode *new_line, sal_Int32 len)
 818 {
 819   line = new_line;
 820   linelen = len;
 821 }
 822
 823 // main function for searching the pattern
 824 // returns negative or startpos and sets regs
 825 sal_Int32
 826 Regexpr::re_search(struct re_registers *regs, sal_Int32 pOffset)
 827 {
 828   // Check if pattern buffer is NULL
 829   if ( bufp == NULL ) {
 830     return(-3);
 831   }
 832
 833   sal_Int32 range;
 834   sal_Int32 startpos;
 835   sal_Int32 stoppos;
 836
 837   startpos = pOffset;
 838   if ( linelen < 0 ) {
 839     range = linelen + 1;
 840     linelen = -(linelen);
 841     stoppos = pOffset + 1;
 842   } else {
 843     range = linelen - 1;
 844     stoppos = linelen;
 845   }
 846   for ( ; ; ) {
 847     sal_Int32 val = re_match2(regs, startpos, stoppos);
 848
 849 #ifndef REGEX_MALLOC
 850 # ifdef C_ALLOCA
 851     alloca (0);
 852 # endif
 853 #endif
 854
 855     // Return success if match found
 856     if (val == 0) {
 857       break;
 858     }
 859
 860     if (val == -2) {
 861       return(-2);
 862     }
 863
 864     // If match only beginning of string (startpos)
 865     if (!range) {
 866       break;
 867     }
 868
 869     // If search match from startpos to startpos+range
 870     else if (range > 0) {       // Forward string search
 871       range--;
 872       startpos++;
 873     } else {            // Reverse string search
 874       range++;
 875       startpos--;
 876     }
 877   }
 878
 879   if ( regs->num_of_match > 0 )
 880     return(0);
 881   else
 882     return(-1);
 883 }
 884
 885 sal_Int32
 886 Regexpr::regcomp()
 887 {
 888   bufp = (struct re_pattern_buffer *)malloc(sizeof(struct re_pattern_buffer));
 889   if ( bufp == NULL ) {
 890     return(-1);
 891   }
 892
 893   bufp->buffer = 0;
 894   bufp->allocated = 0;
 895   bufp->used = 0;
 896
 897   //bufp->fastmap = (sal_Unicode*) malloc((1 << BYTEWIDTH) * sizeof(sal_Unicode));
 898   // No fastmap with Unicode
 899   bufp->fastmap = NULL;
 900
 901   return(regex_compile());
 902 }
 903
 904 sal_Int32
 905 Regexpr::regex_compile()
 906 {
 907   register sal_Unicode c, c1;
 908   const sal_Unicode *p1;
 909   register sal_Unicode *b;
 910
 911   /* Keeps track of unclosed groups.  */
 912   compile_stack_type compile_stack;
 913
 914   /* Points to the current (ending) position in the pattern.  */
 915   const sal_Unicode *p = pattern;
 916   const sal_Unicode *pend = pattern + patsize;
 917
 918   /* Address of the count-byte of the most recently inserted `exactn'
 919      command.  This makes it possible to tell if a new exact-match
 920      character can be added to that command or if the character requires
 921      a new `exactn' command.  */
 922   sal_Unicode *pending_exact = 0;
 923
 924   /* Address of start of the most recently finished expression.
 925      This tells, e.g., postfix * where to find the start of its
 926      operand.  Reset at the beginning of groups and alternatives.  */
 927   sal_Unicode *laststart = 0;
 928
 929   /* Address of beginning of regexp, or inside of last group.  */
 930   sal_Unicode *begalt;
 931
 932   /* Place in the uncompiled pattern (i.e., the {) to
 933      which to go back if the interval is invalid.  */
 934   const sal_Unicode *beg_interval;
 935
 936   /* Address of the place where a forward jump should go to the end of
 937      the containing expression.  Each alternative of an `or' -- except the
 938      last -- ends with a forward jump of this sort.  */
 939   sal_Unicode *fixup_alt_jump = 0;
 940
 941   /* Counts open-groups as they are encountered.  Remembered for the
 942      matching close-group on the compile stack, so the same register
 943      number is put in the stop_memory as the start_memory.  */
 944   sal_Int32 regnum = 0;
 945
 946   /* Initialize the compile stack.  */
 947   compile_stack.stack = (compile_stack_elt_t *)malloc(INIT_COMPILE_STACK_SIZE * sizeof(compile_stack_elt_t));
 948   if (compile_stack.stack == NULL)
 949     return(REG_ESPACE);
 950
 951   compile_stack.size = INIT_COMPILE_STACK_SIZE;
 952   compile_stack.avail = 0;
 953
 954   /* Initialize the pattern buffer.  */
 955   bufp->fastmap_accurate = 0;
 956   bufp->not_bol = 0;
 957   bufp->not_eol = 0;
 958   bufp->newline_anchor = 1;
 959
 960   /* Set `used' to zero, so that if we return an error, the pattern
 961      printer (for debugging) will think there's no pattern.  We reset it
 962      at the end.  */
 963   bufp->used = 0;
 964
 965   /* Always count groups. */
 966   bufp->re_nsub = 0;
 967
 968   if (bufp->allocated == 0) {
 969     if (bufp->buffer) {
 970       /* If zero allocated, but buffer is non-null, try to realloc
 971      enough space.  This loses if buffer's address is bogus, but
 972      that is the user's responsibility.  */
 973       bufp->buffer = (sal_Unicode *)realloc(bufp->buffer, INIT_BUF_SIZE * sizeof(sal_Unicode));
 974     } else { /* Caller did not allocate a buffer.  Do it for them.  */
 975       bufp->buffer = (sal_Unicode *)malloc(INIT_BUF_SIZE * sizeof(sal_Unicode));
 976     }
 977     if (!bufp->buffer) FREE_STACK_RETURN(REG_ESPACE);
 978
 979     bufp->allocated = INIT_BUF_SIZE;
 980   }
 981
 982   begalt = b = bufp->buffer;
 983
 984   /* Loop through the uncompiled pattern until we're at the end.  */
 985   while (p != pend) {
 986     PATFETCH_RAW(c);
 987
 988     switch (c) {
 989     case (sal_Unicode)'^': {
 990       if (   /* If at start of pattern, it's an operator.  */
 991       p == pattern + 1
 992       /* Otherwise, depends on what's come before.  */
 993       || at_begline_loc_p(pattern, p))
 994     BUF_PUSH(begline);
 995       else
 996     goto normal_char;
 997     }
 998     break;
 999
1000     case (sal_Unicode)'$': {
1001       if (   /* If at end of pattern, it's an operator.  */
1002       p == pend
1003       /* Otherwise, depends on what's next.  */
1004       || at_endline_loc_p(p, pend)) {
1005     BUF_PUSH(endline);
1006       } else {
1007     goto normal_char;
1008       }
1009     }
1010     break;
1011
1012     case (sal_Unicode)'+':
1013     case (sal_Unicode)'?':
1014     case (sal_Unicode)'*':
1015       /* If there is no previous pattern... */
1016       if (!laststart) {
1017     goto normal_char;
1018       }
1019
1020       {
1021     /* Are we optimizing this jump?  */
1022     sal_Bool keep_string_p = false;
1023
1024     /* 1 means zero (many) matches is allowed.  */
1025     sal_Unicode zero_times_ok = 0, many_times_ok = 0;
1026
1027     /* If there is a sequence of repetition chars, collapse it
1028        down to just one (the right one).  We can't combine
1029        interval operators with these because of, e.g., `a{2}*',
1030        which should only match an even number of `a's.  */
1031
1032     for (;;) {
1033       zero_times_ok |= c != (sal_Unicode)'+';
1034       many_times_ok |= c != (sal_Unicode)'?';
1035
1036       if (p == pend)
1037         break;
1038
1039       PATFETCH_RAW(c);
1040
1041       if (c == (sal_Unicode)'*' || (c == (sal_Unicode)'+'
1042                     || c == (sal_Unicode)'?')) {
1043       } else {
1044         PATUNFETCH;
1045         break;
1046       }
1047
1048       /* If we get here, we found another repeat character.  */
1049     }
1050
1051     /* Star, etc. applied to an empty pattern is equivalent
1052        to an empty pattern.  */
1053     if (!laststart) {
1054       break;
1055     }
1056
1057     /* Now we know whether or not zero matches is allowed
1058        and also whether or not two or more matches is allowed.  */
1059     if (many_times_ok) {
1060       /* More than one repetition is allowed, so put in at the
1061          end a backward relative jump from `b' to before the next
1062          jump we're going to put in below (which jumps from
1063          laststart to after this jump).
1064
1065          But if we are at the `*' in the exact sequence `.*\n',
1066          insert an unconditional jump backwards to the .,
1067          instead of the beginning of the loop.  This way we only
1068          push a failure point once, instead of every time
1069          through the loop.  */
1070       assert(p - 1 > pattern);
1071
1072       /* Allocate the space for the jump.  */
1073       GET_BUFFER_SPACE(3);
1074
1075       /* We know we are not at the first character of the pattern,
1076          because laststart was nonzero.  And we've already
1077          incremented `p', by the way, to be the character after
1078          the `*'.  Do we have to do something analogous here
1079          for null bytes, because of RE_DOT_NOT_NULL?  */
1080       if (*(p - 2) == (sal_Unicode)'.'
1081           && zero_times_ok
1082           && p < pend && *p == (sal_Unicode)'\n') {
1083         /* We have .*\n.  */
1084         STORE_JUMP(jump, b, laststart);
1085         keep_string_p = true;
1086       } else {
1087         /* Anything else.  */
1088         STORE_JUMP(maybe_pop_jump, b, laststart - 3);
1089       }
1090
1091       /* We've added more stuff to the buffer.  */
1092       b += 3;
1093     }
1094
1095     /* On failure, jump from laststart to b + 3, which will be the
1096        end of the buffer after this jump is inserted.  */
1097     GET_BUFFER_SPACE(3);
1098     INSERT_JUMP(keep_string_p ? on_failure_keep_string_jump
1099             : on_failure_jump,
1100             laststart, b + 3);
1101     pending_exact = 0;
1102     b += 3;
1103
1104     if (!zero_times_ok) {
1105       /* At least one repetition is required, so insert a
1106          `dummy_failure_jump' before the initial
1107          `on_failure_jump' instruction of the loop. This
1108          effects a skip over that instruction the first time
1109          we hit that loop.  */
1110       GET_BUFFER_SPACE(3);
1111       INSERT_JUMP(dummy_failure_jump, laststart, laststart + 6);
1112       b += 3;
1113     }
1114       }
1115       break;
1116
1117     case (sal_Unicode)'.':
1118       laststart = b;
1119       BUF_PUSH(anychar);
1120       break;
1121
1122
1123     case (sal_Unicode)'[': {
1124       sal_Bool have_range = false;
1125       sal_Unicode last_char = 0xffff;
1126       sal_Unicode first_range = 0xffff;
1127       sal_Unicode second_range = 0xffff;
1128       sal_Int16 bsiz;
1129
1130       if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1131
1132       /* Ensure that we have enough space to push a charset: the
1133      opcode, the length count, and the bitset;
1134      1 + 1 + (1 << BYTEWIDTH) / BYTEWIDTH "bytes" in all.  */
1135       bsiz = 2 + ((1 << BYTEWIDTH) / BYTEWIDTH);
1136       GET_BUFFER_SPACE(bsiz);
1137
1138       laststart = b;
1139
1140       /* We test `*p == '^' twice, instead of using an if
1141      statement, so we only need one BUF_PUSH.  */
1142       BUF_PUSH (*p == (sal_Unicode)'^' ? charset_not : charset);
1143       if (*p == (sal_Unicode)'^')
1144     p++;
1145
1146       /* Remember the first position in the bracket expression.  */
1147       p1 = p;
1148
1149       /* Push the number of "bytes" in the bitmap.  */
1150       BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
1151
1152       /* Clear the whole map.  */
1153       memset(b, 0, ((1 << BYTEWIDTH) / BYTEWIDTH) * sizeof(sal_Unicode));
1154
1155       /* Read in characters and ranges, setting map bits.  */
1156       for (;;) {
1157     if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1158
1159     PATFETCH_RAW(c);
1160
1161     if ( c == (sal_Unicode)'\\' ) {
1162
1163       PATFETCH_RAW(c);
1164
1165       if ( c == (sal_Unicode)'x' ) {
1166         sal_Int32 UniChar = -1;
1167
1168         GET_HEX_NUMBER(UniChar);
1169         if (UniChar < 0 || UniChar > 0xffff) FREE_STACK_RETURN(REG_BADPAT);
1170         c = (sal_Unicode) UniChar;
1171         last_char = c;
1172         set_list_bit(last_char, b);
1173       } else {
1174         last_char = c;
1175         set_list_bit(last_char, b);
1176       }
1177     } else if (c == (sal_Unicode)']') {
1178       /* Could be the end of the bracket expression.  If it's
1179          not (i.e., when the bracket expression is `[]' so
1180          far), the ']' character bit gets set way below.  */
1181         break;
1182     } else if ( c == (sal_Unicode)'-' ) {
1183       if ( !have_range ) {
1184         if ( last_char != 0xffff ) {
1185           first_range = last_char;
1186           have_range = true;
1187           continue;
1188         } else {
1189           last_char = (sal_Unicode)'-';
1190           set_list_bit(last_char, b);
1191         }
1192       }
1193         }
1194
1195     /* See if we're at the beginning of a possible character
1196        class.  */
1197     else if (c == (sal_Unicode)':' && p[-2] == (sal_Unicode)'[') {
1198       /* Leave room for the null.  */
1199       sal_Unicode str[CHAR_CLASS_MAX_LENGTH + 1];
1200
1201       PATFETCH_RAW(c);
1202       c1 = 0;
1203
1204       /* If pattern is `[[:'.  */
1205       if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1206
1207       str[c1++] = c;
1208       for (;;) {
1209         PATFETCH_RAW(c);
1210         if ((c == (sal_Unicode)':' && *p == (sal_Unicode)']') || p == pend)
1211           break;
1212         if (c1 < CHAR_CLASS_MAX_LENGTH)
1213           str[c1++] = c;
1214         else
1215                 /* This is in any case an invalid class name.  */
1216           str[0] = (sal_Unicode)'\0';
1217       }
1218       str[c1] = (sal_Unicode)'\0';
1219
1220       /* If isn't a word bracketed by `[:' and `:]':
1221          undo the ending character, the letters, and leave
1222          the leading `:' and `[' (but set bits for them).  */
1223       if (c == (sal_Unicode)':' && *p == (sal_Unicode)']') {
1224         sal_Int32 ch;
1225         // no support for GRAPH, PUNCT, or XDIGIT yet
1226         sal_Bool is_alnum = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"alnum").getStr());
1227         sal_Bool is_alpha = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"alpha").getStr());
1228         sal_Bool is_cntrl = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"cntrl").getStr());
1229         sal_Bool is_digit = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"digit").getStr());
1230         sal_Bool is_lower = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"lower").getStr());
1231         sal_Bool is_print = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"print").getStr());
1232         sal_Bool is_space = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"space").getStr());
1233         sal_Bool is_upper = STREQ(str, ::rtl::OUString::createFromAscii((const sal_Char*)"upper").getStr());
1234
1235         if (!(is_alnum || is_alpha || is_cntrl ||
1236           is_digit || is_lower || is_print || is_space || is_upper) )
1237           FREE_STACK_RETURN(REG_ECTYPE);
1238
1239         /* Throw away the ] at the end of the character
1240            class.  */
1241         PATFETCH_RAW(c);
1242
1243         if (p == pend) FREE_STACK_RETURN(REG_EBRACK);
1244
1245         for (ch = 0; ch < 1 << BYTEWIDTH; ch++) {
1246                 /* This was split into 3 if's to
1247                    avoid an arbitrary limit in some compiler.  */
1248           if (   (is_alnum  && unicode::isAlphaDigit(sal_Unicode(ch))) ||
1249              (is_alpha  && unicode::isAlpha(sal_Unicode(ch))) ||
1250              (is_cntrl  && unicode::isControl(sal_Unicode(ch))))
1251         set_list_bit(sal_Unicode(ch), b);
1252           if (   (is_digit  && unicode::isDigit(sal_Unicode(ch))) ||
1253              (is_lower  && unicode::isLower(sal_Unicode(ch))) ||
1254              (is_print  && unicode::isPrint(sal_Unicode(ch))))
1255         set_list_bit(sal_Unicode(ch), b);
1256           if (   (is_space  && unicode::isSpace(sal_Unicode(ch))) ||
1257              (is_upper  && unicode::isUpper(sal_Unicode(ch))) )
1258         set_list_bit(sal_Unicode(ch), b);
1259           if ( isIgnoreCase && (is_upper || is_lower) &&
1260              (unicode::isUpper(sal_Unicode(ch)) || unicode::isLower(sal_Unicode(ch))))
1261         set_list_bit(sal_Unicode(ch), b);
1262         }
1263         break;
1264       } else {
1265         p = p1+1;
1266         last_char = (sal_Unicode)':';
1267         set_list_bit(last_char, b);
1268       }
1269     } else {
1270       last_char = c;
1271       set_list_bit(last_char, b);
1272     }
1273     if ( have_range ) {
1274       if ( last_char != 0xffff ) {
1275         second_range = last_char;
1276         have_range = false;
1277         compile_range(first_range, second_range, b);
1278       } else FREE_STACK_RETURN(REG_EBRACK);
1279     } else {
1280       if ( last_char != 0xffff ) {
1281         set_list_bit(last_char, b);
1282       } else FREE_STACK_RETURN(REG_EBRACK);
1283     }
1284       }
1285
1286       /* Discard any (non)matching list bytes that are all 0 at the
1287      end of the map.  Decrease the map-length byte too.  */
1288       bsiz = b[-1];
1289       while ((sal_Int16) bsiz > 0 && b[bsiz - 1] == 0)
1290     bsiz--;
1291       b[-1] = (sal_Unicode)bsiz;
1292       b += bsiz;
1293     }
1294     break;
1295
1296     case (sal_Unicode)'(':
1297       goto handle_open;
1298
1299     case (sal_Unicode)')':
1300       goto handle_close;
1301
1302     case (sal_Unicode)'\n':
1303       goto normal_char;
1304
1305     case (sal_Unicode)'|':
1306       goto handle_alt;
1307
1308     case (sal_Unicode)'{':
1309       goto handle_interval;
1310
1311     case (sal_Unicode)'\\':
1312       if (p == pend) FREE_STACK_RETURN(REG_EESCAPE);
1313
1314       /* Do not translate the character after the \, so that we can
1315      distinguish, e.g., \B from \b, even if we normally would
1316      translate, e.g., B to b.  */
1317       PATFETCH_RAW(c);
1318
1319       switch (c) {
1320       case (sal_Unicode)'(':
1321     goto normal_backslash;
1322
1323       handle_open:
1324     bufp->re_nsub++;
1325     regnum++;
1326
1327     if (COMPILE_STACK_FULL) {
1328       compile_stack.stack = (compile_stack_elt_t *)realloc(compile_stack.stack, (compile_stack.size << 1) * sizeof(compile_stack_elt_t));
1329       if (compile_stack.stack == NULL) return(REG_ESPACE);
1330
1331       compile_stack.size <<= 1;
1332     }
1333
1334     /* These are the values to restore when we hit end of this
1335        group.  They are all relative offsets, so that if the
1336        whole pattern moves because of realloc, they will still
1337        be valid.  */
1338     COMPILE_STACK_TOP.begalt_offset = begalt - bufp->buffer;
1339     COMPILE_STACK_TOP.fixup_alt_jump
1340       = fixup_alt_jump ? fixup_alt_jump - bufp->buffer + 1 : 0;
1341     COMPILE_STACK_TOP.laststart_offset = b - bufp->buffer;
1342     COMPILE_STACK_TOP.regnum = regnum;
1343
1344     /* We will eventually replace the 0 with the number of
1345        groups inner to this one.  But do not push a
1346        start_memory for groups beyond the last one we can
1347        represent in the compiled pattern.  */
1348     if (regnum <= MAX_REGNUM) {
1349       COMPILE_STACK_TOP.inner_group_offset = b - bufp->buffer + 2;
1350       BUF_PUSH_3 (start_memory, regnum, 0);
1351     }
1352
1353     compile_stack.avail++;
1354
1355     fixup_alt_jump = 0;
1356     laststart = 0;
1357     begalt = b;
1358     /* If we've reached MAX_REGNUM groups, then this open
1359        won't actually generate any code, so we'll have to
1360        clear pending_exact explicitly.  */
1361     pending_exact = 0;
1362     break;
1363
1364
1365       case (sal_Unicode)')':
1366     goto normal_backslash;
1367
1368     // unreachable (after goto):
1369 #if 0
1370     if (COMPILE_STACK_EMPTY) {
1371       FREE_STACK_RETURN(REG_ERPAREN);
1372     }
1373 #endif
1374
1375       handle_close:
1376     if (fixup_alt_jump) {
1377       /* Push a dummy failure point at the end of the
1378          alternative for a possible future
1379          `pop_failure_jump' to pop.  See comments at
1380          `push_dummy_failure' in `re_match2'.  */
1381       BUF_PUSH(push_dummy_failure);
1382
1383       /* We allocated space for this jump when we assigned
1384          to `fixup_alt_jump', in the `handle_alt' case below.  */
1385       STORE_JUMP(jump_past_alt, fixup_alt_jump, b - 1);
1386     }
1387
1388     /* See similar code for backslashed left paren above.  */
1389     if (COMPILE_STACK_EMPTY) {
1390       FREE_STACK_RETURN(REG_ERPAREN);
1391     }
1392
1393     /* Since we just checked for an empty stack above, this
1394        ``can't happen''.  */
1395     assert (compile_stack.avail != 0);
1396
1397     {
1398       /* We don't just want to restore into `regnum', because
1399          later groups should continue to be numbered higher,
1400          as in `(ab)c(de)' -- the second group is #2.  */
1401       sal_Int32 this_group_regnum;
1402
1403       compile_stack.avail--;
1404       begalt = bufp->buffer + COMPILE_STACK_TOP.begalt_offset;
1405       fixup_alt_jump
1406         = COMPILE_STACK_TOP.fixup_alt_jump
1407         ? bufp->buffer + COMPILE_STACK_TOP.fixup_alt_jump - 1
1408         : 0;
1409       laststart = bufp->buffer + COMPILE_STACK_TOP.laststart_offset;
1410       this_group_regnum = COMPILE_STACK_TOP.regnum;
1411       /* If we've reached MAX_REGNUM groups, then this open
1412          won't actually generate any code, so we'll have to
1413          clear pending_exact explicitly.  */
1414       pending_exact = 0;
1415
1416       /* We're at the end of the group, so now we know how many
1417          groups were inside this one.  */
1418       if (this_group_regnum <= MAX_REGNUM) {
1419         sal_Unicode *inner_group_loc
1420           = bufp->buffer + COMPILE_STACK_TOP.inner_group_offset;
1421
1422         *inner_group_loc = sal::static_int_cast<sal_Unicode>( regnum - this_group_regnum );
1423         BUF_PUSH_3 (stop_memory, this_group_regnum,
1424             regnum - this_group_regnum);
1425       }
1426     }
1427     break;
1428
1429
1430       case (sal_Unicode)'|':                    /* `\|'.
1431                          * */
1432     goto normal_backslash;
1433       handle_alt:
1434
1435     /* Insert before the previous alternative a jump which
1436        jumps to this alternative if the former fails.  */
1437     GET_BUFFER_SPACE (3);
1438     INSERT_JUMP (on_failure_jump, begalt, b + 6);
1439     pending_exact = 0;
1440     b += 3;
1441
1442     /* The alternative before this one has a jump after it
1443        which gets executed if it gets matched.  Adjust that
1444        jump so it will jump to this alternative's analogous
1445        jump (put in below, which in turn will jump to the next
1446        (if any) alternative's such jump, etc.).  The last such
1447        jump jumps to the correct final destination.  A picture:
1448        _____ _____
1449        |   | |   |
1450        |   v |   v
1451        a | b   | c
1452
1453        If we are at `b', then fixup_alt_jump right now points to a
1454        three-byte space after `a'.  We'll put in the jump, set
1455        fixup_alt_jump to right after `b', and leave behind three
1456        bytes which we'll fill in when we get to after `c'.  */
1457
1458     if (fixup_alt_jump)
1459       STORE_JUMP (jump_past_alt, fixup_alt_jump, b);
1460
1461     /* Mark and leave space for a jump after this alternative,
1462        to be filled in later either by next alternative or
1463        when know we're at the end of a series of alternatives.  */
1464     fixup_alt_jump = b;
1465     GET_BUFFER_SPACE (3);
1466     b += 3;
1467
1468     laststart = 0;
1469     begalt = b;
1470     break;
1471
1472
1473       case (sal_Unicode)'{':
1474     goto normal_backslash;
1475
1476       handle_interval:
1477     {
1478       /* allows intervals.  */
1479       /* At least (most) this many matches must be made.  */
1480       sal_Int32 lower_bound = -1, upper_bound = -1;
1481
1482       beg_interval = p - 1;
1483
1484       if (p == pend) {
1485         goto unfetch_interval;
1486       }
1487
1488       GET_UNSIGNED_NUMBER(lower_bound);
1489
1490       if (c == (sal_Unicode)',') {
1491         GET_UNSIGNED_NUMBER(upper_bound);
1492         if (upper_bound < 0) upper_bound = RE_DUP_MAX;
1493       } else
1494         /* Interval such as `{1}' => match exactly once. */
1495         upper_bound = lower_bound;
1496
1497       if (lower_bound < 0 || upper_bound > RE_DUP_MAX
1498           || lower_bound > upper_bound) {
1499         goto unfetch_interval;
1500       }
1501
1502       if (c != (sal_Unicode)'}') {
1503         goto unfetch_interval;
1504       }
1505
1506       /* We just parsed a valid interval.  */
1507
1508       /* If it's invalid to have no preceding re.  */
1509       if (!laststart) {
1510         goto unfetch_interval;
1511       }
1512
1513       /* If the upper bound is zero, don't want to succeed at
1514          all; jump from `laststart' to `b + 3', which will be
1515          the end of the buffer after we insert the jump.  */
1516       if (upper_bound == 0) {
1517         GET_BUFFER_SPACE(3);
1518         INSERT_JUMP(jump, laststart, b + 3);
1519         b += 3;
1520       }
1521
1522       /* Otherwise, we have a nontrivial interval.  When
1523          we're all done, the pattern will look like:
1524          set_number_at <jump count> <upper bound>
1525          set_number_at <succeed_n count> <lower bound>
1526          succeed_n <after jump addr> <succeed_n count>
1527          <body of loop>
1528          jump_n <succeed_n addr> <jump count>
1529          (The upper bound and `jump_n' are omitted if
1530          `upper_bound' is 1, though.)  */
1531       else {
1532         /* If the upper bound is > 1, we need to insert
1533            more at the end of the loop.  */
1534         unsigned nbytes = 10 + (upper_bound > 1) * 10;
1535
1536         GET_BUFFER_SPACE(nbytes);
1537
1538         /* Initialize lower bound of the `succeed_n', even
1539            though it will be set during matching by its
1540            attendant `set_number_at' (inserted next),
1541            because `re_compile_fastmap' needs to know.
1542            Jump to the `jump_n' we might insert below.  */
1543         INSERT_JUMP2(succeed_n, laststart,
1544              b + 5 + (upper_bound > 1) * 5,
1545              lower_bound);
1546         b += 5;
1547
1548         /* Code to initialize the lower bound.  Insert
1549            before the `succeed_n'.  The `5' is the last two
1550            bytes of this `set_number_at', plus 3 bytes of
1551            the following `succeed_n'.  */
1552         insert_op2(set_number_at, laststart, 5, lower_bound, b);
1553         b += 5;
1554
1555         if (upper_bound > 1) {
1556                 /* More than one repetition is allowed, so
1557                    append a backward jump to the `succeed_n'
1558                    that starts this interval.
1559
1560                    When we've reached this during matching,
1561                    we'll have matched the interval once, so
1562                    jump back only `upper_bound - 1' times.  */
1563           STORE_JUMP2(jump_n, b, laststart + 5,
1564               upper_bound - 1);
1565           b += 5;
1566
1567                 /* The location we want to set is the second
1568                    parameter of the `jump_n'; that is `b-2' as
1569                    an absolute address.  `laststart' will be
1570                    the `set_number_at' we're about to insert;
1571                    `laststart+3' the number to set, the source
1572                    for the relative address.  But we are
1573                    inserting into the middle of the pattern --
1574                    so everything is getting moved up by 5.
1575                    Conclusion: (b - 2) - (laststart + 3) + 5,
1576                    i.e., b - laststart.
1577
1578                    We insert this at the beginning of the loop
1579                    so that if we fail during matching, we'll
1580                    reinitialize the bounds.  */
1581           insert_op2(set_number_at, laststart, b - laststart,
1582              upper_bound - 1, b);
1583           b += 5;
1584         }
1585       }
1586       pending_exact = 0;
1587       beg_interval = NULL;
1588     }
1589     break;
1590
1591       unfetch_interval:
1592     /* If an invalid interval, match the characters as literals.  */
1593     assert (beg_interval);
1594     p = beg_interval;
1595     beg_interval = NULL;
1596
1597     /* normal_char and normal_backslash need `c'.  */
1598     PATFETCH_RAW(c);
1599
1600     goto normal_char;
1601
1602       case (sal_Unicode)'`':
1603     BUF_PUSH(begbuf);
1604     break;
1605
1606       case (sal_Unicode)'\'':
1607     BUF_PUSH(endbuf);
1608     break;
1609
1610       case (sal_Unicode)'1': case (sal_Unicode)'2':
1611       case (sal_Unicode)'3': case (sal_Unicode)'4':
1612       case (sal_Unicode)'5': case (sal_Unicode)'6':
1613       case (sal_Unicode)'7': case (sal_Unicode)'8':
1614       case (sal_Unicode)'9':
1615     c1 = c - (sal_Unicode)'0';
1616
1617     if (c1 > regnum)
1618       FREE_STACK_RETURN(REG_ESUBREG);
1619
1620     /* Can't back reference to a subexpression if inside of it.  */
1621     if (group_in_compile_stack(compile_stack, (sal_uInt32) c1)) {
1622       goto normal_char;
1623     }
1624
1625     laststart = b;
1626     BUF_PUSH_2(duplicate, c1);
1627     break;
1628
1629
1630       case (sal_Unicode)'+':
1631       case (sal_Unicode)'?':
1632     goto normal_backslash;
1633
1634       case (sal_Unicode)'x':            // Unicode char
1635     {
1636       sal_Int32 UniChar = -1;
1637
1638       GET_HEX_NUMBER(UniChar);
1639       if (UniChar < 0 || UniChar > 0xffff) FREE_STACK_RETURN(REG_BADPAT);
1640       c = (sal_Unicode) UniChar;
1641       goto normal_char;
1642     }
1643     // break;   // unreachable - see goto above
1644
1645       case (sal_Unicode)'<':            // begin Word boundary
1646     BUF_PUSH(wordbeg);
1647     break;
1648
1649       case (sal_Unicode)'>':            // end Word boundary
1650     BUF_PUSH(wordend);
1651     break;
1652
1653       case (sal_Unicode)'n':
1654     c = 0x0a;
1655     goto normal_char;
1656
1657       case (sal_Unicode)'t':
1658     c = 0x09;
1659     goto normal_char;
1660
1661       default:
1662       normal_backslash:
1663     goto normal_char;
1664       }
1665       break;
1666
1667     default:
1668       /* Expects the character in `c'.  */
1669     normal_char:
1670       /* If no exactn currently being built.  */
1671       if ( pending_exact == NULL
1672
1673        /* If last exactn not at current position.  */
1674        || pending_exact + *pending_exact + 1 != b
1675
1676        /* We have only one sal_Unicode char following the
1677           exactn for the count.  */
1678        || *pending_exact == (1 << BYTEWIDTH) - 1
1679
1680        /* If followed by a repetition operator.  */
1681        || *p == (sal_Unicode)'*' || *p == (sal_Unicode)'^'
1682        || *p == (sal_Unicode)'+' || *p == (sal_Unicode)'?'
1683        || *p == (sal_Unicode) '{' ) {
1684     /* Start building a new exactn.  */
1685     laststart = b;
1686     BUF_PUSH_2(exactn, 0);
1687     pending_exact = b - 1;
1688       }
1689
1690       if ( translate ) {
1691         try {
1692             sal_Unicode tmp = translit->transliterateChar2Char(c);
1693             BUF_PUSH(tmp);
1694             (*pending_exact)++;
1695         } catch (::com::sun::star::i18n::MultipleCharsOutputException e) {
1696             ::rtl::OUString o2( translit->transliterateChar2String( c));
1697             sal_Int32 len2 = o2.getLength();
1698             const sal_Unicode * k2 = o2.getStr();
1699             for (sal_Int32 nmatch = 0; nmatch < len2; nmatch++) {
1700               BUF_PUSH(k2[nmatch]);
1701               (*pending_exact)++;
1702             }
1703         }
1704       } else {
1705     BUF_PUSH(c);
1706     (*pending_exact)++;
1707       }
1708       break;
1709     } /* switch (c) */
1710   } /* while p != pend */
1711
1712   /* Through the pattern now.  */
1713
1714   if (fixup_alt_jump)
1715     STORE_JUMP(jump_past_alt, fixup_alt_jump, b);
1716
1717   if (!COMPILE_STACK_EMPTY)
1718     FREE_STACK_RETURN(REG_EPAREN);
1719
1720   // Assumes no backtracking
1721   BUF_PUSH(succeed);
1722
1723   if ( compile_stack.stack )
1724     free(compile_stack.stack);
1725   compile_stack.stack = NULL;
1726
1727   /* We have succeeded; set the length of the buffer.  */
1728   bufp->used = b - bufp->buffer;
1729
1730   return REG_NOERROR;
1731 } /* regex_compile */
1732
1733 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
1734    bytes; nonzero otherwise.  */
1735
1736 sal_Int32
1737 Regexpr::bcmp_translate(const sal_Unicode *s1, const sal_Unicode *s2, sal_Int32 len)
1738 {
1739   for (sal_Int32 nmatch = 0; nmatch < len; nmatch++) {
1740     if (*s1++ != *s2++) {
1741       return(1);
1742     }
1743   }
1744
1745   return(0);
1746 }
1747
1748
1749 /* We are passed P pointing to a register number after a start_memory.
1750
1751    Return true if the pattern up to the corresponding stop_memory can
1752    match the empty string, and false otherwise.
1753
1754    If we find the matching stop_memory, sets P to point to one past its number.
1755    Otherwise, sets P to an undefined byte less than or equal to END.
1756
1757    We don't handle duplicates properly (yet).  */
1758
1759 sal_Bool
1760 Regexpr::group_match_null_string_p(sal_Unicode **p, sal_Unicode *end, register_info_type *reg_info)
1761 {
1762   sal_Int32 mcnt;
1763 /* Point to after the args to the start_memory.  */
1764     sal_Unicode *p1 = *p + 2;
1765
1766     while (p1 < end) {
1767     /* Skip over opcodes that can match nothing, and return true or
1768        false, as appropriate, when we get to one that can't, or to the
1769                       matching stop_memory.  */
1770
1771       switch ((re_opcode_t) *p1) {
1772     /* Could be either a loop or a series of alternatives.  */
1773       case on_failure_jump:
1774     p1++;
1775     extract_number_and_incr(mcnt, p1);
1776
1777     /* If the next operation is not a jump backwards in the
1778        pattern.  */
1779
1780     if (mcnt >= 0) {
1781       /* Go through the on_failure_jumps of the alternatives,
1782          seeing if any of the alternatives cannot match nothing.
1783          The last alternative starts with only a jump,
1784          whereas the rest start with on_failure_jump and end
1785          with a jump, e.g., here is the pattern for `a|b|c':
1786
1787          /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
1788          /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
1789          /exactn/1/c
1790
1791          So, we have to first go through the first (n-1)
1792          alternatives and then deal with the last one separately.  */
1793
1794
1795       /* Deal with the first (n-1) alternatives, which start
1796          with an on_failure_jump (see above) that jumps to right
1797          past a jump_past_alt.  */
1798
1799       while ((re_opcode_t) p1[mcnt-3] == jump_past_alt) {
1800         /* `mcnt' holds how many bytes long the alternative
1801            is, including the ending `jump_past_alt' and
1802            its number.  */
1803
1804         if (!alt_match_null_string_p(p1, p1 + mcnt - 3, reg_info))
1805           return false;
1806
1807         /* Move to right after this alternative, including the
1808            jump_past_alt.  */
1809         p1 += mcnt;
1810
1811         /* Break if it's the beginning of an n-th alternative
1812            that doesn't begin with an on_failure_jump.  */
1813         if ((re_opcode_t) *p1 != on_failure_jump)
1814           break;
1815
1816         /* Still have to check that it's not an n-th
1817            alternative that starts with an on_failure_jump.  */
1818         p1++;
1819         extract_number_and_incr(mcnt, p1);
1820         if ((re_opcode_t) p1[mcnt-3] != jump_past_alt) {
1821           /* Get to the beginning of the n-th alternative.  */
1822           p1 -= 3;
1823           break;
1824         }
1825       }
1826
1827       /* Deal with the last alternative: go back and get number
1828          of the `jump_past_alt' just before it.  `mcnt' contains
1829          the length of the alternative.  */
1830       extract_number(mcnt, p1 - 2);
1831
1832       if (!alt_match_null_string_p (p1, p1 + mcnt, reg_info))
1833         return false;
1834
1835       p1 += mcnt;       /* Get past the n-th alternative.  */
1836     } /* if mcnt > 0 */
1837     break;
1838
1839
1840       case stop_memory:
1841     assert (p1[1] == **p);
1842     *p = p1 + 2;
1843     return true;
1844
1845
1846       default:
1847     if (!common_op_match_null_string_p(&p1, end, reg_info))
1848       return false;
1849       }
1850     } /* while p1 < end */
1851
1852  return false;
1853 } /* group_match_null_string_p */
1854
1855 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
1856    It expects P to be the first byte of a single alternative and END one
1857    byte past the last. The alternative can contain groups.  */
1858
1859 sal_Bool
1860 Regexpr::alt_match_null_string_p(sal_Unicode *p, sal_Unicode *end, register_info_type *reg_info)
1861 {
1862   sal_Int32 mcnt;
1863   sal_Unicode *p1 = p;
1864
1865   while (p1 < end) {
1866     /* Skip over opcodes that can match nothing, and break when we get
1867        to one that can't.  */
1868
1869     switch ((re_opcode_t) *p1) {
1870       /* It's a loop.  */
1871     case on_failure_jump:
1872       p1++;
1873       extract_number_and_incr(mcnt, p1);
1874       p1 += mcnt;
1875       break;
1876
1877     default:
1878       if (!common_op_match_null_string_p(&p1, end, reg_info))
1879     return false;
1880     }
1881   }  /* while p1 < end */
1882
1883   return true;
1884 } /* alt_match_null_string_p */
1885
1886
1887 /* Deals with the ops common to group_match_null_string_p and
1888    alt_match_null_string_p.
1889
1890    Sets P to one after the op and its arguments, if any.  */
1891
1892 sal_Bool
1893 Regexpr::common_op_match_null_string_p(sal_Unicode **p, sal_Unicode *end, register_info_type *reg_info)
1894 {
1895   sal_Int32 mcnt;
1896   sal_Bool ret;
1897   sal_Int32 reg_no;
1898   sal_Unicode *p1 = *p;
1899
1900   switch ((re_opcode_t) *p1++) {
1901   case no_op:
1902   case begline:
1903   case endline:
1904   case begbuf:
1905   case endbuf:
1906     break;
1907
1908   case start_memory:
1909     reg_no = *p1;
1910     assert (reg_no > 0 && reg_no <= MAX_REGNUM);
1911     ret = group_match_null_string_p(&p1, end, reg_info);
1912     /* Have to set this here in case we're checking a group which
1913        contains a group and a back reference to it.  */
1914
1915     if (REG_MATCH_NULL_STRING_P(reg_info[reg_no]) == MATCH_NULL_UNSET_VALUE)
1916       REG_MATCH_NULL_STRING_P(reg_info[reg_no]) = ret;
1917
1918     if (!ret)
1919       return false;
1920     break;
1921
1922     /* If this is an optimized succeed_n for zero times, make the jump.  */
1923   case jump:
1924     extract_number_and_incr(mcnt, p1);
1925     if (mcnt >= 0)
1926       p1 += mcnt;
1927     else
1928       return false;
1929     break;
1930
1931   case succeed_n:
1932     /* Get to the number of times to succeed.  */
1933     p1 += 2;
1934     extract_number_and_incr(mcnt, p1);
1935
1936     if (mcnt == 0)
1937       {
1938     p1 -= 4;
1939     extract_number_and_incr(mcnt, p1);
1940     p1 += mcnt;
1941       }
1942     else
1943       return false;
1944     break;
1945
1946   case duplicate:
1947     if (!REG_MATCH_NULL_STRING_P(reg_info[*p1]))
1948       return false;
1949     break;
1950
1951   case set_number_at:
1952     p1 += 4;
1953
1954   default:
1955     /* All other opcodes mean we cannot match the empty string.  */
1956     return false;
1957   }
1958
1959   *p = p1;
1960   return true;
1961 } /* common_op_match_null_string_p */
1962
1963
1964
/* Free everything we malloc.

   FREE_VAR (var) releases VAR through REGEX_FREE when it is non-null
   and then resets it to NULL.  It is wrapped in do { } while (0) so
   that it expands to exactly one statement and keeps its meaning under
   an unbraced if/else.  VAR is evaluated more than once, so pass a
   plain lvalue without side effects.

   FREE_VARIABLES () releases the failure stack and all register
   arrays; the names are resolved where the macro is expanded.  Without
   MATCH_MAY_ALLOCATE it expands to a no-op expression.  */
#ifdef MATCH_MAY_ALLOCATE
# define FREE_VAR(var)                                                  \
  do {                                                                  \
    if (var)                                                            \
      REGEX_FREE (var);                                                 \
    (var) = NULL;                                                       \
  } while (0)
# define FREE_VARIABLES()                                               \
  do {                                                                  \
    REGEX_FREE_STACK (fail_stack.stack);                                \
    FREE_VAR (regstart);                                                \
    FREE_VAR (regend);                                                  \
    FREE_VAR (old_regstart);                                            \
    FREE_VAR (old_regend);                                              \
    FREE_VAR (best_regstart);                                           \
    FREE_VAR (best_regend);                                             \
    FREE_VAR (reg_info);                                                \
    FREE_VAR (reg_dummy);                                               \
    FREE_VAR (reg_info_dummy);                                          \
  } while (0)
#else
# define FREE_VARIABLES() ((void)0) /* Do nothing!  But inhibit gcc warning. */
#endif /* MATCH_MAY_ALLOCATE */
1984
1985 /* This is a separate function so that we can force an alloca cleanup
1986    afterwards.  */
1987 sal_Int32
1988 Regexpr::re_match2(struct re_registers *regs, sal_Int32 pos, sal_Int32 range)
1989 {
1990   /* General temporaries.  */
1991   sal_Int32 mcnt;
1992   sal_Unicode *p1;
1993
1994   /* Just past the end of the corresponding string.  */
1995   sal_Unicode *end2;
1996
1997   /* Pointers into string2, just past the last characters in
1998        each to consider matching.  */
1999   sal_Unicode *end_match_2;
2000
2001   /* Where we are in the data, and the end of the current string.  */
2002   const sal_Unicode *d, *dend;
2003
2004   /* Where we are in the compiled pattern, and the end of the compiled
2005        pattern.  */
2006   sal_Unicode *p = bufp->buffer;
2007   register sal_Unicode *pend = p + bufp->used;
2008
2009     /* Mark the opcode just after a start_memory, so we can test for an
2010        empty subpattern when we get to the stop_memory.  */
2011   sal_Unicode *just_past_start_mem = 0;
2012
2013   /* Failure point stack.  Each place that can handle a failure further
2014      down the line pushes a failure point on this stack.  It consists of
2015      restart, regend, and reg_info for all registers corresponding to
2016      the subexpressions we're currently inside, plus the number of such
2017      registers, and, finally, two sal_Unicode *'s.  The first
2018      sal_Unicode * is where to resume scanning the pattern; the second
2019      one is where to resume scanning the strings.  If the latter is
2020      zero, the failure point is a ``dummy''; if a failure happens and
2021      the failure point is a dummy, it gets discarded and the next next
2022      one is tried.  */
2023 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
2024   fail_stack_type fail_stack;
2025 #endif
2026
2027   /* We fill all the registers internally, independent of what we
2028      return, for use in backreferences.  The number here includes
2029      an element for register zero.  */
2030   size_t num_regs = bufp->re_nsub + 1;
2031
2032   /* The currently active registers.  */
2033   sal_uInt32 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2034   sal_uInt32 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2035
2036   /* Information on the contents of registers. These are pointers into
2037      the input strings; they record just what was matched (on this
2038      attempt) by a subexpression part of the pattern, that is, the
2039      regnum-th regstart pointer points to where in the pattern we began
2040      matching and the regnum-th regend points to right after where we
2041      stopped matching the regnum-th subexpression.  (The zeroth register
2042      keeps track of what the whole pattern matches.)  */
2043 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2044   const sal_Unicode **regstart, **regend;
2045 #endif
2046
2047   /* If a group that's operated upon by a repetition operator fails to
2048      match anything, then the register for its start will need to be
2049      restored because it will have been set to wherever in the string we
2050      are when we last see its open-group operator.  Similarly for a
2051      register's end.  */
2052 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2053   const sal_Unicode **old_regstart, **old_regend;
2054 #endif
2055
2056   /* The is_active field of reg_info helps us keep track of which (possibly
2057      nested) subexpressions we are currently in. The matched_something
2058      field of reg_info[reg_num] helps us tell whether or not we have
2059      matched any of the pattern so far this time through the reg_num-th
2060      subexpression.  These two fields get reset each time through any
2061      loop their register is in.  */
2062 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global.  */
2063   register_info_type *reg_info;
2064 #endif
2065
2066   /* The following record the register info as found in the above
2067      variables when we find a match better than any we've seen before.
2068      This happens as we backtrack through the failure points, which in
2069      turn happens only if we have not yet matched the entire string. */
2070   //unsigned best_regs_set = false;
2071 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2072   const sal_Unicode **best_regstart, **best_regend;
2073 #endif
2074
2075   /* Logically, this is `best_regend[0]'.  But we don't want to have to
2076      allocate space for that if we're not allocating space for anything
2077      else (see below).  Also, we never need info about register 0 for
2078      any of the other register vectors, and it seems rather a kludge to
2079      treat `best_regend' differently than the rest.  So we keep track of
2080      the end of the best match so far in a separate variable.  We
2081      initialize this to NULL so that when we backtrack the first time
2082      and need to test it, it's not garbage.  */
2083   //const sal_Unicode *match_end = NULL;
2084
2085   /* This helps SET_REGS_MATCHED avoid doing redundant work.  */
2086   sal_Int32 set_regs_matched_done = 0;
2087
2088   /* Used when we pop values we don't care about.  */
2089 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global.  */
2090   const sal_Unicode **reg_dummy;
2091   register_info_type *reg_info_dummy;
2092 #endif
2093
2094   INIT_FAIL_STACK();
2095
2096 #ifdef MATCH_MAY_ALLOCATE
2097   /* Do not bother to initialize all the register variables if there are
2098      no groups in the pattern, as it takes a fair amount of time.  If
2099      there are groups, we include space for register 0 (the whole
2100      pattern), even though we never use it, since it simplifies the
2101      array indexing.  We should fix this.  */
2102   if (bufp->re_nsub)
2103     {
2104       regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2105       regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2106       old_regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2107       old_regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2108       best_regstart = REGEX_TALLOC (num_regs, const sal_Unicode *);
2109       best_regend = REGEX_TALLOC (num_regs, const sal_Unicode *);
2110       reg_info = REGEX_TALLOC (num_regs, register_info_type);
2111       reg_dummy = REGEX_TALLOC (num_regs, const sal_Unicode *);
2112       reg_info_dummy = REGEX_TALLOC (num_regs, register_info_type);
2113
2114       if (!(regstart && regend && old_regstart && old_regend && reg_info
2115             && best_regstart && best_regend && reg_dummy && reg_info_dummy))
2116         {
2117           FREE_VARIABLES ();
2118           return -2;
2119         }
2120     }
2121   else
2122     {
2123       /* We must initialize all our variables to NULL, so that
2124          `FREE_VARIABLES' doesn't try to free them.  */
2125       regstart = regend = old_regstart = old_regend = best_regstart
2126         = best_regend = reg_dummy = NULL;
2127       reg_info = reg_info_dummy = (register_info_type *) NULL;
2128     }
2129 #endif /* MATCH_MAY_ALLOCATE */
2130
2131   sal_Unicode *string2 = (sal_Unicode *)line;
2132   sal_Int32 size2 = linelen;
2133   sal_Int32 stop = range;
2134
2135   /* The starting position is bogus.  */
2136   if (pos < 0 || pos >= size2 || linelen <= 0 ) {
2137       FREE_VARIABLES ();
2138       return(-1);
2139   }
2140
2141   /* Initialize subexpression text positions to -1 to mark ones that no
2142      start_memory/stop_memory has been seen for. Also initialize the
2143      register information struct.  */
2144   for (mcnt = 1; (unsigned) mcnt < num_regs; mcnt++) {
2145     regstart[mcnt] = regend[mcnt]
2146       = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
2147
2148     REG_MATCH_NULL_STRING_P (reg_info[mcnt]) = MATCH_NULL_UNSET_VALUE;
2149     IS_ACTIVE (reg_info[mcnt]) = 0;
2150     MATCHED_SOMETHING (reg_info[mcnt]) = 0;
2151     EVER_MATCHED_SOMETHING (reg_info[mcnt]) = 0;
2152   }
2153
2154   end2 = (sal_Unicode *)(string2 + size2);
2155
2156   end_match_2 = (sal_Unicode *)(string2 + stop);
2157
2158   /* `p' scans through the pattern as `d' scans through the data.
2159      `dend' is the end of the input string that `d' points within.  `d'
2160      is advanced into the following input string whenever necessary, but
2161      this happens before fetching; therefore, at the beginning of the
2162      loop, `d' can be pointing at the end of a string, but it cannot
2163      equal `string2'.  */
2164   d = string2 + pos;
2165   dend = end_match_2;
2166
2167     /* This loops over pattern commands.  It exits by returning from the
2168        function if the match is complete, or it drops through if the match
2169        fails at this starting point in the input data.  */
2170   for (;;) {
2171     if (p == pend) {
2172       /* End of pattern means we might have succeeded.  */
2173
2174       /* If we haven't matched the entire string, and we want the
2175      longest match, try backtracking.  */
2176       if (d != end_match_2) {
2177     if (!FAIL_STACK_EMPTY()) {
2178       goto fail;
2179     }
2180       } /* d != end_match_2 */
2181
2182     succeed_label:
2183
2184       /* If caller wants register contents data back, do it.  */
2185       if (regs) {
2186     /* Have the register data arrays been allocated?  */
2187     if (regs->num_regs == 0) {
2188       /* No.  So allocate them with malloc.  We need one
2189          extra element beyond `num_regs' for the `-1' marker
2190          GNU code uses.  */
2191       regs->num_of_match = 0;
2192       regs->num_regs = MAX(RE_NREGS, num_regs + 1);
2193       regs->start = (sal_Int32 *) malloc(regs->num_regs * sizeof(sal_Int32));
2194       regs->end = (sal_Int32 *) malloc(regs->num_regs * sizeof(sal_Int32));
2195       if (regs->start == NULL || regs->end == NULL) {
2196         FREE_VARIABLES ();
2197         return(-2);
2198       }
2199     } else if ( regs->num_regs > 0 ) {
2200       /* Yes.  If we need more elements than were already
2201          allocated, reallocate them.  If we need fewer, just
2202          leave it alone.  */
2203       if (regs->num_regs < num_regs + 1) {
2204         regs->num_regs = num_regs + 1;
2205         regs->start = (sal_Int32 *) realloc(regs->start, regs->num_regs * sizeof(sal_Int32));
2206         regs->end = (sal_Int32 *) realloc(regs->end, regs->num_regs * sizeof(sal_Int32));
2207         if (regs->start == NULL || regs->end == NULL) {
2208           FREE_VARIABLES ();
2209           return(-2);
2210         }
2211       }
2212     } else {    // num_regs is negative
2213       FREE_VARIABLES ();
2214       return(-2);
2215     }
2216
2217     /* Convert the pointer data in `regstart' and `regend' to
2218        indices.  Register zero has to be set differently,
2219        since we haven't kept track of any info for it.  */
2220     if (regs->num_regs > 0) {
2221       // Make sure a valid location
2222       sal_Int32 dpos = d - string2;
2223       if (pos == dpos || (d - 1) >= dend ) {
2224         FREE_VARIABLES ();
2225         return(-1);
2226       }
2227       regs->start[regs->num_of_match] = pos;
2228       regs->end[regs->num_of_match] = ((sal_Int32) (d - string2));
2229       regs->num_of_match++;
2230     }
2231
2232     /* Go through the first `min (num_regs, regs->num_regs)'
2233        registers, since that is all we initialized.  */
2234         for (mcnt = regs->num_of_match; (unsigned) mcnt < MIN(num_regs, regs->num_regs);
2235          mcnt++) {
2236       regs->start[mcnt] = regs->end[mcnt] = -1;
2237       if( !(REG_UNSET(regstart[mcnt]) || REG_UNSET(regend[mcnt])) ) {
2238         regs->start[regs->num_of_match] = (sal_Int32) POINTER_TO_OFFSET(regstart[mcnt]);
2239         regs->end[regs->num_of_match] = (sal_Int32) POINTER_TO_OFFSET(regend[mcnt]);
2240             regs->num_of_match++;
2241       }
2242     }
2243
2244     /* If the regs structure we return has more elements than
2245        were in the pattern, set the extra elements to -1.  If
2246        we (re)allocated the registers, this is the case,
2247        because we always allocate enough to have at least one
2248        -1 at the end.  */
2249     for (mcnt = regs->num_of_match; (unsigned) mcnt < regs->num_regs; mcnt++)
2250       regs->start[mcnt] = regs->end[mcnt] = -1;
2251       } /* regs */
2252
2253       mcnt = d - pos - string2;
2254
2255       FREE_VARIABLES ();
2256       return(0);
2257     }
2258     /* Otherwise match next pattern command.  */
2259     switch ((re_opcode_t) *p++) {
2260       /* Ignore these.  Used to ignore the n of succeed_n's which
2261      currently have n == 0.  */
2262     case no_op:
2263       break;
2264
2265     case succeed:
2266       goto succeed_label;
2267
2268       /* Match the next n pattern characters exactly.  The following
2269      byte in the pattern defines n, and the n bytes after that
2270      are the characters to match.  */
2271     case exactn:
2272       mcnt = *p++;
2273
2274       do {
2275     PREFETCH();
2276     if ((sal_Unicode)*d++ != (sal_Unicode) *p++) goto fail;
2277       } while (--mcnt);
2278       SET_REGS_MATCHED();
2279       break;
2280
2281       /* Match any character except possibly a newline or a null.  */
2282     case anychar:
2283
2284       PREFETCH();
2285       if ( *d == (sal_Unicode)'\n' ||
2286        *d == (sal_Unicode)'\000' )
2287     goto fail;
2288
2289       SET_REGS_MATCHED();
2290       d++;
2291       break;
2292
2293     case charset:
2294     case charset_not: {
2295       register sal_Unicode c;
2296       sal_Bool knot = (re_opcode_t) *(p - 1) == charset_not;
2297
2298       PREFETCH();
2299       c = *d; /* The character to match.  */
2300       /* Cast to `sal_uInt32' instead of `sal_Unicode' in case the
2301      bit list is a full 32 bytes long.  */
2302       if ((c < (sal_uInt32) (*p * BYTEWIDTH)) && (p[1 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH))))
2303     knot = !knot;
2304
2305       p += 1 + *p;
2306
2307       if (!knot) {
2308     goto fail;
2309       }
2310
2311       SET_REGS_MATCHED();
2312       d++;
2313       break;
2314     }
2315
2316     /* The beginning of a group is represented by start_memory.
2317        The arguments are the register number in the next byte, and the
2318        number of groups inner to this one in the next.  The text
2319        matched within the group is recorded (in the internal
2320        registers data structure) under the register number.  */
2321     case start_memory:
2322
2323       /* Find out if this group can match the empty string.  */
2324       p1 = p;           /* To send to group_match_null_string_p.  */
2325
2326       if (REG_MATCH_NULL_STRING_P(reg_info[*p]) == MATCH_NULL_UNSET_VALUE)
2327     REG_MATCH_NULL_STRING_P(reg_info[*p]) = group_match_null_string_p(&p1, pend, reg_info);
2328
2329       /* Save the position in the string where we were the last time
2330      we were at this open-group operator in case the group is
2331      operated upon by a repetition operator, e.g., with `(a*)*b'
2332      against `ab'; then we want to ignore where we are now in
2333      the string in case this attempt to match fails.  */
2334       old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
2335     ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
2336     : regstart[*p];
2337
2338       regstart[*p] = d;
2339
2340       IS_ACTIVE (reg_info[*p]) = 1;
2341       MATCHED_SOMETHING(reg_info[*p]) = 0;
2342
2343       /* Clear this whenever we change the register activity status.  */
2344       set_regs_matched_done = 0;
2345
2346       /* This is the new highest active register.  */
2347       highest_active_reg = *p;
2348
2349       /* If nothing was active before, this is the new lowest active
2350      register.  */
2351       if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
2352     lowest_active_reg = *p;
2353
2354       /* Move past the register number and inner group count.  */
2355       p += 2;
2356       just_past_start_mem = p;
2357
2358       break;
2359
2360       /* The stop_memory opcode represents the end of a group.  Its
2361      arguments are the same as start_memory's: the register
2362      number, and the number of inner groups.  */
2363     case stop_memory:
2364
2365       /* We need to save the string position the last time we were at
2366      this close-group operator in case the group is operated
2367      upon by a repetition operator, e.g., with `((a*)*(b*)*)*'
2368      against `aba'; then we want to ignore where we are now in
2369      the string in case this attempt to match fails.  */
2370       old_regend[*p] = REG_MATCH_NULL_STRING_P (reg_info[*p])
2371     ? REG_UNSET(regend[*p]) ? d : regend[*p]
2372     : regend[*p];
2373
2374       regend[*p] = d;
2375
2376       /* This register isn't active anymore.  */
2377       IS_ACTIVE(reg_info[*p]) = 0;
2378
2379       /* Clear this whenever we change the register activity status.  */
2380       set_regs_matched_done = 0;
2381
2382       /* If this was the only register active, nothing is active
2383      anymore.  */
2384       if (lowest_active_reg == highest_active_reg) {
2385     lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2386     highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2387       } else { /* We must scan for the new highest active register, since
2388           it isn't necessarily one less than now: consider
2389           (a(b)c(d(e)f)g).  When group 3 ends, after the f), the
2390           new highest active register is 1.  */
2391     sal_Unicode r = *p - 1;
2392     while (r > 0 && !IS_ACTIVE (reg_info[r]))
2393       r--;
2394
2395     /* If we end up at register zero, that means that we saved
2396        the registers as the result of an `on_failure_jump', not
2397        a `start_memory', and we jumped to past the innermost
2398        `stop_memory'.  For example, in ((.)*) we save
2399        registers 1 and 2 as a result of the *, but when we pop
2400        back to the second ), we are at the stop_memory 1.
2401        Thus, nothing is active.  */
2402     if (r == 0) {
2403       lowest_active_reg = NO_LOWEST_ACTIVE_REG;
2404       highest_active_reg = NO_HIGHEST_ACTIVE_REG;
2405     } else
2406       highest_active_reg = r;
2407       }
2408
2409       /* If just failed to match something this time around with a
2410      group that's operated on by a repetition operator, try to
2411      force exit from the ``loop'', and restore the register
2412      information for this group that we had before trying this
2413      last match.  */
2414       if ((!MATCHED_SOMETHING (reg_info[*p])
2415        || just_past_start_mem == p - 1)
2416       && (p + 2) < pend) {
2417     sal_Bool is_a_jump_n = false;
2418
2419     p1 = p + 2;
2420     mcnt = 0;
2421     switch ((re_opcode_t) *p1++) {
2422     case jump_n:
2423       is_a_jump_n = true;
2424     case pop_failure_jump:
2425     case maybe_pop_jump:
2426     case jump:
2427     case dummy_failure_jump:
2428       extract_number_and_incr(mcnt, p1);
2429       if (is_a_jump_n)
2430         p1 += 2;
2431       break;
2432
2433     default:
2434       /* do nothing */ ;
2435     }
2436     p1 += mcnt;
2437
2438     /* If the next operation is a jump backwards in the pattern
2439        to an on_failure_jump right before the start_memory
2440        corresponding to this stop_memory, exit from the loop
2441        by forcing a failure after pushing on the stack the
2442        on_failure_jump's jump in the pattern, and d.  */
2443     if (mcnt < 0 && (re_opcode_t) *p1 == on_failure_jump
2444         && (re_opcode_t) p1[3] == start_memory && p1[4] == *p) {
2445       /* If this group ever matched anything, then restore
2446          what its registers were before trying this last
2447          failed match, e.g., with `(a*)*b' against `ab' for
2448          regstart[1], and, e.g., with `((a*)*(b*)*)*'
2449          against `aba' for regend[3].
2450
2451          Also restore the registers for inner groups for,
2452          e.g., `((a*)(b*))*' against `aba' (register 3 would
2453          otherwise get trashed).  */
2454
2455       if (EVER_MATCHED_SOMETHING (reg_info[*p])) {
2456         unsigned r;
2457
2458         EVER_MATCHED_SOMETHING (reg_info[*p]) = 0;
2459
2460         /* Restore this and inner groups' (if any) registers.  */
2461         for (r = *p; r < (unsigned) *p + (unsigned) *(p + 1);
2462          r++) {
2463           regstart[r] = old_regstart[r];
2464
2465                 /* xx why this test?  */
2466           if (old_regend[r] >= regstart[r])
2467         regend[r] = old_regend[r];
2468         }
2469       }
2470       p1++;
2471       extract_number_and_incr(mcnt, p1);
2472       PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
2473
2474       goto fail;
2475     }
2476       }
2477
2478       /* Move past the register number and the inner group count.  */
2479       p += 2;
2480       break;
2481
2482
2483       /* \<digit> has been turned into a `duplicate' command which is
2484      followed by the numeric value of <digit> as the register number.  */
2485     case duplicate:
2486       {
2487     register const sal_Unicode *d2, *dend2;
2488     sal_Unicode regno = *p++;   /* Get which register to match against.  */
2489
2490     /* Can't back reference a group which we've never matched.  */
2491     if (REG_UNSET(regstart[regno]) || REG_UNSET(regend[regno])) {
2492       goto fail;
2493     }
2494
2495     /* Where in input to try to start matching.  */
2496     d2 = regstart[regno];
2497
2498     /* Where to stop matching; if both the place to start and
2499        the place to stop matching are in the same string, then
2500        set to the place to stop, otherwise, for now have to use
2501        the end of the first string.  */
2502
2503     dend2 = regend[regno];
2504     for (;;) {
2505       /* If necessary, advance to next segment in register
2506          contents.  */
2507       while (d2 == dend2) {
2508         if (dend2 == end_match_2) break;
2509         if (dend2 == regend[regno]) break;
2510       }
2511       /* At end of register contents => success */
2512       if (d2 == dend2) break;
2513
2514       PREFETCH();
2515
2516       /* How many characters left in this segment to match.  */
2517       mcnt = dend - d;
2518
2519       /* Want how many consecutive characters we can match in
2520          one shot, so, if necessary, adjust the count.  */
2521       if (mcnt > dend2 - d2)
2522         mcnt = dend2 - d2;
2523
2524       /* Compare that many; failure if mismatch, else move
2525          past them.  */
2526       if (translate
2527           ? bcmp_translate(d, d2, mcnt)
2528           : memcmp(d, d2, mcnt * sizeof(sal_Unicode))) {
2529         goto fail;
2530       }
2531       d += mcnt, d2 += mcnt;
2532       /* Do this because we've matched some characters.  */
2533       SET_REGS_MATCHED();
2534     }
2535       }
2536       break;
2537
2538       /* begline matches the empty string at the beginning of the string
2539      (unless `not_bol' is set in `bufp'), and, if
2540      `newline_anchor' is set, after newlines.  */
2541     case begline:
2542
2543       if (AT_STRINGS_BEG (d)) {
2544     if (!bufp->not_bol) break;
2545       } else if (d[-1] == '\n' && bufp->newline_anchor) {
2546     break;
2547       }
2548       /* In all other cases, we fail.  */
2549       goto fail;
2550
2551       /* endline is the dual of begline.  */
2552     case endline:
2553
2554       if (AT_STRINGS_END(d))    {
2555     if (!bufp->not_eol) break;
2556       } else if (*d == '\n' && bufp->newline_anchor) {
2557     break;
2558       }
2559       goto fail;
2560
2561       /* Match at the very beginning of the data.  */
2562     case begbuf:
2563       if (AT_STRINGS_BEG (d))
2564     break;
2565       goto fail;
2566
2567
2568       /* Match at the very end of the data.  */
2569     case endbuf:
2570       if (AT_STRINGS_END (d))
2571     break;
2572       goto fail;
2573
2574
2575       /* on_failure_keep_string_jump is used to optimize `.*\n'.  It
2576      pushes NULL as the value for the string on the stack.  Then
2577      `pop_failure_point' will keep the current value for the
2578      string, instead of restoring it.  To see why, consider
2579      matching `foo\nbar' against `.*\n'.  The .* matches the foo;
2580      then the . fails against the \n.  But the next thing we want
2581      to do is match the \n against the \n; if we restored the
2582      string value, we would be back at the foo.
2583
2584      Because this is used only in specific cases, we don't need to
2585      check all the things that `on_failure_jump' does, to make
2586      sure the right things get saved on the stack.  Hence we don't
2587      share its code.  The only reason to push anything on the
2588      stack at all is that otherwise we would have to change
2589      `anychar's code to do something besides goto fail in this
2590      case; that seems worse than this.  */
2591     case on_failure_keep_string_jump:
2592
2593       extract_number_and_incr(mcnt, p);
2594
2595       PUSH_FAILURE_POINT(p + mcnt, NULL, -2);
2596       break;
2597
2598
2599       /* Uses of on_failure_jump:
2600
2601      Each alternative starts with an on_failure_jump that points
2602      to the beginning of the next alternative.  Each alternative
2603      except the last ends with a jump that in effect jumps past
2604      the rest of the alternatives.  (They really jump to the
2605      ending jump of the following alternative, because tensioning
2606      these jumps is a hassle.)
2607
2608      Repeats start with an on_failure_jump that points past both
2609      the repetition text and either the following jump or
2610      pop_failure_jump back to this on_failure_jump.  */
2611     case on_failure_jump:
2612     on_failure:
2613
2614     extract_number_and_incr(mcnt, p);
2615
2616     /* If this on_failure_jump comes right before a group (i.e.,
2617        the original * applied to a group), save the information
2618        for that group and all inner ones, so that if we fail back
2619        to this point, the group's information will be correct.
2620        For example, in \(a*\)*\1, we need the preceding group,
2621        and in \(zz\(a*\)b*\)\2, we need the inner group.  */
2622
2623     /* We can't use `p' to check ahead because we push
2624        a failure point to `p + mcnt' after we do this.  */
2625     p1 = p;
2626
2627     /* We need to skip no_op's before we look for the
2628        start_memory in case this on_failure_jump is happening as
2629        the result of a completed succeed_n, as in \(a\)\{1,3\}b\1
2630        against aba.  */
2631     while (p1 < pend && (re_opcode_t) *p1 == no_op)
2632       p1++;
2633
2634     if (p1 < pend && (re_opcode_t) *p1 == start_memory) {
2635       /* We have a new highest active register now.  This will
2636      get reset at the start_memory we are about to get to,
2637      but we will have saved all the registers relevant to
2638      this repetition op, as described above.  */
2639       highest_active_reg = *(p1 + 1) + *(p1 + 2);
2640       if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
2641     lowest_active_reg = *(p1 + 1);
2642     }
2643
2644     PUSH_FAILURE_POINT(p + mcnt, d, -2);
2645     break;
2646
2647     /* A smart repeat ends with `maybe_pop_jump'.
2648        We change it to either `pop_failure_jump' or `jump'.  */
2649     case maybe_pop_jump:
2650       extract_number_and_incr(mcnt, p);
2651       {
2652     register sal_Unicode *p2 = p;
2653
2654     /* Compare the beginning of the repeat with what in the
2655        pattern follows its end. If we can establish that there
2656        is nothing that they would both match, i.e., that we
2657        would have to backtrack because of (as in, e.g., `a*a')
2658        then we can change to pop_failure_jump, because we'll
2659        never have to backtrack.
2660
2661        This is not true in the case of alternatives: in
2662        `(a|ab)*' we do need to backtrack to the `ab' alternative
2663        (e.g., if the string was `ab').  But instead of trying to
2664        detect that here, the alternative has put on a dummy
2665        failure point which is what we will end up popping.  */
2666
2667     /* Skip over open/close-group commands.
2668        If what follows this loop is a ...+ construct,
2669        look at what begins its body, since we will have to
2670        match at least one of that.  */
2671     while (1) {
2672       if (p2 + 2 < pend
2673           && ((re_opcode_t) *p2 == stop_memory
2674           || (re_opcode_t) *p2 == start_memory))
2675         p2 += 3;
2676       else if (p2 + 6 < pend
2677            && (re_opcode_t) *p2 == dummy_failure_jump)
2678         p2 += 6;
2679       else
2680         break;
2681     }
2682
2683     p1 = p + mcnt;
2684     /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
2685        to the `maybe_pop_jump' of this case.  Examine what
2686        follows.  */
2687
2688     /* If we're at the end of the pattern, we can change.  */
2689     if (p2 == pend) {
2690                 /* Consider what happens when matching ":\(.*\)"
2691                    against ":/".  I don't really understand this code
2692                    yet.  */
2693       p[-3] = (sal_Unicode) pop_failure_jump;
2694     } else if ((re_opcode_t) *p2 == exactn
2695            || (bufp->newline_anchor && (re_opcode_t) *p2 == endline)) {
2696       register sal_Unicode c = *p2 == (sal_Unicode) endline ? (sal_Unicode)'\n' : p2[2];
2697
2698       if ((re_opcode_t) p1[3] == exactn && p1[5] != c) {
2699         p[-3] = (sal_Unicode) pop_failure_jump;
2700       } else if ((re_opcode_t) p1[3] == charset
2701              || (re_opcode_t) p1[3] == charset_not) {
2702         sal_Int32 knot = (re_opcode_t) p1[3] == charset_not;
2703
2704         if (c < (sal_Unicode) (p1[4] * BYTEWIDTH)
2705         && p1[5 + c / BYTEWIDTH] & (1 << (c % BYTEWIDTH)))
2706           knot = !knot;
2707
2708         /* `knot' is equal to 1 if c would match, which means
2709            that we can't change to pop_failure_jump.  */
2710         if (!knot) {
2711           p[-3] = (unsigned char) pop_failure_jump;
2712         }
2713       }
2714     } else if ((re_opcode_t) *p2 == charset) {
2715                 /* We win if the first character of the loop is not part
2716                    of the charset.  */
2717       if ((re_opcode_t) p1[3] == exactn
2718           && ! ((int) p2[1] * BYTEWIDTH > (int) p1[5]
2719             && (p2[2 + p1[5] / BYTEWIDTH]
2720             & (1 << (p1[5] % BYTEWIDTH))))) {
2721         p[-3] = (sal_Unicode) pop_failure_jump;
2722       } else if ((re_opcode_t) p1[3] == charset_not) {
2723         sal_Int32 idx;
2724         /* We win if the charset_not inside the loop
2725            lists every character listed in the charset after.  */
2726         for (idx = 0; idx < (int) p2[1]; idx++)
2727           if (! (p2[2 + idx] == 0
2728              || (idx < (int) p1[4]
2729              && ((p2[2 + idx] & ~ p1[5 + idx]) == 0))))
2730         break;
2731
2732         if (idx == p2[1]) {
2733           p[-3] = (sal_Unicode) pop_failure_jump;
2734         }
2735       } else if ((re_opcode_t) p1[3] == charset) {
2736         sal_Int32 idx;
2737         /* We win if the charset inside the loop
2738            has no overlap with the one after the loop.  */
2739         for (idx = 0;
2740          idx < (sal_Int32) p2[1] && idx < (sal_Int32) p1[4];
2741          idx++)
2742           if ((p2[2 + idx] & p1[5 + idx]) != 0)
2743         break;
2744
2745         if (idx == p2[1] || idx == p1[4]) {
2746           p[-3] = (sal_Unicode) pop_failure_jump;
2747         }
2748       }
2749     }
2750       }
2751       p -= 2;           /* Point at relative address again.  */
2752       if ((re_opcode_t) p[-1] != pop_failure_jump) {
2753     p[-1] = (sal_Unicode) jump;
2754     goto unconditional_jump;
2755       }
2756       /* Note fall through.  */
2757
2758
2759       /* The end of a simple repeat has a pop_failure_jump back to
2760      its matching on_failure_jump, where the latter will push a
2761      failure point.  The pop_failure_jump takes off failure
2762      points put on by this pop_failure_jump's matching
2763      on_failure_jump; we got through the pattern to here from the
2764      matching on_failure_jump, so didn't fail.  */
2765     case pop_failure_jump:
2766       {
2767     /* We need to pass separate storage for the lowest and
2768        highest registers, even though we don't care about the
2769        actual values.  Otherwise, we will restore only one
2770        register from the stack, since lowest will == highest in
2771        `pop_failure_point'.  */
2772     sal_uInt32 dummy_low_reg, dummy_high_reg;
2773     sal_Unicode *pdummy = NULL;
2774     const sal_Unicode *sdummy = NULL;
2775
2776     POP_FAILURE_POINT(sdummy, pdummy,
2777               dummy_low_reg, dummy_high_reg,
2778               reg_dummy, reg_dummy, reg_info_dummy);
2779       }
2780       /* Note fall through.  */
2781
2782     unconditional_jump:
2783     /* Note fall through.  */
2784
2785     /* Unconditionally jump (without popping any failure points).  */
2786     case jump:
2787       extract_number_and_incr(mcnt, p); /* Get the amount to jump.  */
2788       p += mcnt;                                /* Do the jump.  */
2789       break;
2790
2791       /* We need this opcode so we can detect where alternatives end
2792      in `group_match_null_string_p' et al.  */
2793     case jump_past_alt:
2794       goto unconditional_jump;
2795
2796
2797       /* Normally, the on_failure_jump pushes a failure point, which
2798      then gets popped at pop_failure_jump.  We will end up at
2799      pop_failure_jump, also, and with a pattern of, say, `a+', we
2800      are skipping over the on_failure_jump, so we have to push
2801      something meaningless for pop_failure_jump to pop.  */
2802     case dummy_failure_jump:
2803       /* It doesn't matter what we push for the string here.  What
2804      the code at `fail' tests is the value for the pattern.  */
2805       PUSH_FAILURE_POINT(NULL, NULL, -2);
2806       goto unconditional_jump;
2807
2808
2809       /* At the end of an alternative, we need to push a dummy failure
2810      point in case we are followed by a `pop_failure_jump', because
2811      we don't want the failure point for the alternative to be
2812      popped.  For example, matching `(a|ab)*' against `aab'
2813      requires that we match the `ab' alternative.  */
2814     case push_dummy_failure:
2815       /* See comments just above at `dummy_failure_jump' about the
2816      two zeroes.  */
2817       PUSH_FAILURE_POINT(NULL, NULL, -2);
2818       break;
2819
2820       /* Have to succeed matching what follows at least n times.
2821      After that, handle like `on_failure_jump'.  */
2822     case succeed_n:
2823       extract_number(mcnt, p + 2);
2824
2825       assert (mcnt >= 0);
2826       /* Originally, this is how many times we HAVE to succeed.  */
2827       if (mcnt > 0) {
2828     mcnt--;
2829     p += 2;
2830     store_number_and_incr (p, mcnt);
2831       } else if (mcnt == 0) {
2832     p[2] = (sal_Unicode) no_op;
2833     p[3] = (sal_Unicode) no_op;
2834     goto on_failure;
2835       }
2836       break;
2837
2838     case jump_n:
2839       extract_number(mcnt, p + 2);
2840
2841       /* Originally, this is how many times we CAN jump.  */
2842       if (mcnt) {
2843     mcnt--;
2844     store_number (p + 2, mcnt);
2845     goto unconditional_jump;
2846       }
2847       /* If we don't have to jump any more, skip over the rest of the command.  */
2848       else
2849     p += 4;
2850       break;
2851
2852     case set_number_at:
2853       {
2854
2855     extract_number_and_incr(mcnt, p);
2856     p1 = p + mcnt;
2857     extract_number_and_incr(mcnt, p);
2858     store_number (p1, mcnt);
2859     break;
2860       }
2861
2862     case wordbeg:
2863       if (iswordbegin(d, string2, size2))
2864     break;
2865       goto fail;
2866
2867     case wordend:
2868       if (iswordend(d, string2, size2))
2869     break;
2870       goto fail;
2871
2872
2873     default:
2874       abort();
2875     }
2876     continue;  /* Successfully executed one pattern command; keep going.  */
2877
2878     /* We goto here if a matching operation fails. */
2879   fail:
2880     if (!FAIL_STACK_EMPTY()) {
2881       /* A restart point is known.  Restore to that state.  */
2882       POP_FAILURE_POINT(d, p,
2883             lowest_active_reg, highest_active_reg,
2884             regstart, regend, reg_info);
2885
2886       /* If this failure point is a dummy, try the next one.  */
2887       if (!p)
2888     goto fail;
2889
2890       /* If we failed to the end of the pattern, don't examine *p.  */
2891       assert(p <= pend);
2892       if (p < pend) {
2893     sal_Bool is_a_jump_n = false;
2894
2895     /* If failed to a backwards jump that's part of a repetition
2896        loop, need to pop this failure point and use the next
2897        one.  */
2898     switch ((re_opcode_t) *p) {
2899     case jump_n:
2900       is_a_jump_n = true;
2901     case maybe_pop_jump:
2902     case pop_failure_jump:
2903     case jump:
2904       p1 = p + 1;
2905       extract_number_and_incr(mcnt, p1);
2906       p1 += mcnt;
2907
2908       if ((is_a_jump_n && (re_opcode_t) *p1 == succeed_n)
2909           || (!is_a_jump_n
2910           && (re_opcode_t) *p1 == on_failure_jump)) {
2911         goto fail;
2912       }
2913       break;
2914     default:
2915       /* do nothing */ ;
2916     }
2917       }
2918
2919     } else {
2920       break;   /* Matching at this starting point really fails.  */
2921     }
2922   } /* for (;;) */
2923
2924   FREE_VARIABLES ();
2925
2926   return(-1);                           /* Failure to match.  */
2927 } /* re_match2 */
2928
2929 /* Set the bit for character C in a list.  */
2930 void
2931 Regexpr::set_list_bit(sal_Unicode c, sal_Unicode *b)
2932 {
2933   if ( translate ) {
2934     try {
2935         sal_Unicode tmp = translit->transliterateChar2Char(c);
2936         b[tmp / BYTEWIDTH] |= 1 << (tmp % BYTEWIDTH);
2937     } catch (::com::sun::star::i18n::MultipleCharsOutputException e) {
2938         ::rtl::OUString o2( translit->transliterateChar2String( c));
2939         sal_Int32 len2 = o2.getLength();
2940         const sal_Unicode * k2 = o2.getStr();
2941         for (sal_Int32 nmatch = 0; nmatch < len2; nmatch++) {
2942           b[k2[nmatch] / BYTEWIDTH] |= 1 << (k2[nmatch] % BYTEWIDTH);
2943         }
2944     }
2945   } else {
2946     b[c / BYTEWIDTH] |= 1 << (c % BYTEWIDTH);
2947   }
2948 }
2949
2950 /* vim: set ts=8 sw=2 noexpandtab: */