Meson: Remove hack that got fixed a while ago
[glib.git] / glib / pcre / pcre_dfa_exec.c
blob9cc82323df0ee0b836d32f0eeefd173c56e8f62b
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
9 Written by Philip Hazel
10 Copyright (c) 1997-2012 University of Cambridge
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
47 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48 the performance of his patterns greatly. I could not use it as it stood, as it
49 was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 test 7 to loop, and test 9 to crash with a segfault.
52 The issue is the check for duplicate states, which is done by a simple linear
53 search up the state list. (Grep for "duplicate" below to find the code.) For
54 many patterns, there will never be many states active at one time, so a simple
55 linear search is fine. In patterns that have many active states, it might be a
56 bottleneck. The suggested code used an indexing scheme to remember which states
57 had previously been used for each character, and avoided the linear search when
58 it knew there was no chance of a duplicate. This was implemented when adding
59 states to the state lists.
61 I wrote some thread-safe, not-limited code to try something similar at the time
62 of checking for duplicates (instead of when adding states), using index vectors
63 on the stack. It did give a 13% improvement with one specially constructed
64 pattern for certain subject strings, but on other strings and on many of the
65 simpler patterns in the test suite it did worse. The major problem, I think,
66 was the extra time to initialize the index. This had to be done for each call
67 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68 only once - I suspect this was the cause of the problems with the tests.)
70 Overall, I concluded that the gains in some cases did not outweigh the losses
71 in others, so I abandoned this code. */
75 #include "config.h"
77 #define NLBLOCK md /* Block containing newline information */
78 #define PSSTART start_subject /* Field containing processed string start */
79 #define PSEND end_subject /* Field containing processed string end */
81 #include "pcre_internal.h"
84 /* For use to indent debugging output */
86 #define SP " "
89 /*************************************************
90 * Code parameters and static tables *
91 *************************************************/
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300   /* added when the type argument is \P or \p */
#define OP_EXTUNI_EXTRA     320   /* added when the type argument is \X */
#define OP_ANYNL_EXTRA      340   /* added when the type argument is \R */
#define OP_HSPACE_EXTRA     360   /* added when the type argument is \H or \h */
#define OP_VSPACE_EXTRA     380   /* added when the type argument is \V or \v */
105 /* This table identifies those opcodes that are followed immediately by a
106 character that is to be tested in some way. This makes it possible to
107 centralize the loading of these characters. In the case of Type * etc, the
108 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
109 small value. Non-zero values in the table are the offsets from the opcode where
110 the character is to be found. ***NOTE*** If the start of this table is
111 modified, the three tables that follow must also be modified. */
113 static const pcre_uint8 coptable[] = {
114 0, /* End */
115 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
116 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
117 0, 0, 0, /* Any, AllAny, Anybyte */
118 0, 0, /* \P, \p */
119 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
120 0, /* \X */
121 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
122 1, /* Char */
123 1, /* Chari */
124 1, /* not */
125 1, /* noti */
126 /* Positive single-char repeats */
127 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
128 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
129 1+IMM2_SIZE, /* exact */
130 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
131 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
132 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
133 1+IMM2_SIZE, /* exact I */
134 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
135 /* Negative single-char repeats - only for chars < 256 */
136 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
137 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
138 1+IMM2_SIZE, /* NOT exact */
139 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
140 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
142 1+IMM2_SIZE, /* NOT exact I */
143 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
144 /* Positive type repeats */
145 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
146 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
147 1+IMM2_SIZE, /* Type exact */
148 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
149 /* Character class & ref repeats */
150 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
151 0, 0, /* CRRANGE, CRMINRANGE */
152 0, /* CLASS */
153 0, /* NCLASS */
154 0, /* XCLASS - variable length */
155 0, /* REF */
156 0, /* REFI */
157 0, /* RECURSE */
158 0, /* CALLOUT */
159 0, /* Alt */
160 0, /* Ket */
161 0, /* KetRmax */
162 0, /* KetRmin */
163 0, /* KetRpos */
164 0, /* Reverse */
165 0, /* Assert */
166 0, /* Assert not */
167 0, /* Assert behind */
168 0, /* Assert behind not */
169 0, 0, /* ONCE, ONCE_NC */
170 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
171 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
172 0, 0, /* CREF, NCREF */
173 0, 0, /* RREF, NRREF */
174 0, /* DEF */
175 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
176 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
177 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
178 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
179 0, 0 /* CLOSE, SKIPZERO */
182 /* This table identifies those opcodes that inspect a character. It is used to
183 remember the fact that a character could have been inspected when the end of
184 the subject is reached. ***NOTE*** If the start of this table is modified, the
185 two tables that follow must also be modified. */
187 static const pcre_uint8 poptable[] = {
188 0, /* End */
189 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
190 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
191 1, 1, 1, /* Any, AllAny, Anybyte */
192 1, 1, /* \P, \p */
193 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
194 1, /* \X */
195 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
196 1, /* Char */
197 1, /* Chari */
198 1, /* not */
199 1, /* noti */
200 /* Positive single-char repeats */
201 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
202 1, 1, 1, /* upto, minupto, exact */
203 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
204 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
205 1, 1, 1, /* upto I, minupto I, exact I */
206 1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
207 /* Negative single-char repeats - only for chars < 256 */
208 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
209 1, 1, 1, /* NOT upto, minupto, exact */
210 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
211 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
212 1, 1, 1, /* NOT upto I, minupto I, exact I */
213 1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
214 /* Positive type repeats */
215 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
216 1, 1, 1, /* Type upto, minupto, exact */
217 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
218 /* Character class & ref repeats */
219 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
220 1, 1, /* CRRANGE, CRMINRANGE */
221 1, /* CLASS */
222 1, /* NCLASS */
223 1, /* XCLASS - variable length */
224 0, /* REF */
225 0, /* REFI */
226 0, /* RECURSE */
227 0, /* CALLOUT */
228 0, /* Alt */
229 0, /* Ket */
230 0, /* KetRmax */
231 0, /* KetRmin */
232 0, /* KetRpos */
233 0, /* Reverse */
234 0, /* Assert */
235 0, /* Assert not */
236 0, /* Assert behind */
237 0, /* Assert behind not */
238 0, 0, /* ONCE, ONCE_NC */
239 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
240 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
241 0, 0, /* CREF, NCREF */
242 0, 0, /* RREF, NRREF */
243 0, /* DEF */
244 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
245 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
246 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
247 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
248 0, 0 /* CLOSE, SKIPZERO */
251 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
252 and \w */
254 static const pcre_uint8 toptable1[] = {
255 0, 0, 0, 0, 0, 0,
256 ctype_digit, ctype_digit,
257 ctype_space, ctype_space,
258 ctype_word, ctype_word,
259 0, 0 /* OP_ANY, OP_ALLANY */
262 static const pcre_uint8 toptable2[] = {
263 0, 0, 0, 0, 0, 0,
264 ctype_digit, 0,
265 ctype_space, 0,
266 ctype_word, 0,
267 1, 1 /* OP_ANY, OP_ALLANY */
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock; used to convert a workspace size
given in ints into a number of state slots. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
#ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each item of the string
whose value is below 256 and which isprint() reports as printable is written
as itself; every other item is written as a \x escape with at least two
hexadecimal digits.

Arguments:
  p            points to string
  length       number of pcre_uchar items to print
  f            where to print

Returns:       nothing
*/

static void
pchars(const pcre_uchar *p, int length, FILE *f)
{
while (length-- > 0)
  {
  unsigned int c = *p++;

  /* isprint() is only defined for arguments representable as unsigned char
  (or EOF), so larger values must never reach it. */

  if (c < 256 && isprint((int)c))
    fprintf(f, "%c", (int)c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
#endif
316 /*************************************************
317 * Execute a Regular Expression - DFA engine *
318 *************************************************/
320 /* This internal function applies a compiled pattern to a subject string,
321 starting at a given point, using a DFA engine. This function is called from the
322 external one, possibly multiple times if the pattern is not anchored. The
323 function calls itself recursively for some kinds of subpattern.
325 Arguments:
326 md the match_data block with fixed information
327 this_start_code the opening bracket of this subexpression's code
328 current_subject where we currently are in the subject string
329 start_offset start offset in the subject string
330 offsets vector to contain the matching string offsets
331 offsetcount size of same
332 workspace vector of workspace
333 wscount size of same
334 rlevel function call recursion level
336 Returns: > 0 => number of match offset pairs placed in offsets
337 = 0 => offsets overflowed; longest matches are present
338 -1 => failed to match
339 < -1 => some kind of unexpected problem
341 The following macros are used for adding states to the two state vectors (one
342 for the current character, one for the following character). */
/* Each macro below appends one state to a state list, using variables that
must be in scope at the point of use: the list counter (active_count or
new_count), the next-free pointer (next_active_state or next_new_state) and
wscount, the capacity of each list in states. The counter is incremented
before the capacity test; when the list is already full, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE.

The expansion is a complete if/else whose final "return" has no semicolon, so
every use must be written as a statement followed by a semicolon. */

/* Add a state with opcode offset x and repeat count y to the active list. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_ACTIVE, additionally storing z in the state's data field. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* Add a state with opcode offset x and repeat count y to the new list, which
holds the states for the following character. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_NEW, additionally storing z in the state's data field. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
387 /* And now, here is the code */
389 static int
390 internal_dfa_exec(
391 dfa_match_data *md,
392 const pcre_uchar *this_start_code,
393 const pcre_uchar *current_subject,
394 int start_offset,
395 int *offsets,
396 int offsetcount,
397 int *workspace,
398 int wscount,
399 int rlevel)
401 stateblock *active_states, *new_states, *temp_states;
402 stateblock *next_active_state, *next_new_state;
404 const pcre_uint8 *ctypes, *lcc, *fcc;
405 const pcre_uchar *ptr;
406 const pcre_uchar *end_code, *first_op;
408 dfa_recursion_info new_recursive;
410 int active_count, new_count, match_count;
412 /* Some fields in the md block are frequently referenced, so we load them into
413 independent variables in the hope that this will perform better. */
415 const pcre_uchar *start_subject = md->start_subject;
416 const pcre_uchar *end_subject = md->end_subject;
417 const pcre_uchar *start_code = md->start_code;
419 #ifdef SUPPORT_UTF
420 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
421 #else
422 BOOL utf = FALSE;
423 #endif
425 BOOL reset_could_continue = FALSE;
427 rlevel++;
428 offsetcount &= (-2);
430 wscount -= 2;
431 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
432 (2 * INTS_PER_STATEBLOCK);
434 DPRINTF(("\n%.*s---------------------\n"
435 "%.*sCall to internal_dfa_exec f=%d\n",
436 rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
438 ctypes = md->tables + ctypes_offset;
439 lcc = md->tables + lcc_offset;
440 fcc = md->tables + fcc_offset;
442 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
444 active_states = (stateblock *)(workspace + 2);
445 next_new_state = new_states = active_states + wscount;
446 new_count = 0;
448 first_op = this_start_code + 1 + LINK_SIZE +
449 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
450 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
451 ? IMM2_SIZE:0);
453 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
454 the alternative states onto the list, and find out where the end is. This
455 makes it possible to use this function recursively, when we want to stop at a
456 matching internal ket rather than at the end.
458 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
459 a backward assertion. In that case, we have to find out the maximum amount to
460 move back, and set up each alternative appropriately. */
462 if (*first_op == OP_REVERSE)
464 int max_back = 0;
465 int gone_back;
467 end_code = this_start_code;
470 int back = GET(end_code, 2+LINK_SIZE);
471 if (back > max_back) max_back = back;
472 end_code += GET(end_code, 1);
474 while (*end_code == OP_ALT);
476 /* If we can't go back the amount required for the longest lookbehind
477 pattern, go back as far as we can; some alternatives may still be viable. */
479 #ifdef SUPPORT_UTF
480 /* In character mode we have to step back character by character */
482 if (utf)
484 for (gone_back = 0; gone_back < max_back; gone_back++)
486 if (current_subject <= start_subject) break;
487 current_subject--;
488 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
491 else
492 #endif
494 /* In byte-mode we can do this quickly. */
497 gone_back = (current_subject - max_back < start_subject)?
498 (int)(current_subject - start_subject) : max_back;
499 current_subject -= gone_back;
502 /* Save the earliest consulted character */
504 if (current_subject < md->start_used_ptr)
505 md->start_used_ptr = current_subject;
507 /* Now we can process the individual branches. */
509 end_code = this_start_code;
512 int back = GET(end_code, 2+LINK_SIZE);
513 if (back <= gone_back)
515 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
516 ADD_NEW_DATA(-bstate, 0, gone_back - back);
518 end_code += GET(end_code, 1);
520 while (*end_code == OP_ALT);
523 /* This is the code for a "normal" subpattern (not a backward assertion). The
524 start of a whole pattern is always one of these. If we are at the top level,
525 we may be asked to restart matching from the same point that we reached for a
526 previous partial match. We still have to scan through the top-level branches to
527 find the end state. */
529 else
531 end_code = this_start_code;
533 /* Restarting */
535 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
537 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
538 new_count = workspace[1];
539 if (!workspace[0])
540 memcpy(new_states, active_states, new_count * sizeof(stateblock));
543 /* Not restarting */
545 else
547 int length = 1 + LINK_SIZE +
548 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
549 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
550 ? IMM2_SIZE:0);
553 ADD_NEW((int)(end_code - start_code + length), 0);
554 end_code += GET(end_code, 1);
555 length = 1 + LINK_SIZE;
557 while (*end_code == OP_ALT);
561 workspace[0] = 0; /* Bit indicating which vector is current */
563 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
565 /* Loop for scanning the subject */
567 ptr = current_subject;
568 for (;;)
570 int i, j;
571 int clen, dlen;
572 unsigned int c, d;
573 int forced_fail = 0;
574 BOOL partial_newline = FALSE;
575 BOOL could_continue = reset_could_continue;
576 reset_could_continue = FALSE;
578 /* Make the new state list into the active state list and empty the
579 new state list. */
581 temp_states = active_states;
582 active_states = new_states;
583 new_states = temp_states;
584 active_count = new_count;
585 new_count = 0;
587 workspace[0] ^= 1; /* Remember for the restarting feature */
588 workspace[1] = active_count;
590 #ifdef PCRE_DEBUG
591 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
592 pchars(ptr, STRLEN_UC(ptr), stdout);
593 printf("\"\n");
595 printf("%.*sActive states: ", rlevel*2-2, SP);
596 for (i = 0; i < active_count; i++)
597 printf("%d/%d ", active_states[i].offset, active_states[i].count);
598 printf("\n");
599 #endif
601 /* Set the pointers for adding new states */
603 next_active_state = active_states + active_count;
604 next_new_state = new_states;
606 /* Load the current character from the subject outside the loop, as many
607 different states may want to look at it, and we assume that at least one
608 will. */
610 if (ptr < end_subject)
612 clen = 1; /* Number of data items in the character */
613 #ifdef SUPPORT_UTF
614 if (utf) { GETCHARLEN(c, ptr, clen); } else
615 #endif /* SUPPORT_UTF */
616 c = *ptr;
618 else
620 clen = 0; /* This indicates the end of the subject */
621 c = NOTACHAR; /* This value should never actually be used */
624 /* Scan up the active states and act on each one. The result of an action
625 may be to add more states to the currently active list (e.g. on hitting a
626 parenthesis) or it may be to put states on the new list, for considering
627 when we move the character pointer on. */
629 for (i = 0; i < active_count; i++)
631 stateblock *current_state = active_states + i;
632 BOOL caseless = FALSE;
633 const pcre_uchar *code;
634 int state_offset = current_state->offset;
635 int count, codevalue, rrc;
637 #ifdef PCRE_DEBUG
638 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
639 if (clen == 0) printf("EOL\n");
640 else if (c > 32 && c < 127) printf("'%c'\n", c);
641 else printf("0x%02x\n", c);
642 #endif
644 /* A negative offset is a special case meaning "hold off going to this
645 (negated) state until the number of characters in the data field have
646 been skipped". If the could_continue flag was passed over from a previous
647 state, arrange for it to be passed on. */
649 if (state_offset < 0)
651 if (current_state->data > 0)
653 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
654 ADD_NEW_DATA(state_offset, current_state->count,
655 current_state->data - 1);
656 if (could_continue) reset_could_continue = TRUE;
657 continue;
659 else
661 current_state->offset = state_offset = -state_offset;
665 /* Check for a duplicate state with the same count, and skip if found.
666 See the note at the head of this module about the possibility of improving
667 performance here. */
669 for (j = 0; j < i; j++)
671 if (active_states[j].offset == state_offset &&
672 active_states[j].count == current_state->count)
674 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
675 goto NEXT_ACTIVE_STATE;
679 /* The state offset is the offset to the opcode */
681 code = start_code + state_offset;
682 codevalue = *code;
684 /* If this opcode inspects a character, but we are at the end of the
685 subject, remember the fact for use when testing for a partial match. */
687 if (clen == 0 && poptable[codevalue] != 0)
688 could_continue = TRUE;
690 /* If this opcode is followed by an inline character, load it. It is
691 tempting to test for the presence of a subject character here, but that
692 is wrong, because sometimes zero repetitions of the subject are
693 permitted.
695 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
696 argument that is not a data character - but is always one byte long because
697 the values are small. We have to take special action to deal with \P, \p,
698 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
699 these ones to new opcodes. */
701 if (coptable[codevalue] > 0)
703 dlen = 1;
704 #ifdef SUPPORT_UTF
705 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
706 #endif /* SUPPORT_UTF */
707 d = code[coptable[codevalue]];
708 if (codevalue >= OP_TYPESTAR)
710 switch(d)
712 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
713 case OP_NOTPROP:
714 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
715 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
716 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
717 case OP_NOT_HSPACE:
718 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
719 case OP_NOT_VSPACE:
720 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
721 default: break;
725 else
727 dlen = 0; /* Not strictly necessary, but compilers moan */
728 d = NOTACHAR; /* if these variables are not set. */
732 /* Now process the individual opcodes */
734 switch (codevalue)
736 /* ========================================================================== */
737 /* These cases are never obeyed. This is a fudge that causes a compile-
738 time error if the vectors coptable or poptable, which are indexed by
739 opcode, are not the correct length. It seems to be the only way to do
740 such a check at compile time, as the sizeof() operator does not work
741 in the C preprocessor. */
743 case OP_TABLE_LENGTH:
744 case OP_TABLE_LENGTH +
745 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
746 (sizeof(poptable) == OP_TABLE_LENGTH)):
747 break;
749 /* ========================================================================== */
750 /* Reached a closing bracket. If not at the end of the pattern, carry
751 on with the next opcode. For repeating opcodes, also add the repeat
752 state. Note that KETRPOS will always be encountered at the end of the
753 subpattern, because the possessive subpattern repeats are always handled
754 using recursive calls. Thus, it never adds any new states.
756 At the end of the (sub)pattern, unless we have an empty string and
757 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
758 start of the subject, save the match data, shifting up all previous
759 matches so we always have the longest first. */
761 case OP_KET:
762 case OP_KETRMIN:
763 case OP_KETRMAX:
764 case OP_KETRPOS:
765 if (code != end_code)
767 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
768 if (codevalue != OP_KET)
770 ADD_ACTIVE(state_offset - GET(code, 1), 0);
773 else
775 if (ptr > current_subject ||
776 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
777 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
778 current_subject > start_subject + md->start_offset)))
780 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
781 else if (match_count > 0 && ++match_count * 2 > offsetcount)
782 match_count = 0;
783 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
784 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
785 if (offsetcount >= 2)
787 offsets[0] = (int)(current_subject - start_subject);
788 offsets[1] = (int)(ptr - start_subject);
789 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
790 offsets[1] - offsets[0], (char *)current_subject));
792 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
794 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
795 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
796 match_count, rlevel*2-2, SP));
797 return match_count;
801 break;
803 /* ========================================================================== */
804 /* These opcodes add to the current list of states without looking
805 at the current character. */
807 /*-----------------------------------------------------------------*/
808 case OP_ALT:
809 do { code += GET(code, 1); } while (*code == OP_ALT);
810 ADD_ACTIVE((int)(code - start_code), 0);
811 break;
813 /*-----------------------------------------------------------------*/
814 case OP_BRA:
815 case OP_SBRA:
818 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
819 code += GET(code, 1);
821 while (*code == OP_ALT);
822 break;
824 /*-----------------------------------------------------------------*/
825 case OP_CBRA:
826 case OP_SCBRA:
827 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
828 code += GET(code, 1);
829 while (*code == OP_ALT)
831 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
832 code += GET(code, 1);
834 break;
836 /*-----------------------------------------------------------------*/
837 case OP_BRAZERO:
838 case OP_BRAMINZERO:
839 ADD_ACTIVE(state_offset + 1, 0);
840 code += 1 + GET(code, 2);
841 while (*code == OP_ALT) code += GET(code, 1);
842 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
843 break;
845 /*-----------------------------------------------------------------*/
846 case OP_SKIPZERO:
847 code += 1 + GET(code, 2);
848 while (*code == OP_ALT) code += GET(code, 1);
849 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
850 break;
852 /*-----------------------------------------------------------------*/
853 case OP_CIRC:
854 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
855 { ADD_ACTIVE(state_offset + 1, 0); }
856 break;
858 /*-----------------------------------------------------------------*/
859 case OP_CIRCM:
860 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
861 (ptr != end_subject && WAS_NEWLINE(ptr)))
862 { ADD_ACTIVE(state_offset + 1, 0); }
863 break;
865 /*-----------------------------------------------------------------*/
866 case OP_EOD:
867 if (ptr >= end_subject)
869 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
870 could_continue = TRUE;
871 else { ADD_ACTIVE(state_offset + 1, 0); }
873 break;
875 /*-----------------------------------------------------------------*/
876 case OP_SOD:
877 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
878 break;
880 /*-----------------------------------------------------------------*/
881 case OP_SOM:
882 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
883 break;
886 /* ========================================================================== */
887 /* These opcodes inspect the next subject character, and sometimes
888 the previous one as well, but do not have an argument. The variable
889 clen contains the length of the current character and is zero if we are
890 at the end of the subject. */
892 /*-----------------------------------------------------------------*/
893 case OP_ANY:
894 if (clen > 0 && !IS_NEWLINE(ptr))
896 if (ptr + 1 >= md->end_subject &&
897 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
898 NLBLOCK->nltype == NLTYPE_FIXED &&
899 NLBLOCK->nllen == 2 &&
900 c == NLBLOCK->nl[0])
902 could_continue = partial_newline = TRUE;
904 else
906 ADD_NEW(state_offset + 1, 0);
909 break;
911 /*-----------------------------------------------------------------*/
912 case OP_ALLANY:
913 if (clen > 0)
914 { ADD_NEW(state_offset + 1, 0); }
915 break;
917 /*-----------------------------------------------------------------*/
918 case OP_EODN:
919 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
920 could_continue = TRUE;
921 else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
922 { ADD_ACTIVE(state_offset + 1, 0); }
923 break;
925 /*-----------------------------------------------------------------*/
926 case OP_DOLL:
927 if ((md->moptions & PCRE_NOTEOL) == 0)
929 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
930 could_continue = TRUE;
931 else if (clen == 0 ||
932 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
933 (ptr == end_subject - md->nllen)
935 { ADD_ACTIVE(state_offset + 1, 0); }
936 else if (ptr + 1 >= md->end_subject &&
937 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
938 NLBLOCK->nltype == NLTYPE_FIXED &&
939 NLBLOCK->nllen == 2 &&
940 c == NLBLOCK->nl[0])
942 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
944 reset_could_continue = TRUE;
945 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
947 else could_continue = partial_newline = TRUE;
950 break;
952 /*-----------------------------------------------------------------*/
953 case OP_DOLLM:
954 if ((md->moptions & PCRE_NOTEOL) == 0)
956 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
957 could_continue = TRUE;
958 else if (clen == 0 ||
959 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
960 { ADD_ACTIVE(state_offset + 1, 0); }
961 else if (ptr + 1 >= md->end_subject &&
962 (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
963 NLBLOCK->nltype == NLTYPE_FIXED &&
964 NLBLOCK->nllen == 2 &&
965 c == NLBLOCK->nl[0])
967 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
969 reset_could_continue = TRUE;
970 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
972 else could_continue = partial_newline = TRUE;
975 else if (IS_NEWLINE(ptr))
976 { ADD_ACTIVE(state_offset + 1, 0); }
977 break;
979 /*-----------------------------------------------------------------*/
981 case OP_DIGIT:
982 case OP_WHITESPACE:
983 case OP_WORDCHAR:
984 if (clen > 0 && c < 256 &&
985 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
986 { ADD_NEW(state_offset + 1, 0); }
987 break;
989 /*-----------------------------------------------------------------*/
990 case OP_NOT_DIGIT:
991 case OP_NOT_WHITESPACE:
992 case OP_NOT_WORDCHAR:
993 if (clen > 0 && (c >= 256 ||
994 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
995 { ADD_NEW(state_offset + 1, 0); }
996 break;
998 /*-----------------------------------------------------------------*/
999 case OP_WORD_BOUNDARY:
1000 case OP_NOT_WORD_BOUNDARY:
1002 int left_word, right_word;
1004 if (ptr > start_subject)
1006 const pcre_uchar *temp = ptr - 1;
1007 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1008 #ifdef SUPPORT_UTF
1009 if (utf) { BACKCHAR(temp); }
1010 #endif
1011 GETCHARTEST(d, temp);
1012 #ifdef SUPPORT_UCP
1013 if ((md->poptions & PCRE_UCP) != 0)
1015 if (d == '_') left_word = TRUE; else
1017 int cat = UCD_CATEGORY(d);
1018 left_word = (cat == ucp_L || cat == ucp_N);
1021 else
1022 #endif
1023 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1025 else left_word = FALSE;
1027 if (clen > 0)
1029 #ifdef SUPPORT_UCP
1030 if ((md->poptions & PCRE_UCP) != 0)
1032 if (c == '_') right_word = TRUE; else
1034 int cat = UCD_CATEGORY(c);
1035 right_word = (cat == ucp_L || cat == ucp_N);
1038 else
1039 #endif
1040 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1042 else right_word = FALSE;
1044 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1045 { ADD_ACTIVE(state_offset + 1, 0); }
1047 break;
1050 /*-----------------------------------------------------------------*/
1051 /* Check the next character by Unicode property. We will get here only
1052 if the support is in the binary; otherwise a compile-time error occurs. */
1055 #ifdef SUPPORT_UCP
1056 case OP_PROP:
1057 case OP_NOTPROP:
1058 if (clen > 0)
1060 BOOL OK;
1061 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1062 switch(code[1])
1064 case PT_ANY:
1065 OK = TRUE;
1066 break;
1068 case PT_LAMP:
1069 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1070 chartype == ucp_Lt;
1071 break;
1073 case PT_GC:
1074 OK = PRIV(ucp_gentype)[chartype] == code[2];
1075 break;
1077 case PT_PC:
1078 OK = chartype == code[2];
1079 break;
1081 case PT_SC:
1082 OK = UCD_SCRIPT(c) == code[2];
1083 break;
1085 /* These are specials for combination cases. */
1087 case PT_ALNUM:
1088 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1089 PRIV(ucp_gentype)[chartype] == ucp_N;
1090 break;
1092 case PT_SPACE: /* Perl space */
1093 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1094 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1095 break;
1097 case PT_PXSPACE: /* POSIX space */
1098 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1099 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1100 c == CHAR_FF || c == CHAR_CR;
1101 break;
1103 case PT_WORD:
1104 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1105 PRIV(ucp_gentype)[chartype] == ucp_N ||
1106 c == CHAR_UNDERSCORE;
1107 break;
1109 /* Should never occur, but keep compilers from grumbling. */
1111 default:
1112 OK = codevalue != OP_PROP;
1113 break;
1116 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1118 break;
1119 #endif
1123 /* ========================================================================== */
1124 /* These opcodes likewise inspect the subject character, but have an
1125 argument that is not a data character. It is one of these opcodes:
1126 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1127 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1129 case OP_TYPEPLUS:
1130 case OP_TYPEMINPLUS:
1131 case OP_TYPEPOSPLUS:
1132 count = current_state->count; /* Already matched */
1133 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1134 if (clen > 0)
1136 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1137 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1138 NLBLOCK->nltype == NLTYPE_FIXED &&
1139 NLBLOCK->nllen == 2 &&
1140 c == NLBLOCK->nl[0])
1142 could_continue = partial_newline = TRUE;
1144 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1145 (c < 256 &&
1146 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1147 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1149 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1151 active_count--; /* Remove non-match possibility */
1152 next_active_state--;
1154 count++;
1155 ADD_NEW(state_offset, count);
1158 break;
1160 /*-----------------------------------------------------------------*/
1161 case OP_TYPEQUERY:
1162 case OP_TYPEMINQUERY:
1163 case OP_TYPEPOSQUERY:
1164 ADD_ACTIVE(state_offset + 2, 0);
1165 if (clen > 0)
1167 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1168 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1169 NLBLOCK->nltype == NLTYPE_FIXED &&
1170 NLBLOCK->nllen == 2 &&
1171 c == NLBLOCK->nl[0])
1173 could_continue = partial_newline = TRUE;
1175 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1176 (c < 256 &&
1177 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1178 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1180 if (codevalue == OP_TYPEPOSQUERY)
1182 active_count--; /* Remove non-match possibility */
1183 next_active_state--;
1185 ADD_NEW(state_offset + 2, 0);
1188 break;
1190 /*-----------------------------------------------------------------*/
1191 case OP_TYPESTAR:
1192 case OP_TYPEMINSTAR:
1193 case OP_TYPEPOSSTAR:
1194 ADD_ACTIVE(state_offset + 2, 0);
1195 if (clen > 0)
1197 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1198 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1199 NLBLOCK->nltype == NLTYPE_FIXED &&
1200 NLBLOCK->nllen == 2 &&
1201 c == NLBLOCK->nl[0])
1203 could_continue = partial_newline = TRUE;
1205 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1206 (c < 256 &&
1207 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1208 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1210 if (codevalue == OP_TYPEPOSSTAR)
1212 active_count--; /* Remove non-match possibility */
1213 next_active_state--;
1215 ADD_NEW(state_offset, 0);
1218 break;
1220 /*-----------------------------------------------------------------*/
1221 case OP_TYPEEXACT:
1222 count = current_state->count; /* Number already matched */
1223 if (clen > 0)
1225 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1226 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1227 NLBLOCK->nltype == NLTYPE_FIXED &&
1228 NLBLOCK->nllen == 2 &&
1229 c == NLBLOCK->nl[0])
1231 could_continue = partial_newline = TRUE;
1233 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1234 (c < 256 &&
1235 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1236 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1238 if (++count >= GET2(code, 1))
1239 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1240 else
1241 { ADD_NEW(state_offset, count); }
1244 break;
1246 /*-----------------------------------------------------------------*/
1247 case OP_TYPEUPTO:
1248 case OP_TYPEMINUPTO:
1249 case OP_TYPEPOSUPTO:
1250 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1251 count = current_state->count; /* Number already matched */
1252 if (clen > 0)
1254 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1255 (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1256 NLBLOCK->nltype == NLTYPE_FIXED &&
1257 NLBLOCK->nllen == 2 &&
1258 c == NLBLOCK->nl[0])
1260 could_continue = partial_newline = TRUE;
1262 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1263 (c < 256 &&
1264 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1265 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1267 if (codevalue == OP_TYPEPOSUPTO)
1269 active_count--; /* Remove non-match possibility */
1270 next_active_state--;
1272 if (++count >= GET2(code, 1))
1273 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1274 else
1275 { ADD_NEW(state_offset, count); }
1278 break;
1280 /* ========================================================================== */
1281 /* These are virtual opcodes that are used when something like
1282 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1283 argument. It keeps the code above fast for the other cases. The argument
1284 is in the d variable. */
1286 #ifdef SUPPORT_UCP
1287 case OP_PROP_EXTRA + OP_TYPEPLUS:
1288 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1289 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1290 count = current_state->count; /* Already matched */
1291 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1292 if (clen > 0)
1294 BOOL OK;
1295 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1296 switch(code[2])
1298 case PT_ANY:
1299 OK = TRUE;
1300 break;
1302 case PT_LAMP:
1303 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1304 chartype == ucp_Lt;
1305 break;
1307 case PT_GC:
1308 OK = PRIV(ucp_gentype)[chartype] == code[3];
1309 break;
1311 case PT_PC:
1312 OK = chartype == code[3];
1313 break;
1315 case PT_SC:
1316 OK = UCD_SCRIPT(c) == code[3];
1317 break;
1319 /* These are specials for combination cases. */
1321 case PT_ALNUM:
1322 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1323 PRIV(ucp_gentype)[chartype] == ucp_N;
1324 break;
1326 case PT_SPACE: /* Perl space */
1327 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1328 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1329 break;
1331 case PT_PXSPACE: /* POSIX space */
1332 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1333 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1334 c == CHAR_FF || c == CHAR_CR;
1335 break;
1337 case PT_WORD:
1338 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1339 PRIV(ucp_gentype)[chartype] == ucp_N ||
1340 c == CHAR_UNDERSCORE;
1341 break;
1343 /* Should never occur, but keep compilers from grumbling. */
1345 default:
1346 OK = codevalue != OP_PROP;
1347 break;
1350 if (OK == (d == OP_PROP))
1352 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1354 active_count--; /* Remove non-match possibility */
1355 next_active_state--;
1357 count++;
1358 ADD_NEW(state_offset, count);
1361 break;
1363 /*-----------------------------------------------------------------*/
1364 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1365 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1366 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1367 count = current_state->count; /* Already matched */
1368 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1369 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1371 const pcre_uchar *nptr = ptr + clen;
1372 int ncount = 0;
1373 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1375 active_count--; /* Remove non-match possibility */
1376 next_active_state--;
1378 while (nptr < end_subject)
1380 int nd;
1381 int ndlen = 1;
1382 GETCHARLEN(nd, nptr, ndlen);
1383 if (UCD_CATEGORY(nd) != ucp_M) break;
1384 ncount++;
1385 nptr += ndlen;
1387 count++;
1388 ADD_NEW_DATA(-state_offset, count, ncount);
1390 break;
1391 #endif
1393 /*-----------------------------------------------------------------*/
1394 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1395 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1396 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1397 count = current_state->count; /* Already matched */
1398 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1399 if (clen > 0)
1401 int ncount = 0;
1402 switch (c)
1404 case 0x000b:
1405 case 0x000c:
1406 case 0x0085:
1407 case 0x2028:
1408 case 0x2029:
1409 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1410 goto ANYNL01;
1412 case 0x000d:
1413 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1414 /* Fall through */
1416 ANYNL01:
1417 case 0x000a:
1418 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1420 active_count--; /* Remove non-match possibility */
1421 next_active_state--;
1423 count++;
1424 ADD_NEW_DATA(-state_offset, count, ncount);
1425 break;
1427 default:
1428 break;
1431 break;
1433 /*-----------------------------------------------------------------*/
1434 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1435 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1436 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1437 count = current_state->count; /* Already matched */
1438 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1439 if (clen > 0)
1441 BOOL OK;
1442 switch (c)
1444 case 0x000a:
1445 case 0x000b:
1446 case 0x000c:
1447 case 0x000d:
1448 case 0x0085:
1449 case 0x2028:
1450 case 0x2029:
1451 OK = TRUE;
1452 break;
1454 default:
1455 OK = FALSE;
1456 break;
1459 if (OK == (d == OP_VSPACE))
1461 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1463 active_count--; /* Remove non-match possibility */
1464 next_active_state--;
1466 count++;
1467 ADD_NEW_DATA(-state_offset, count, 0);
1470 break;
1472 /*-----------------------------------------------------------------*/
1473 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1474 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1475 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1476 count = current_state->count; /* Already matched */
1477 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1478 if (clen > 0)
1480 BOOL OK;
1481 switch (c)
1483 case 0x09: /* HT */
1484 case 0x20: /* SPACE */
1485 case 0xa0: /* NBSP */
1486 case 0x1680: /* OGHAM SPACE MARK */
1487 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1488 case 0x2000: /* EN QUAD */
1489 case 0x2001: /* EM QUAD */
1490 case 0x2002: /* EN SPACE */
1491 case 0x2003: /* EM SPACE */
1492 case 0x2004: /* THREE-PER-EM SPACE */
1493 case 0x2005: /* FOUR-PER-EM SPACE */
1494 case 0x2006: /* SIX-PER-EM SPACE */
1495 case 0x2007: /* FIGURE SPACE */
1496 case 0x2008: /* PUNCTUATION SPACE */
1497 case 0x2009: /* THIN SPACE */
1498 case 0x200A: /* HAIR SPACE */
1499 case 0x202f: /* NARROW NO-BREAK SPACE */
1500 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1501 case 0x3000: /* IDEOGRAPHIC SPACE */
1502 OK = TRUE;
1503 break;
1505 default:
1506 OK = FALSE;
1507 break;
1510 if (OK == (d == OP_HSPACE))
1512 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1514 active_count--; /* Remove non-match possibility */
1515 next_active_state--;
1517 count++;
1518 ADD_NEW_DATA(-state_offset, count, 0);
1521 break;
1523 /*-----------------------------------------------------------------*/
1524 #ifdef SUPPORT_UCP
1525 case OP_PROP_EXTRA + OP_TYPEQUERY:
1526 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1527 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1528 count = 4;
1529 goto QS1;
1531 case OP_PROP_EXTRA + OP_TYPESTAR:
1532 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1533 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1534 count = 0;
1536 QS1:
1538 ADD_ACTIVE(state_offset + 4, 0);
1539 if (clen > 0)
1541 BOOL OK;
1542 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1543 switch(code[2])
1545 case PT_ANY:
1546 OK = TRUE;
1547 break;
1549 case PT_LAMP:
1550 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1551 chartype == ucp_Lt;
1552 break;
1554 case PT_GC:
1555 OK = PRIV(ucp_gentype)[chartype] == code[3];
1556 break;
1558 case PT_PC:
1559 OK = chartype == code[3];
1560 break;
1562 case PT_SC:
1563 OK = UCD_SCRIPT(c) == code[3];
1564 break;
1566 /* These are specials for combination cases. */
1568 case PT_ALNUM:
1569 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1570 PRIV(ucp_gentype)[chartype] == ucp_N;
1571 break;
1573 case PT_SPACE: /* Perl space */
1574 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1575 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1576 break;
1578 case PT_PXSPACE: /* POSIX space */
1579 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1580 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1581 c == CHAR_FF || c == CHAR_CR;
1582 break;
1584 case PT_WORD:
1585 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1586 PRIV(ucp_gentype)[chartype] == ucp_N ||
1587 c == CHAR_UNDERSCORE;
1588 break;
1590 /* Should never occur, but keep compilers from grumbling. */
1592 default:
1593 OK = codevalue != OP_PROP;
1594 break;
1597 if (OK == (d == OP_PROP))
1599 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1600 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1602 active_count--; /* Remove non-match possibility */
1603 next_active_state--;
1605 ADD_NEW(state_offset + count, 0);
1608 break;
1610 /*-----------------------------------------------------------------*/
1611 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1612 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1613 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1614 count = 2;
1615 goto QS2;
1617 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1618 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1619 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1620 count = 0;
1622 QS2:
1624 ADD_ACTIVE(state_offset + 2, 0);
1625 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1627 const pcre_uchar *nptr = ptr + clen;
1628 int ncount = 0;
1629 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1630 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1632 active_count--; /* Remove non-match possibility */
1633 next_active_state--;
1635 while (nptr < end_subject)
1637 int nd;
1638 int ndlen = 1;
1639 GETCHARLEN(nd, nptr, ndlen);
1640 if (UCD_CATEGORY(nd) != ucp_M) break;
1641 ncount++;
1642 nptr += ndlen;
1644 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1646 break;
1647 #endif
1649 /*-----------------------------------------------------------------*/
1650 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1651 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1652 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1653 count = 2;
1654 goto QS3;
1656 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1657 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1658 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1659 count = 0;
1661 QS3:
1662 ADD_ACTIVE(state_offset + 2, 0);
1663 if (clen > 0)
1665 int ncount = 0;
1666 switch (c)
1668 case 0x000b:
1669 case 0x000c:
1670 case 0x0085:
1671 case 0x2028:
1672 case 0x2029:
1673 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1674 goto ANYNL02;
1676 case 0x000d:
1677 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1678 /* Fall through */
1680 ANYNL02:
1681 case 0x000a:
1682 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1683 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1685 active_count--; /* Remove non-match possibility */
1686 next_active_state--;
1688 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1689 break;
1691 default:
1692 break;
1695 break;
1697 /*-----------------------------------------------------------------*/
1698 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1699 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1700 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1701 count = 2;
1702 goto QS4;
1704 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1705 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1706 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1707 count = 0;
1709 QS4:
1710 ADD_ACTIVE(state_offset + 2, 0);
1711 if (clen > 0)
1713 BOOL OK;
1714 switch (c)
1716 case 0x000a:
1717 case 0x000b:
1718 case 0x000c:
1719 case 0x000d:
1720 case 0x0085:
1721 case 0x2028:
1722 case 0x2029:
1723 OK = TRUE;
1724 break;
1726 default:
1727 OK = FALSE;
1728 break;
1730 if (OK == (d == OP_VSPACE))
1732 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1733 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1735 active_count--; /* Remove non-match possibility */
1736 next_active_state--;
1738 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1741 break;
1743 /*-----------------------------------------------------------------*/
1744 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1745 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1746 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1747 count = 2;
1748 goto QS5;
1750 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1751 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1752 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1753 count = 0;
1755 QS5:
1756 ADD_ACTIVE(state_offset + 2, 0);
1757 if (clen > 0)
1759 BOOL OK;
1760 switch (c)
1762 case 0x09: /* HT */
1763 case 0x20: /* SPACE */
1764 case 0xa0: /* NBSP */
1765 case 0x1680: /* OGHAM SPACE MARK */
1766 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1767 case 0x2000: /* EN QUAD */
1768 case 0x2001: /* EM QUAD */
1769 case 0x2002: /* EN SPACE */
1770 case 0x2003: /* EM SPACE */
1771 case 0x2004: /* THREE-PER-EM SPACE */
1772 case 0x2005: /* FOUR-PER-EM SPACE */
1773 case 0x2006: /* SIX-PER-EM SPACE */
1774 case 0x2007: /* FIGURE SPACE */
1775 case 0x2008: /* PUNCTUATION SPACE */
1776 case 0x2009: /* THIN SPACE */
1777 case 0x200A: /* HAIR SPACE */
1778 case 0x202f: /* NARROW NO-BREAK SPACE */
1779 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1780 case 0x3000: /* IDEOGRAPHIC SPACE */
1781 OK = TRUE;
1782 break;
1784 default:
1785 OK = FALSE;
1786 break;
1789 if (OK == (d == OP_HSPACE))
1791 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1792 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1794 active_count--; /* Remove non-match possibility */
1795 next_active_state--;
1797 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1800 break;
1802 /*-----------------------------------------------------------------*/
1803 #ifdef SUPPORT_UCP
1804 case OP_PROP_EXTRA + OP_TYPEEXACT:
1805 case OP_PROP_EXTRA + OP_TYPEUPTO:
1806 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1807 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1808 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1809 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1810 count = current_state->count; /* Number already matched */
1811 if (clen > 0)
1813 BOOL OK;
1814 const pcre_uint8 chartype = UCD_CHARTYPE(c);
1815 switch(code[1 + IMM2_SIZE + 1])
1817 case PT_ANY:
1818 OK = TRUE;
1819 break;
1821 case PT_LAMP:
1822 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1823 chartype == ucp_Lt;
1824 break;
1826 case PT_GC:
1827 OK = PRIV(ucp_gentype)[chartype] == code[1 + IMM2_SIZE + 2];
1828 break;
1830 case PT_PC:
1831 OK = chartype == code[1 + IMM2_SIZE + 2];
1832 break;
1834 case PT_SC:
1835 OK = UCD_SCRIPT(c) == code[1 + IMM2_SIZE + 2];
1836 break;
1838 /* These are specials for combination cases. */
1840 case PT_ALNUM:
1841 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1842 PRIV(ucp_gentype)[chartype] == ucp_N;
1843 break;
1845 case PT_SPACE: /* Perl space */
1846 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1847 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1848 break;
1850 case PT_PXSPACE: /* POSIX space */
1851 OK = PRIV(ucp_gentype)[chartype] == ucp_Z ||
1852 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1853 c == CHAR_FF || c == CHAR_CR;
1854 break;
1856 case PT_WORD:
1857 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1858 PRIV(ucp_gentype)[chartype] == ucp_N ||
1859 c == CHAR_UNDERSCORE;
1860 break;
1862 /* Should never occur, but keep compilers from grumbling. */
1864 default:
1865 OK = codevalue != OP_PROP;
1866 break;
1869 if (OK == (d == OP_PROP))
1871 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1873 active_count--; /* Remove non-match possibility */
1874 next_active_state--;
1876 if (++count >= GET2(code, 1))
1877 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1878 else
1879 { ADD_NEW(state_offset, count); }
1882 break;
1884 /*-----------------------------------------------------------------*/
1885 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1886 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1887 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1888 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1889 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1890 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1891 count = current_state->count; /* Number already matched */
1892 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1894 const pcre_uchar *nptr = ptr + clen;
1895 int ncount = 0;
1896 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1898 active_count--; /* Remove non-match possibility */
1899 next_active_state--;
1901 while (nptr < end_subject)
1903 int nd;
1904 int ndlen = 1;
1905 GETCHARLEN(nd, nptr, ndlen);
1906 if (UCD_CATEGORY(nd) != ucp_M) break;
1907 ncount++;
1908 nptr += ndlen;
1910 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1911 reset_could_continue = TRUE;
1912 if (++count >= GET2(code, 1))
1913 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1914 else
1915 { ADD_NEW_DATA(-state_offset, count, ncount); }
1917 break;
1918 #endif
1920 /*-----------------------------------------------------------------*/
1921 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1922 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1923 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1924 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1925 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1926 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1927 count = current_state->count; /* Number already matched */
1928 if (clen > 0)
1930 int ncount = 0;
1931 switch (c)
1933 case 0x000b:
1934 case 0x000c:
1935 case 0x0085:
1936 case 0x2028:
1937 case 0x2029:
1938 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1939 goto ANYNL03;
1941 case 0x000d:
1942 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1943 /* Fall through */
1945 ANYNL03:
1946 case 0x000a:
1947 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1949 active_count--; /* Remove non-match possibility */
1950 next_active_state--;
1952 if (++count >= GET2(code, 1))
1953 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1954 else
1955 { ADD_NEW_DATA(-state_offset, count, ncount); }
1956 break;
1958 default:
1959 break;
1962 break;
1964 /*-----------------------------------------------------------------*/
1965 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1966 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1967 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1968 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1969 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1970 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1971 count = current_state->count; /* Number already matched */
1972 if (clen > 0)
1974 BOOL OK;
1975 switch (c)
1977 case 0x000a:
1978 case 0x000b:
1979 case 0x000c:
1980 case 0x000d:
1981 case 0x0085:
1982 case 0x2028:
1983 case 0x2029:
1984 OK = TRUE;
1985 break;
1987 default:
1988 OK = FALSE;
1991 if (OK == (d == OP_VSPACE))
1993 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1995 active_count--; /* Remove non-match possibility */
1996 next_active_state--;
1998 if (++count >= GET2(code, 1))
1999 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2000 else
2001 { ADD_NEW_DATA(-state_offset, count, 0); }
2004 break;
2006 /*-----------------------------------------------------------------*/
2007 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2008 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2009 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2010 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2011 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2012 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2013 count = current_state->count; /* Number already matched */
2014 if (clen > 0)
2016 BOOL OK;
2017 switch (c)
2019 case 0x09: /* HT */
2020 case 0x20: /* SPACE */
2021 case 0xa0: /* NBSP */
2022 case 0x1680: /* OGHAM SPACE MARK */
2023 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2024 case 0x2000: /* EN QUAD */
2025 case 0x2001: /* EM QUAD */
2026 case 0x2002: /* EN SPACE */
2027 case 0x2003: /* EM SPACE */
2028 case 0x2004: /* THREE-PER-EM SPACE */
2029 case 0x2005: /* FOUR-PER-EM SPACE */
2030 case 0x2006: /* SIX-PER-EM SPACE */
2031 case 0x2007: /* FIGURE SPACE */
2032 case 0x2008: /* PUNCTUATION SPACE */
2033 case 0x2009: /* THIN SPACE */
2034 case 0x200A: /* HAIR SPACE */
2035 case 0x202f: /* NARROW NO-BREAK SPACE */
2036 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2037 case 0x3000: /* IDEOGRAPHIC SPACE */
2038 OK = TRUE;
2039 break;
2041 default:
2042 OK = FALSE;
2043 break;
2046 if (OK == (d == OP_HSPACE))
2048 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2050 active_count--; /* Remove non-match possibility */
2051 next_active_state--;
2053 if (++count >= GET2(code, 1))
2054 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2055 else
2056 { ADD_NEW_DATA(-state_offset, count, 0); }
2059 break;
2061 /* ========================================================================== */
2062 /* These opcodes are followed by a character that is usually compared
2063 to the current subject character; it is loaded into d. We still get
2064 here even if there is no subject character, because in some cases zero
2065 repetitions are permitted. */
2067 /*-----------------------------------------------------------------*/
2068 case OP_CHAR:
2069 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2070 break;
2072 /*-----------------------------------------------------------------*/
2073 case OP_CHARI:
2074 if (clen == 0) break;
2076 #ifdef SUPPORT_UTF
2077 if (utf)
2079 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2081 unsigned int othercase;
2082 if (c < 128)
2083 othercase = fcc[c];
2084 else
2085 /* If we have Unicode property support, we can use it to test the
2086 other case of the character. */
2087 #ifdef SUPPORT_UCP
2088 othercase = UCD_OTHERCASE(c);
2089 #else
2090 othercase = NOTACHAR;
2091 #endif
2093 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2096 else
2097 #endif /* SUPPORT_UTF */
2098 /* Not UTF mode */
2100 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2101 { ADD_NEW(state_offset + 2, 0); }
2103 break;
2106 #ifdef SUPPORT_UCP
2107 /*-----------------------------------------------------------------*/
2108 /* This is a tricky one because it can match more than one character.
2109 Find out how many characters to skip, and then set up a negative state
2110 to wait for them to pass before continuing. */
2112 case OP_EXTUNI:
2113 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2115 const pcre_uchar *nptr = ptr + clen;
2116 int ncount = 0;
2117 while (nptr < end_subject)
2119 int nclen = 1;
2120 GETCHARLEN(c, nptr, nclen);
2121 if (UCD_CATEGORY(c) != ucp_M) break;
2122 ncount++;
2123 nptr += nclen;
2125 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2126 reset_could_continue = TRUE;
2127 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2129 break;
2130 #endif
2132 /*-----------------------------------------------------------------*/
2133 /* This is tricky like EXTUNI because it too can match more than one
2134 character (when CR is followed by LF). In this case, set up a negative
2135 state to wait for one character to pass before continuing. */
2137 case OP_ANYNL:
2138 if (clen > 0) switch(c)
2140 case 0x000b:
2141 case 0x000c:
2142 case 0x0085:
2143 case 0x2028:
2144 case 0x2029:
2145 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2147 case 0x000a:
2148 ADD_NEW(state_offset + 1, 0);
2149 break;
2151 case 0x000d:
2152 if (ptr + 1 >= end_subject)
2154 ADD_NEW(state_offset + 1, 0);
2155 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2156 reset_could_continue = TRUE;
2158 else if (ptr[1] == 0x0a)
2160 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2162 else
2164 ADD_NEW(state_offset + 1, 0);
2166 break;
2168 break;
2170 /*-----------------------------------------------------------------*/
2171 case OP_NOT_VSPACE:
2172 if (clen > 0) switch(c)
2174 case 0x000a:
2175 case 0x000b:
2176 case 0x000c:
2177 case 0x000d:
2178 case 0x0085:
2179 case 0x2028:
2180 case 0x2029:
2181 break;
2183 default:
2184 ADD_NEW(state_offset + 1, 0);
2185 break;
2187 break;
2189 /*-----------------------------------------------------------------*/
2190 case OP_VSPACE:
2191 if (clen > 0) switch(c)
2193 case 0x000a:
2194 case 0x000b:
2195 case 0x000c:
2196 case 0x000d:
2197 case 0x0085:
2198 case 0x2028:
2199 case 0x2029:
2200 ADD_NEW(state_offset + 1, 0);
2201 break;
2203 default: break;
2205 break;
2207 /*-----------------------------------------------------------------*/
2208 case OP_NOT_HSPACE:
2209 if (clen > 0) switch(c)
2211 case 0x09: /* HT */
2212 case 0x20: /* SPACE */
2213 case 0xa0: /* NBSP */
2214 case 0x1680: /* OGHAM SPACE MARK */
2215 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2216 case 0x2000: /* EN QUAD */
2217 case 0x2001: /* EM QUAD */
2218 case 0x2002: /* EN SPACE */
2219 case 0x2003: /* EM SPACE */
2220 case 0x2004: /* THREE-PER-EM SPACE */
2221 case 0x2005: /* FOUR-PER-EM SPACE */
2222 case 0x2006: /* SIX-PER-EM SPACE */
2223 case 0x2007: /* FIGURE SPACE */
2224 case 0x2008: /* PUNCTUATION SPACE */
2225 case 0x2009: /* THIN SPACE */
2226 case 0x200A: /* HAIR SPACE */
2227 case 0x202f: /* NARROW NO-BREAK SPACE */
2228 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2229 case 0x3000: /* IDEOGRAPHIC SPACE */
2230 break;
2232 default:
2233 ADD_NEW(state_offset + 1, 0);
2234 break;
2236 break;
2238 /*-----------------------------------------------------------------*/
2239 case OP_HSPACE:
2240 if (clen > 0) switch(c)
2242 case 0x09: /* HT */
2243 case 0x20: /* SPACE */
2244 case 0xa0: /* NBSP */
2245 case 0x1680: /* OGHAM SPACE MARK */
2246 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2247 case 0x2000: /* EN QUAD */
2248 case 0x2001: /* EM QUAD */
2249 case 0x2002: /* EN SPACE */
2250 case 0x2003: /* EM SPACE */
2251 case 0x2004: /* THREE-PER-EM SPACE */
2252 case 0x2005: /* FOUR-PER-EM SPACE */
2253 case 0x2006: /* SIX-PER-EM SPACE */
2254 case 0x2007: /* FIGURE SPACE */
2255 case 0x2008: /* PUNCTUATION SPACE */
2256 case 0x2009: /* THIN SPACE */
2257 case 0x200A: /* HAIR SPACE */
2258 case 0x202f: /* NARROW NO-BREAK SPACE */
2259 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2260 case 0x3000: /* IDEOGRAPHIC SPACE */
2261 ADD_NEW(state_offset + 1, 0);
2262 break;
2264 break;
2266 /*-----------------------------------------------------------------*/
2267 /* Match a negated single character casefully. */
2269 case OP_NOT:
2270 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2271 break;
2273 /*-----------------------------------------------------------------*/
2274 /* Match a negated single character caselessly. */
2276 case OP_NOTI:
2277 if (clen > 0)
2279 unsigned int otherd;
2280 #ifdef SUPPORT_UTF
2281 if (utf && d >= 128)
2283 #ifdef SUPPORT_UCP
2284 otherd = UCD_OTHERCASE(d);
2285 #endif /* SUPPORT_UCP */
2287 else
2288 #endif /* SUPPORT_UTF */
2289 otherd = TABLE_GET(d, fcc, d);
2290 if (c != d && c != otherd)
2291 { ADD_NEW(state_offset + dlen + 1, 0); }
2293 break;
2295 /*-----------------------------------------------------------------*/
2296 case OP_PLUSI:
2297 case OP_MINPLUSI:
2298 case OP_POSPLUSI:
2299 case OP_NOTPLUSI:
2300 case OP_NOTMINPLUSI:
2301 case OP_NOTPOSPLUSI:
2302 caseless = TRUE;
2303 codevalue -= OP_STARI - OP_STAR;
2305 /* Fall through */
2306 case OP_PLUS:
2307 case OP_MINPLUS:
2308 case OP_POSPLUS:
2309 case OP_NOTPLUS:
2310 case OP_NOTMINPLUS:
2311 case OP_NOTPOSPLUS:
2312 count = current_state->count; /* Already matched */
2313 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2314 if (clen > 0)
2316 unsigned int otherd = NOTACHAR;
2317 if (caseless)
2319 #ifdef SUPPORT_UTF
2320 if (utf && d >= 128)
2322 #ifdef SUPPORT_UCP
2323 otherd = UCD_OTHERCASE(d);
2324 #endif /* SUPPORT_UCP */
2326 else
2327 #endif /* SUPPORT_UTF */
2328 otherd = TABLE_GET(d, fcc, d);
2330 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2332 if (count > 0 &&
2333 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2335 active_count--; /* Remove non-match possibility */
2336 next_active_state--;
2338 count++;
2339 ADD_NEW(state_offset, count);
2342 break;
2344 /*-----------------------------------------------------------------*/
2345 case OP_QUERYI:
2346 case OP_MINQUERYI:
2347 case OP_POSQUERYI:
2348 case OP_NOTQUERYI:
2349 case OP_NOTMINQUERYI:
2350 case OP_NOTPOSQUERYI:
2351 caseless = TRUE;
2352 codevalue -= OP_STARI - OP_STAR;
2353 /* Fall through */
2354 case OP_QUERY:
2355 case OP_MINQUERY:
2356 case OP_POSQUERY:
2357 case OP_NOTQUERY:
2358 case OP_NOTMINQUERY:
2359 case OP_NOTPOSQUERY:
2360 ADD_ACTIVE(state_offset + dlen + 1, 0);
2361 if (clen > 0)
2363 unsigned int otherd = NOTACHAR;
2364 if (caseless)
2366 #ifdef SUPPORT_UTF
2367 if (utf && d >= 128)
2369 #ifdef SUPPORT_UCP
2370 otherd = UCD_OTHERCASE(d);
2371 #endif /* SUPPORT_UCP */
2373 else
2374 #endif /* SUPPORT_UTF */
2375 otherd = TABLE_GET(d, fcc, d);
2377 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2379 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2381 active_count--; /* Remove non-match possibility */
2382 next_active_state--;
2384 ADD_NEW(state_offset + dlen + 1, 0);
2387 break;
2389 /*-----------------------------------------------------------------*/
2390 case OP_STARI:
2391 case OP_MINSTARI:
2392 case OP_POSSTARI:
2393 case OP_NOTSTARI:
2394 case OP_NOTMINSTARI:
2395 case OP_NOTPOSSTARI:
2396 caseless = TRUE;
2397 codevalue -= OP_STARI - OP_STAR;
2398 /* Fall through */
2399 case OP_STAR:
2400 case OP_MINSTAR:
2401 case OP_POSSTAR:
2402 case OP_NOTSTAR:
2403 case OP_NOTMINSTAR:
2404 case OP_NOTPOSSTAR:
2405 ADD_ACTIVE(state_offset + dlen + 1, 0);
2406 if (clen > 0)
2408 unsigned int otherd = NOTACHAR;
2409 if (caseless)
2411 #ifdef SUPPORT_UTF
2412 if (utf && d >= 128)
2414 #ifdef SUPPORT_UCP
2415 otherd = UCD_OTHERCASE(d);
2416 #endif /* SUPPORT_UCP */
2418 else
2419 #endif /* SUPPORT_UTF */
2420 otherd = TABLE_GET(d, fcc, d);
2422 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2424 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2426 active_count--; /* Remove non-match possibility */
2427 next_active_state--;
2429 ADD_NEW(state_offset, 0);
2432 break;
2434 /*-----------------------------------------------------------------*/
2435 case OP_EXACTI:
2436 case OP_NOTEXACTI:
2437 caseless = TRUE;
2438 codevalue -= OP_STARI - OP_STAR;
2439 /* Fall through */
2440 case OP_EXACT:
2441 case OP_NOTEXACT:
2442 count = current_state->count; /* Number already matched */
2443 if (clen > 0)
2445 unsigned int otherd = NOTACHAR;
2446 if (caseless)
2448 #ifdef SUPPORT_UTF
2449 if (utf && d >= 128)
2451 #ifdef SUPPORT_UCP
2452 otherd = UCD_OTHERCASE(d);
2453 #endif /* SUPPORT_UCP */
2455 else
2456 #endif /* SUPPORT_UTF */
2457 otherd = TABLE_GET(d, fcc, d);
2459 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2461 if (++count >= GET2(code, 1))
2462 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2463 else
2464 { ADD_NEW(state_offset, count); }
2467 break;
2469 /*-----------------------------------------------------------------*/
2470 case OP_UPTOI:
2471 case OP_MINUPTOI:
2472 case OP_POSUPTOI:
2473 case OP_NOTUPTOI:
2474 case OP_NOTMINUPTOI:
2475 case OP_NOTPOSUPTOI:
2476 caseless = TRUE;
2477 codevalue -= OP_STARI - OP_STAR;
2478 /* Fall through */
2479 case OP_UPTO:
2480 case OP_MINUPTO:
2481 case OP_POSUPTO:
2482 case OP_NOTUPTO:
2483 case OP_NOTMINUPTO:
2484 case OP_NOTPOSUPTO:
2485 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2486 count = current_state->count; /* Number already matched */
2487 if (clen > 0)
2489 unsigned int otherd = NOTACHAR;
2490 if (caseless)
2492 #ifdef SUPPORT_UTF
2493 if (utf && d >= 128)
2495 #ifdef SUPPORT_UCP
2496 otherd = UCD_OTHERCASE(d);
2497 #endif /* SUPPORT_UCP */
2499 else
2500 #endif /* SUPPORT_UTF */
2501 otherd = TABLE_GET(d, fcc, d);
2503 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2505 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2507 active_count--; /* Remove non-match possibility */
2508 next_active_state--;
2510 if (++count >= GET2(code, 1))
2511 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2512 else
2513 { ADD_NEW(state_offset, count); }
2516 break;
2519 /* ========================================================================== */
2520 /* These are the class-handling opcodes */
2522 case OP_CLASS:
2523 case OP_NCLASS:
2524 case OP_XCLASS:
2526 BOOL isinclass = FALSE;
2527 int next_state_offset;
2528 const pcre_uchar *ecode;
2530 /* For a simple class, there is always just a 32-byte table, and we
2531 can set isinclass from it. */
2533 if (codevalue != OP_XCLASS)
2535 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2536 if (clen > 0)
2538 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2539 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2543 /* An extended class may have a table or a list of single characters,
2544 ranges, or both, and it may be positive or negative. There's a
2545 function that sorts all this out. */
2547 else
2549 ecode = code + GET(code, 1);
2550 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2553 /* At this point, isinclass is set for all kinds of class, and ecode
2554 points to the byte after the end of the class. If there is a
2555 quantifier, this is where it will be. */
2557 next_state_offset = (int)(ecode - start_code);
2559 switch (*ecode)
2561 case OP_CRSTAR:
2562 case OP_CRMINSTAR:
2563 ADD_ACTIVE(next_state_offset + 1, 0);
2564 if (isinclass) { ADD_NEW(state_offset, 0); }
2565 break;
2567 case OP_CRPLUS:
2568 case OP_CRMINPLUS:
2569 count = current_state->count; /* Already matched */
2570 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2571 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2572 break;
2574 case OP_CRQUERY:
2575 case OP_CRMINQUERY:
2576 ADD_ACTIVE(next_state_offset + 1, 0);
2577 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2578 break;
2580 case OP_CRRANGE:
2581 case OP_CRMINRANGE:
2582 count = current_state->count; /* Already matched */
2583 if (count >= GET2(ecode, 1))
2584 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2585 if (isinclass)
2587 int max = GET2(ecode, 1 + IMM2_SIZE);
2588 if (++count >= max && max != 0) /* Max 0 => no limit */
2589 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2590 else
2591 { ADD_NEW(state_offset, count); }
2593 break;
2595 default:
2596 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2597 break;
2600 break;
2602 /* ========================================================================== */
2603 /* These are the opcodes for fancy brackets of various kinds. We have
2604 to use recursion in order to handle them. The "always failing" assertion
2605 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2606 though the other "backtracking verbs" are not supported. */
2608 case OP_FAIL:
2609 forced_fail++; /* Count FAILs for multiple states */
2610 break;
2612 case OP_ASSERT:
2613 case OP_ASSERT_NOT:
2614 case OP_ASSERTBACK:
2615 case OP_ASSERTBACK_NOT:
2617 int rc;
2618 int local_offsets[2];
2619 int local_workspace[1000];
2620 const pcre_uchar *endasscode = code + GET(code, 1);
2622 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2624 rc = internal_dfa_exec(
2625 md, /* static match data */
2626 code, /* this subexpression's code */
2627 ptr, /* where we currently are */
2628 (int)(ptr - start_subject), /* start offset */
2629 local_offsets, /* offset vector */
2630 sizeof(local_offsets)/sizeof(int), /* size of same */
2631 local_workspace, /* workspace vector */
2632 sizeof(local_workspace)/sizeof(int), /* size of same */
2633 rlevel); /* function recursion level */
2635 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2636 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2637 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2639 break;
2641 /*-----------------------------------------------------------------*/
2642 case OP_COND:
2643 case OP_SCOND:
2645 int local_offsets[1000];
2646 int local_workspace[1000];
2647 int codelink = GET(code, 1);
2648 int condcode;
2650 /* Because of the way auto-callout works during compile, a callout item
2651 is inserted between OP_COND and an assertion condition. This does not
2652 happen for the other conditions. */
2654 if (code[LINK_SIZE+1] == OP_CALLOUT)
2656 rrc = 0;
2657 if (PUBL(callout) != NULL)
2659 PUBL(callout_block) cb;
2660 cb.version = 1; /* Version 1 of the callout block */
2661 cb.callout_number = code[LINK_SIZE+2];
2662 cb.offset_vector = offsets;
2663 #ifdef COMPILE_PCRE8
2664 cb.subject = (PCRE_SPTR)start_subject;
2665 #else
2666 cb.subject = (PCRE_SPTR16)start_subject;
2667 #endif
2668 cb.subject_length = (int)(end_subject - start_subject);
2669 cb.start_match = (int)(current_subject - start_subject);
2670 cb.current_position = (int)(ptr - start_subject);
2671 cb.pattern_position = GET(code, LINK_SIZE + 3);
2672 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2673 cb.capture_top = 1;
2674 cb.capture_last = -1;
2675 cb.callout_data = md->callout_data;
2676 cb.mark = NULL; /* No (*MARK) support */
2677 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2679 if (rrc > 0) break; /* Fail this thread */
2680 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2683 condcode = code[LINK_SIZE+1];
2685 /* Back reference conditions are not supported */
2687 if (condcode == OP_CREF || condcode == OP_NCREF)
2688 return PCRE_ERROR_DFA_UCOND;
2690 /* The DEFINE condition is always false */
2692 if (condcode == OP_DEF)
2693 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2695 /* The only supported version of OP_RREF is for the value RREF_ANY,
2696 which means "test if in any recursion". We can't test for specifically
2697 recursed groups. */
2699 else if (condcode == OP_RREF || condcode == OP_NRREF)
2701 int value = GET2(code, LINK_SIZE + 2);
2702 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2703 if (md->recursive != NULL)
2704 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2705 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2708 /* Otherwise, the condition is an assertion */
2710 else
2712 int rc;
2713 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2714 const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2716 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2718 rc = internal_dfa_exec(
2719 md, /* fixed match data */
2720 asscode, /* this subexpression's code */
2721 ptr, /* where we currently are */
2722 (int)(ptr - start_subject), /* start offset */
2723 local_offsets, /* offset vector */
2724 sizeof(local_offsets)/sizeof(int), /* size of same */
2725 local_workspace, /* workspace vector */
2726 sizeof(local_workspace)/sizeof(int), /* size of same */
2727 rlevel); /* function recursion level */
2729 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2730 if ((rc >= 0) ==
2731 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2732 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2733 else
2734 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2737 break;
2739 /*-----------------------------------------------------------------*/
2740 case OP_RECURSE:
2742 dfa_recursion_info *ri;
2743 int local_offsets[1000];
2744 int local_workspace[1000];
2745 const pcre_uchar *callpat = start_code + GET(code, 1);
2746 int recno = (callpat == md->start_code)? 0 :
2747 GET2(callpat, 1 + LINK_SIZE);
2748 int rc;
2750 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2752 /* Check for repeating a recursion without advancing the subject
2753 pointer. This should catch convoluted mutual recursions. (Some simple
2754 cases are caught at compile time.) */
2756 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2757 if (recno == ri->group_num && ptr == ri->subject_position)
2758 return PCRE_ERROR_RECURSELOOP;
2760 /* Remember this recursion and where we started it so as to
2761 catch infinite loops. */
2763 new_recursive.group_num = recno;
2764 new_recursive.subject_position = ptr;
2765 new_recursive.prevrec = md->recursive;
2766 md->recursive = &new_recursive;
2768 rc = internal_dfa_exec(
2769 md, /* fixed match data */
2770 callpat, /* this subexpression's code */
2771 ptr, /* where we currently are */
2772 (int)(ptr - start_subject), /* start offset */
2773 local_offsets, /* offset vector */
2774 sizeof(local_offsets)/sizeof(int), /* size of same */
2775 local_workspace, /* workspace vector */
2776 sizeof(local_workspace)/sizeof(int), /* size of same */
2777 rlevel); /* function recursion level */
2779 md->recursive = new_recursive.prevrec; /* Done this recursion */
2781 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2782 rc));
2784 /* Ran out of internal offsets */
2786 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2788 /* For each successful matched substring, set up the next state with a
2789 count of characters to skip before trying it. Note that the count is in
2790 characters, not bytes. */
2792 if (rc > 0)
2794 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2796 int charcount = local_offsets[rc+1] - local_offsets[rc];
2797 #ifdef SUPPORT_UTF
2798 if (utf)
2800 const pcre_uchar *p = start_subject + local_offsets[rc];
2801 const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2802 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2804 #endif
2805 if (charcount > 0)
2807 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2809 else
2811 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2815 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2817 break;
2819 /*-----------------------------------------------------------------*/
2820 case OP_BRAPOS:
2821 case OP_SBRAPOS:
2822 case OP_CBRAPOS:
2823 case OP_SCBRAPOS:
2824 case OP_BRAPOSZERO:
2826 int charcount, matched_count;
2827 const pcre_uchar *local_ptr = ptr;
2828 BOOL allow_zero;
2830 if (codevalue == OP_BRAPOSZERO)
2832 allow_zero = TRUE;
2833 codevalue = *(++code); /* Codevalue will be one of above BRAs */
2835 else allow_zero = FALSE;
2837 /* Loop to match the subpattern as many times as possible as if it were
2838 a complete pattern. */
2840 for (matched_count = 0;; matched_count++)
2842 int local_offsets[2];
2843 int local_workspace[1000];
2845 int rc = internal_dfa_exec(
2846 md, /* fixed match data */
2847 code, /* this subexpression's code */
2848 local_ptr, /* where we currently are */
2849 (int)(ptr - start_subject), /* start offset */
2850 local_offsets, /* offset vector */
2851 sizeof(local_offsets)/sizeof(int), /* size of same */
2852 local_workspace, /* workspace vector */
2853 sizeof(local_workspace)/sizeof(int), /* size of same */
2854 rlevel); /* function recursion level */
2856 /* Failed to match */
2858 if (rc < 0)
2860 if (rc != PCRE_ERROR_NOMATCH) return rc;
2861 break;
2864 /* Matched: break the loop if zero characters matched. */
2866 charcount = local_offsets[1] - local_offsets[0];
2867 if (charcount == 0) break;
2868 local_ptr += charcount; /* Advance temporary position ptr */
2871 /* At this point we have matched the subpattern matched_count
2872 times, and local_ptr is pointing to the character after the end of the
2873 last match. */
2875 if (matched_count > 0 || allow_zero)
2877 const pcre_uchar *end_subpattern = code;
2878 int next_state_offset;
2880 do { end_subpattern += GET(end_subpattern, 1); }
2881 while (*end_subpattern == OP_ALT);
2882 next_state_offset =
2883 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2885 /* Optimization: if there are no more active states, and there
2886 are no new states yet set up, then skip over the subject string
2887 right here, to save looping. Otherwise, set up the new state to swing
2888 into action when the end of the matched substring is reached. */
2890 if (i + 1 >= active_count && new_count == 0)
2892 ptr = local_ptr;
2893 clen = 0;
2894 ADD_NEW(next_state_offset, 0);
2896 else
2898 const pcre_uchar *p = ptr;
2899 const pcre_uchar *pp = local_ptr;
2900 charcount = (int)(pp - p);
2901 #ifdef SUPPORT_UTF
2902 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2903 #endif
2904 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2908 break;
2910 /*-----------------------------------------------------------------*/
2911 case OP_ONCE:
2912 case OP_ONCE_NC:
2914 int local_offsets[2];
2915 int local_workspace[1000];
2917 int rc = internal_dfa_exec(
2918 md, /* fixed match data */
2919 code, /* this subexpression's code */
2920 ptr, /* where we currently are */
2921 (int)(ptr - start_subject), /* start offset */
2922 local_offsets, /* offset vector */
2923 sizeof(local_offsets)/sizeof(int), /* size of same */
2924 local_workspace, /* workspace vector */
2925 sizeof(local_workspace)/sizeof(int), /* size of same */
2926 rlevel); /* function recursion level */
2928 if (rc >= 0)
2930 const pcre_uchar *end_subpattern = code;
2931 int charcount = local_offsets[1] - local_offsets[0];
2932 int next_state_offset, repeat_state_offset;
2934 do { end_subpattern += GET(end_subpattern, 1); }
2935 while (*end_subpattern == OP_ALT);
2936 next_state_offset =
2937 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2939 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2940 arrange for the repeat state also to be added to the relevant list.
2941 Calculate the offset, or set -1 for no repeat. */
2943 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2944 *end_subpattern == OP_KETRMIN)?
2945 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2947 /* If we have matched an empty string, add the next state at the
2948 current character pointer. This is important so that the duplicate
2949 checking kicks in, which is what breaks infinite loops that match an
2950 empty string. */
2952 if (charcount == 0)
2954 ADD_ACTIVE(next_state_offset, 0);
2957 /* Optimization: if there are no more active states, and there
2958 are no new states yet set up, then skip over the subject string
2959 right here, to save looping. Otherwise, set up the new state to swing
2960 into action when the end of the matched substring is reached. */
2962 else if (i + 1 >= active_count && new_count == 0)
2964 ptr += charcount;
2965 clen = 0;
2966 ADD_NEW(next_state_offset, 0);
2968 /* If we are adding a repeat state at the new character position,
2969 we must fudge things so that it is the only current state.
2970 Otherwise, it might be a duplicate of one we processed before, and
2971 that would cause it to be skipped. */
2973 if (repeat_state_offset >= 0)
2975 next_active_state = active_states;
2976 active_count = 0;
2977 i = -1;
2978 ADD_ACTIVE(repeat_state_offset, 0);
2981 else
2983 #ifdef SUPPORT_UTF
2984 if (utf)
2986 const pcre_uchar *p = start_subject + local_offsets[0];
2987 const pcre_uchar *pp = start_subject + local_offsets[1];
2988 while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2990 #endif
2991 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2992 if (repeat_state_offset >= 0)
2993 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2996 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2998 break;
3001 /* ========================================================================== */
3002 /* Handle callouts */
3004 case OP_CALLOUT:
3005 rrc = 0;
3006 if (PUBL(callout) != NULL)
3008 PUBL(callout_block) cb;
3009 cb.version = 1; /* Version 1 of the callout block */
3010 cb.callout_number = code[1];
3011 cb.offset_vector = offsets;
3012 #ifdef COMPILE_PCRE8
3013 cb.subject = (PCRE_SPTR)start_subject;
3014 #else
3015 cb.subject = (PCRE_SPTR16)start_subject;
3016 #endif
3017 cb.subject_length = (int)(end_subject - start_subject);
3018 cb.start_match = (int)(current_subject - start_subject);
3019 cb.current_position = (int)(ptr - start_subject);
3020 cb.pattern_position = GET(code, 2);
3021 cb.next_item_length = GET(code, 2 + LINK_SIZE);
3022 cb.capture_top = 1;
3023 cb.capture_last = -1;
3024 cb.callout_data = md->callout_data;
3025 cb.mark = NULL; /* No (*MARK) support */
3026 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3028 if (rrc == 0)
3029 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3030 break;
3033 /* ========================================================================== */
3034 default: /* Unsupported opcode */
3035 return PCRE_ERROR_DFA_UITEM;
3038 NEXT_ACTIVE_STATE: continue;
3040 } /* End of loop scanning active states */
3042 /* We have finished the processing at the current subject character. If no
3043 new states have been set for the next character, we have found all the
3044 matches that we are going to find. If we are at the top level and partial
3045 matching has been requested, check for appropriate conditions.
3047 The "forced_fail" variable counts the number of (*F) encountered for the
3048 character. If it is equal to the original active_count (saved in
3049 workspace[1]) it means that (*F) was found on every active state. In this
3050 case we don't want to give a partial match.
3052 The "could_continue" variable is true if a state could have continued but
3053 for the fact that the end of the subject was reached. */
3055 if (new_count <= 0)
3057 if (rlevel == 1 && /* Top level, and */
3058 could_continue && /* Some could go on, and */
3059 forced_fail != workspace[1] && /* Not all forced fail & */
3060 ( /* either... */
3061 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3062 || /* or... */
3063 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3064 match_count < 0) /* no matches */
3065 ) && /* And... */
3067 partial_newline || /* Either partial NL */
3068 ( /* or ... */
3069 ptr >= end_subject && /* End of subject and */
3070 ptr > md->start_used_ptr) /* Inspected non-empty string */
3074 if (offsetcount >= 2)
3076 offsets[0] = (int)(md->start_used_ptr - start_subject);
3077 offsets[1] = (int)(end_subject - start_subject);
3079 match_count = PCRE_ERROR_PARTIAL;
3082 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3083 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3084 rlevel*2-2, SP));
3085 break; /* In effect, "return", but see the comment below */
3088 /* One or more states are active for the next character. */
3090 ptr += clen; /* Advance to next subject character */
3091 } /* Loop to move along the subject string */
3093 /* Control gets here from "break" a few lines above. We do it this way because
3094 if we use "return" above, we have compiler trouble. Some compilers warn if
3095 there's nothing here because they think the function doesn't return a value. On
3096 the other hand, if we put a dummy statement here, some more clever compilers
3097 complain that it can't be reached. Sigh. */
3099 return match_count;
3105 /*************************************************
3106 * Execute a Regular Expression - DFA engine *
3107 *************************************************/
3109 /* This external function applies a compiled re to a subject string using a DFA
3110 engine. This function calls the internal function multiple times if the pattern
3111 is not anchored.
3113 Arguments:
3114 argument_re points to the compiled expression
3115 extra_data points to extra data or is NULL
3116 subject points to the subject string
3117 length length of subject string (may contain binary zeros)
3118 start_offset where to start in the subject string
3119 options option bits
3120 offsets vector of match offsets
3121 offsetcount size of same
3122 workspace workspace vector
3123 wscount size of same
3125 Returns: > 0 => number of match offset pairs placed in offsets
3126 = 0 => offsets overflowed; longest matches are present
3127 -1 => failed to match
3128 < -1 => some kind of unexpected problem
3131 #ifdef COMPILE_PCRE8
3132 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3133 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3134 const char *subject, int length, int start_offset, int options, int *offsets,
3135 int offsetcount, int *workspace, int wscount)
3136 #else
3137 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3138 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3139 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3140 int offsetcount, int *workspace, int wscount)
3141 #endif
3143 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3144 dfa_match_data match_block;
3145 dfa_match_data *md = &match_block;
3146 BOOL utf, anchored, startline, firstline;
3147 const pcre_uchar *current_subject, *end_subject;
3148 const pcre_study_data *study = NULL;
3150 const pcre_uchar *req_char_ptr;
3151 const pcre_uint8 *start_bits = NULL;
3152 BOOL has_first_char = FALSE;
3153 BOOL has_req_char = FALSE;
3154 pcre_uchar first_char = 0;
3155 pcre_uchar first_char2 = 0;
3156 pcre_uchar req_char = 0;
3157 pcre_uchar req_char2 = 0;
3158 int newline;
3160 /* Plausibility checks */
3162 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3163 if (re == NULL || subject == NULL || workspace == NULL ||
3164 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3165 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3166 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3167 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3169 /* Check that the first field in the block is the magic number. If it is not,
3170 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3171 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3172 means that the pattern is likely compiled with different endianness. */
3174 if (re->magic_number != MAGIC_NUMBER)
3175 return re->magic_number == REVERSED_MAGIC_NUMBER?
3176 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3177 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3179 /* If restarting after a partial match, do some sanity checks on the contents
3180 of the workspace. */
3182 if ((options & PCRE_DFA_RESTART) != 0)
3184 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3185 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3186 return PCRE_ERROR_DFA_BADRESTART;
3189 /* Set up study, callout, and table data */
3191 md->tables = re->tables;
3192 md->callout_data = NULL;
3194 if (extra_data != NULL)
3196 unsigned int flags = extra_data->flags;
3197 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3198 study = (const pcre_study_data *)extra_data->study_data;
3199 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3200 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3201 return PCRE_ERROR_DFA_UMLIMIT;
3202 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3203 md->callout_data = extra_data->callout_data;
3204 if ((flags & PCRE_EXTRA_TABLES) != 0)
3205 md->tables = extra_data->tables;
3208 /* Set some local values */
3210 current_subject = (const pcre_uchar *)subject + start_offset;
3211 end_subject = (const pcre_uchar *)subject + length;
3212 req_char_ptr = current_subject - 1;
3214 #ifdef SUPPORT_UTF
3215 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3216 utf = (re->options & PCRE_UTF8) != 0;
3217 #else
3218 utf = FALSE;
3219 #endif
3221 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3222 (re->options & PCRE_ANCHORED) != 0;
3224 /* The remaining fixed data for passing around. */
3226 md->start_code = (const pcre_uchar *)argument_re +
3227 re->name_table_offset + re->name_count * re->name_entry_size;
3228 md->start_subject = (const pcre_uchar *)subject;
3229 md->end_subject = end_subject;
3230 md->start_offset = start_offset;
3231 md->moptions = options;
3232 md->poptions = re->options;
3234 /* If the BSR option is not set at match time, copy what was set
3235 at compile time. */
3237 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3239 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3240 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3241 #ifdef BSR_ANYCRLF
3242 else md->moptions |= PCRE_BSR_ANYCRLF;
3243 #endif
3246 /* Handle different types of newline. The three bits give eight cases. If
3247 nothing is set at run time, whatever was used at compile time applies. */
3249 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3250 PCRE_NEWLINE_BITS)
3252 case 0: newline = NEWLINE; break; /* Compile-time default */
3253 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3254 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3255 case PCRE_NEWLINE_CR+
3256 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3257 case PCRE_NEWLINE_ANY: newline = -1; break;
3258 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3259 default: return PCRE_ERROR_BADNEWLINE;
3262 if (newline == -2)
3264 md->nltype = NLTYPE_ANYCRLF;
3266 else if (newline < 0)
3268 md->nltype = NLTYPE_ANY;
3270 else
3272 md->nltype = NLTYPE_FIXED;
3273 if (newline > 255)
3275 md->nllen = 2;
3276 md->nl[0] = (newline >> 8) & 255;
3277 md->nl[1] = newline & 255;
3279 else
3281 md->nllen = 1;
3282 md->nl[0] = newline;
3286 /* Check a UTF string if required. For an invalid string, the error offset and
3287 error code are passed back in offsets[0] and offsets[1] if offsetcount >= 2. */
3289 #ifdef SUPPORT_UTF
3290 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3292 int erroroffset;
3293 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3294 if (errorcode != 0)
3296 if (offsetcount >= 2)
3298 offsets[0] = erroroffset;
3299 offsets[1] = errorcode;
3301 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3302 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3304 if (start_offset > 0 && start_offset < length &&
3305 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3306 return PCRE_ERROR_BADUTF8_OFFSET;
3308 #endif
3310 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3311 is a feature that makes it possible to save compiled regex and re-use them
3312 in other programs later. */
3314 if (md->tables == NULL) md->tables = PRIV(default_tables);
3316 /* The "must be at the start of a line" flags are used in a loop when finding
3317 where to start. */
3319 startline = (re->flags & PCRE_STARTLINE) != 0;
3320 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3322 /* Set up the first character to match, if available. The first_byte value is
3323 never set for an anchored regular expression, but the anchoring may be forced
3324 at run time, so we have to test for anchoring. The first char may be unset for
3325 an unanchored pattern, of course. If there's no first char and the pattern was
3326 studied, there may be a bitmap of possible first characters. */
3328 if (!anchored)
3330 if ((re->flags & PCRE_FIRSTSET) != 0)
3332 has_first_char = TRUE;
3333 first_char = first_char2 = (pcre_uchar)(re->first_char);
3334 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3336 first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3337 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3338 if (utf && first_char > 127)
3339 first_char2 = UCD_OTHERCASE(first_char);
3340 #endif
3343 else
3345 if (!startline && study != NULL &&
3346 (study->flags & PCRE_STUDY_MAPPED) != 0)
3347 start_bits = study->start_bits;
3351 /* For anchored or unanchored matches, there may be a "last known required
3352 character" set. */
3354 if ((re->flags & PCRE_REQCHSET) != 0)
3356 has_req_char = TRUE;
3357 req_char = req_char2 = (pcre_uchar)(re->req_char);
3358 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3360 req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3361 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3362 if (utf && req_char > 127)
3363 req_char2 = UCD_OTHERCASE(req_char);
3364 #endif
3368 /* Call the main matching function, looping for a non-anchored regex after a
3369 failed match. If not restarting, perform certain optimizations at the start of
3370 a match. */
3372 for (;;)
3374 int rc;
3376 if ((options & PCRE_DFA_RESTART) == 0)
3378 const pcre_uchar *save_end_subject = end_subject;
3380 /* If firstline is TRUE, the start of the match is constrained to the first
3381 line of a multiline string. Implement this by temporarily adjusting
3382 end_subject so that we stop scanning at a newline. If the match fails at
3383 the newline, later code breaks this loop. */
3385 if (firstline)
3387 PCRE_PUCHAR t = current_subject;
3388 #ifdef SUPPORT_UTF
3389 if (utf)
3391 while (t < md->end_subject && !IS_NEWLINE(t))
3393 t++;
3394 ACROSSCHAR(t < end_subject, *t, t++);
3397 else
3398 #endif
3399 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3400 end_subject = t;
3403 /* There are some optimizations that avoid running the match if a known
3404 starting point is not found. However, there is an option that disables
3405 these, for testing and for ensuring that all callouts do actually occur.
3406 The option can be set in the regex by (*NO_START_OPT) or passed in
3407 match-time options. */
3409 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3411 /* Advance to a known first char. */
3413 if (has_first_char)
3415 if (first_char != first_char2)
3416 while (current_subject < end_subject &&
3417 *current_subject != first_char && *current_subject != first_char2)
3418 current_subject++;
3419 else
3420 while (current_subject < end_subject &&
3421 *current_subject != first_char)
3422 current_subject++;
3425 /* Or to just after a linebreak for a multiline match if possible */
3427 else if (startline)
3429 if (current_subject > md->start_subject + start_offset)
3431 #ifdef SUPPORT_UTF
3432 if (utf)
3434 while (current_subject < end_subject &&
3435 !WAS_NEWLINE(current_subject))
3437 current_subject++;
3438 ACROSSCHAR(current_subject < end_subject, *current_subject,
3439 current_subject++);
3442 else
3443 #endif
3444 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3445 current_subject++;
3447 /* If we have just passed a CR and the newline option is ANY or
3448 ANYCRLF, and we are now at a LF, advance the match position by one
3449 more character. */
3451 if (current_subject[-1] == CHAR_CR &&
3452 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3453 current_subject < end_subject &&
3454 *current_subject == CHAR_NL)
3455 current_subject++;
3459 /* Or to a non-unique first char after study */
3461 else if (start_bits != NULL)
3463 while (current_subject < end_subject)
3465 unsigned int c = *current_subject;
3466 #ifndef COMPILE_PCRE8
3467 if (c > 255) c = 255;
3468 #endif
3469 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3471 current_subject++;
3472 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3473 /* In non 8-bit mode, the iteration will stop for
3474 characters > 255 at the beginning or not stop at all. */
3475 if (utf)
3476 ACROSSCHAR(current_subject < end_subject, *current_subject,
3477 current_subject++);
3478 #endif
3480 else break;
3485 /* Restore fudged end_subject */
3487 end_subject = save_end_subject;
3489 /* The following two optimizations are disabled for partial matching or if
3490 disabling is explicitly requested (and of course, by the test above, this
3491 code is not obeyed when restarting after a partial match). */
3493 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3494 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3496 /* If the pattern was studied, a minimum subject length may be set. This
3497 is a lower bound; no actual string of that length may actually match the
3498 pattern. Although the value is, strictly, in characters, we treat it as
3499 bytes to avoid spending too much time in this optimization. */
3501 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3502 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3503 return PCRE_ERROR_NOMATCH;
3505 /* If req_char is set, we know that that character must appear in the
3506 subject for the match to succeed. If the first character is set, req_char
3507 must be later in the subject; otherwise the test starts at the match
3508 point. This optimization can save a huge amount of work in patterns with
3509 nested unlimited repeats that aren't going to match. Writing separate
3510 code for cased/caseless versions makes it go faster, as does using an
3511 autoincrement and backing off on a match.
3513 HOWEVER: when the subject string is very, very long, searching to its end
3514 can take a long time, and give bad performance on quite ordinary
3515 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3516 string... so we don't do this when the string is sufficiently long. */
3518 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3520 PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3522 /* We don't need to repeat the search if we haven't yet reached the
3523 place we found it at last time. */
3525 if (p > req_char_ptr)
3527 if (req_char != req_char2)
3529 while (p < end_subject)
3531 int pp = *p++;
3532 if (pp == req_char || pp == req_char2) { p--; break; }
3535 else
3537 while (p < end_subject)
3539 if (*p++ == req_char) { p--; break; }
3543 /* If we can't find the required character, break the matching loop,
3544 which will cause a return of PCRE_ERROR_NOMATCH. */
3546 if (p >= end_subject) break;
3548 /* If we have found the required character, save the point where we
3549 found it, so that we don't search again next time round the loop if
3550 the start hasn't passed this character yet. */
3552 req_char_ptr = p;
3556 } /* End of optimizations that are done when not restarting */
3558 /* OK, now we can do the business */
3560 md->start_used_ptr = current_subject;
3561 md->recursive = NULL;
3563 rc = internal_dfa_exec(
3564 md, /* fixed match data */
3565 md->start_code, /* this subexpression's code */
3566 current_subject, /* where we currently are */
3567 start_offset, /* start offset in subject */
3568 offsets, /* offset vector */
3569 offsetcount, /* size of same */
3570 workspace, /* workspace vector */
3571 wscount, /* size of same */
3572 0); /* function recurse level */
3574 /* Anything other than "no match" means we are done, always; otherwise, carry
3575 on only if not anchored. */
3577 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3579 /* Advance to the next subject character unless we are at the end of a line
3580 and firstline is set. */
3582 if (firstline && IS_NEWLINE(current_subject)) break;
3583 current_subject++;
3584 #ifdef SUPPORT_UTF
3585 if (utf)
3587 ACROSSCHAR(current_subject < end_subject, *current_subject,
3588 current_subject++);
3590 #endif
3591 if (current_subject > end_subject) break;
3593 /* If we have just passed a CR and we are now at a LF, and the pattern does
3594 not contain any explicit matches for \r or \n, and the newline option is CRLF
3595 or ANY or ANYCRLF, advance the match position by one more character. */
3597 if (current_subject[-1] == CHAR_CR &&
3598 current_subject < end_subject &&
3599 *current_subject == CHAR_NL &&
3600 (re->flags & PCRE_HASCRORLF) == 0 &&
3601 (md->nltype == NLTYPE_ANY ||
3602 md->nltype == NLTYPE_ANYCRLF ||
3603 md->nllen == 2))
3604 current_subject++;
3606 } /* "Bumpalong" loop */
3608 return PCRE_ERROR_NOMATCH;
3611 /* End of pcre_dfa_exec.c */